{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Thu Oct 31 08:44:36 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2925.971\n", "BogoMIPS: 5851.97\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 29G 435M 452M 41G 40G\n", "Swap: 4.7G 251M 4.4G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer 
Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "`%env` sets environment variables, which can then be read as `${name}` inside `%%bash` cells. `wd` is additionally assigned as a plain Python variable so that the `cd {wd}` cell below can use it." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20191031_pgen_v074_stringtie_BAM_splitting\n", "env: rsync_gannet=gannet:/volume2/web/Atumefaciens/20190723_stringtie_pgen_v074/\n", "env: wget_bam=--quiet --no-check-certificate https://gannet.fish.washington.edu/Atumefaciens/20190723_stringtie_pgen_v074/20190723_sorted.merged.bam\n", "env: original_bam=20190723_sorted.merged.bam\n", "env: reassembled_bam=20190723_sorted.merged.reassembled.bam\n" ] } ], "source": [ "# Set working directory\n", "%env wd=/home/sam/analyses/20191031_pgen_v074_stringtie_BAM_splitting\n", "wd=\"/home/sam/analyses/20191031_pgen_v074_stringtie_BAM_splitting\"\n", "\n", "%env rsync_gannet=gannet:/volume2/web/Atumefaciens/20190723_stringtie_pgen_v074/\n", "%env wget_bam=--quiet --no-check-certificate https://gannet.fish.washington.edu/Atumefaciens/20190723_stringtie_pgen_v074/20190723_sorted.merged.bam\n", "%env original_bam=20190723_sorted.merged.bam\n", "%env reassembled_bam=20190723_sorted.merged.reassembled.bam" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20191031_pgen_v074_stringtie_BAM_splitting\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download Pgen_v074 merged Stringtie BAM file\n", "\n", 
"Info on the BAM is here: https://robertslab.github.io/sams-notebook/2019/07/23/Genome-Annotation-Pgenerosa_v074-Transcript-Isoform-ID-with-Stringtie-on-Mox.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### If need to download via wget, uncomment lines in the cell below" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# %%bash\n", "# time \\\n", "# wget \"${wget_gffs}\"\n", "# wget \"${wget_fasta}\"\n", "# ls -lh ${wd}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "20190723_sorted.merged.bam\n", "\n", "sent 30 bytes received 77,940,299,973 bytes 64,869,163.55 bytes/sec\n", "total size is 77,930,786,826 speedup is 1.00\n" ] } ], "source": [ "%%bash\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "\"${rsync_gannet}${original_bam}\" \\\n", "." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 73G\n", "-rw-rw-r-- 1 sam users 73G Aug 29 10:54 20190723_sorted.merged.bam\n" ] } ], "source": [ "%%bash\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Split BAM into 5GB chunks" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 146G\n", "-rw-rw-r-- 1 sam users 73G Aug 29 10:54 20190723_sorted.merged.bam\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:21 20190723_sorted.merged.bam_aa\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:23 20190723_sorted.merged.bam_ab\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:25 20190723_sorted.merged.bam_ac\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:27 20190723_sorted.merged.bam_ad\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:30 20190723_sorted.merged.bam_ae\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:32 20190723_sorted.merged.bam_af\n", "-rw-rw-r-- 1 sam sam 4.7G 
Oct 31 13:34 20190723_sorted.merged.bam_ag\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:37 20190723_sorted.merged.bam_ah\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:39 20190723_sorted.merged.bam_ai\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:41 20190723_sorted.merged.bam_aj\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:43 20190723_sorted.merged.bam_ak\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:46 20190723_sorted.merged.bam_al\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:48 20190723_sorted.merged.bam_am\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:50 20190723_sorted.merged.bam_an\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:53 20190723_sorted.merged.bam_ao\n", "-rw-rw-r-- 1 sam sam 2.8G Oct 31 13:54 20190723_sorted.merged.bam_ap\n" ] } ], "source": [ "%%bash\n", "split \\\n", "--bytes 5GB \\\n", "\"${original_bam}\" \\\n", "\"${original_bam}_\"\n", "\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reassemble and compare to original" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1c1\n", "< e53c73db4145ba65522f771535d10a52 20190723_sorted.merged.bam\n", "---\n", "> e53c73db4145ba65522f771535d10a52 20190723_sorted.merged.reassembled.bam\n" ] }, { "ename": "CalledProcessError", "evalue": "Command 'b\"cat ${original_bam}_* > ${reassembled_bam}\\n\\n# Let's see if MD5 checksums are the same..\\ndiff <(md5sum ${original_bam}) <(md5sum ${reassembled_bam})\\n\"' returned non-zero exit status 1.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'bash'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"cat ${original_bam}_* > ${reassembled_bam}\\n\\n# Let's see if MD5 checksums are the same..\\ndiff <(md5sum ${original_bam}) <(md5sum ${reassembled_bam})\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2350\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2351\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmagic_arg_s\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2352\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2353\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2354\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mnamed_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m 
\u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscript\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshebang\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0;31m# write a basic docstring:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/magics/script.py\u001b[0m in \u001b[0;36mshebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflush\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m!=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mCalledProcessError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturncode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_script\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mto_close\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b\"cat ${original_bam}_* > ${reassembled_bam}\\n\\n# Let's see if MD5 checksums are the same..\\ndiff <(md5sum ${original_bam}) 
<(md5sum ${reassembled_bam})\\n\"' returned non-zero exit status 1." ] } ], "source": [ "%%bash\n", "cat ${original_bam}_* > ${reassembled_bam}\n", "\n", "# Let's see if MD5 checksums are the same..\n", "diff <(md5sum ${original_bam}) <(md5sum ${reassembled_bam})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Looks good! Same checksum. The `CalledProcessError` above is expected: the output of `md5sum` is _checksum_ _filename_, not the checksum alone.\n", "\n", "#### The file names are different, so `diff` reports a difference and exits with status 1 even though the checksums are identical.\n", "\n", "#### I forgot that the file name is part of the output and should have compared only the two checksums, e.g. `diff <(md5sum < ${original_bam}) <(md5sum < ${reassembled_bam})`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cleanup" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 73G\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:21 20190723_sorted.merged.bam_aa\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:23 20190723_sorted.merged.bam_ab\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:25 20190723_sorted.merged.bam_ac\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:27 20190723_sorted.merged.bam_ad\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:30 20190723_sorted.merged.bam_ae\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:32 20190723_sorted.merged.bam_af\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:34 20190723_sorted.merged.bam_ag\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:37 20190723_sorted.merged.bam_ah\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:39 20190723_sorted.merged.bam_ai\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:41 20190723_sorted.merged.bam_aj\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:43 20190723_sorted.merged.bam_ak\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:46 20190723_sorted.merged.bam_al\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:48 20190723_sorted.merged.bam_am\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:50 20190723_sorted.merged.bam_an\n", "-rw-rw-r-- 1 sam sam 4.7G Oct 31 13:53 20190723_sorted.merged.bam_ao\n", 
"-rw-rw-r-- 1 sam sam 2.8G Oct 31 13:54 20190723_sorted.merged.bam_ap\n" ] } ], "source": [ "%%bash\n", "rm ${original_bam} ${reassembled_bam}\n", "ls -ltrh" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }