{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Wed Mar 27 08:34:53 PDT 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "roadrunner\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 16\n", "On-line CPU(s) list: 0-15\n", "Thread(s) per core: 2\n", "Core(s) per socket: 4\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 26\n", "Model name: Intel(R) Xeon(R) CPU E5520 @ 2.27GHz\n", "Stepping: 5\n", "CPU MHz: 1596.000\n", "CPU max MHz: 2394.0000\n", "CPU min MHz: 1596.0000\n", "BogoMIPS: 4521.81\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 8192K\n", "NUMA node0 CPU(s): 0-15\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology nonstop_tsc aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 sse4_2 popcnt lahf_lm ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 47G 1.3G 38G 584M 7.0G 44G\n", "Swap: 47G 0B 47G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo 
\"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20190327_cgig_repeatmasker_all\n", "env: fasta_url=http://owl.fish.washington.edu/halfshell/genomic-databank/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n", "env: fasta=Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n", "env: repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker\n", "env: cpus=16\n", "env: checksum=6de9d1239eb10ea0545bed6c4e746d6c\n" ] } ], "source": [ "%env wd=/home/sam/analyses/20190327_cgig_repeatmasker_all\n", "%env fasta_url=http://owl.fish.washington.edu/halfshell/genomic-databank/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n", "%env fasta=Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n", "%env repeat_masker=/home/shared/RepeatMasker-4.0.7/RepeatMasker\n", "%env cpus=16\n", "%env checksum=6de9d1239eb10ea0545bed6c4e746d6c" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create necessary directories" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Download _Crassostrea gigas genome FastA file\n", "\n", "Info on FastA file is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n", "\r", " 0 0% 0.00kB/s 0:00:00 \r", " 491,520 0% 14.11kB/s 11:09:48 \r", " 22,478,848 3% 626.91kB/s 0:14:29 \r", " 49,020,928 8% 1.30MB/s 0:06:30 
\r", " 65,732,608 11% 1.69MB/s 0:04:49 \r", " 92,078,080 16% 21.84MB/s 0:00:21 \r", " 119,209,984 21% 23.07MB/s 0:00:18 \r", " 143,982,592 25% 22.65MB/s 0:00:18 \r", " 173,506,560 30% 25.64MB/s 0:00:15 \r", " 204,505,088 36% 26.75MB/s 0:00:13 \r", " 233,537,536 41% 27.20MB/s 0:00:11 \r", " 263,651,328 46% 28.47MB/s 0:00:10 \r", " 280,494,080 49% 25.42MB/s 0:00:11 \r", " 284,655,616 50% 15.82MB/s 0:00:17 \r", " 285,114,368 50% 8.01MB/s 0:00:34 \r", " 288,849,920 50% 2.51MB/s 0:01:48 \r", " 308,314,112 54% 2.78MB/s 0:01:31 \r", " 334,888,960 59% 5.15MB/s 0:00:44 \r", " 334,987,264 59% 4.40MB/s 0:00:51 \r", " 336,494,592 59% 6.05MB/s 0:00:37 \r", " 338,362,368 59% 3.47MB/s 0:01:04 \r", " 339,181,568 59% 532.11kB/s 0:07:09 \r", " 339,279,872 59% 816.52kB/s 0:04:39 \r", " 341,278,720 60% 829.99kB/s 0:04:32 \r", " 347,471,872 61% 1.68MB/s 0:02:07 \r", " 350,027,776 61% 1.99MB/s 0:01:46 \r", " 355,958,784 62% 2.86MB/s 0:01:12 \r", " 364,347,392 64% 4.41MB/s 0:00:45 \r", " 366,444,544 64% 3.76MB/s 0:00:52 \r", " 367,099,904 64% 3.29MB/s 0:00:59 \r", " 372,736,000 65% 3.44MB/s 0:00:55 \r", " 374,833,152 66% 2.16MB/s 0:01:27 \r", " 382,140,416 67% 3.07MB/s 0:00:58 \r", " 391,610,368 68% 5.06MB/s 0:00:33 \r", " 398,131,200 70% 5.25MB/s 0:00:31 \r", " 404,160,512 71% 2.59MB/s 0:01:01 \r", " 424,935,424 74% 3.90MB/s 0:00:35 \r", " 447,021,056 78% 5.10MB/s 0:00:23 \r", " 476,053,504 83% 7.27MB/s 0:00:12 \r", " 504,856,576 88% 23.27MB/s 0:00:02 \r", " 523,665,408 92% 22.81MB/s 0:00:01 \r", " 538,411,008 94% 19.61MB/s 0:00:01 \r", " 540,508,160 95% 12.62MB/s 0:00:02 \r", " 542,605,312 95% 7.49MB/s 0:00:03 \r", " 544,669,696 95% 1.72MB/s 0:00:13 \r", " 567,574,528 99% 2.41MB/s 0:00:00 \r", " 567,592,991 100% 5.15MB/s 0:01:45 (xfr#1, to-chk=0/1)\n", "\n", "sent 30 bytes received 567,662,409 bytes 5,137,216.64 bytes/sec\n", "total size is 567,592,991 speedup is 1.00\n", "total 542M\n", "-rw-rw-rw- 1 sam users 542M Aug 24 2018 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n" ] } ], 
"source": [ "%%bash\n", "# Using rsync\n", "cd ${wd}\n", "\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress \\\n", "owl:/volume1/web/halfshell/genomic-databank/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa .\n", "\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### If you need to download via wget, change the cell below to code, instead of markdown\n", "\n", "#### Also, change the cell with the md5 checksum comparison to code, instead of markdown" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "%%bash\n", "time \\\n", "wget ${fasta_url} \\\n", "--quiet \\\n", "--directory-prefix=${wd}\n", "\n", "ls -lh ${wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Verify MD5 checksum\n", "\n", "Original MD5 checksum taken from GitHub Genomic Resource linked above.\n", "\n", "Use ```md5sum``` to generate checksum from downloaded FastA file and ```awk``` to print the first field (i.e. the checksum value). This is saved to the variable: ```dl_md5```\n", "\n", "Then, check for differences between the two variables. \n", "\n", "No output confirms no difference." ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "%%bash\n", "md5=${checksum}\n", "dl_md5=$(md5sum ${wd}/${fasta} | awk '{ print $1 }')\n", "diff <(echo \"$md5\") <(echo \"$dl_md5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Run RepeatMasker with _all_ species setting and following options:\n", "\n", "```-species \"all\"``` : Sets species to all\n", "\n", "```-par ${cpus}``` : Use the number of CPU threads set in the `cpus` variable (16 for this run)\n", "\n", "```-gff``` : Create GFF output file (in addition to default files)\n", "\n", "```-excln``` : Adjusts output table calculations to exclude sequence runs of >=20 X/Ns. 
Useful for draft genome assemblies.\n", "\n", "```1>``` : Send stdout to file (`stdout.out`) instead of printing to notebook.\n", "\n", "```2>``` : Send stderr to file (`stderr.err`) instead of printing to notebook.\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t1419m34.303s\n", "user\t21339m49.472s\n", "sys\t111m2.000s\n" ] } ], "source": [ "%%bash\n", "\n", "cd ${wd}\n", "time \\\n", "${repeat_masker} \\\n", "${fasta} \\\n", "-species \"all\" \\\n", "-par ${cpus} \\\n", "-gff \\\n", "-excln \\\n", "1> stdout.out \\\n", "2> stderr.err\n", "\n", "sed '/^Subject:/ s/ / repeatmasker_gigas_all JOB COMPLETE/' ~/.default-subject.mail | msmtp \"$EMAIL\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 1.5G\n", "-rw-rw-rw- 1 sam users 542M Aug 24 2018 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n", "-rw-rw-r-- 1 sam sam 244M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.cat.gz\n", "-rw-rw-r-- 1 sam sam 544M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.masked\n", "-rw-rw-r-- 1 sam sam 90M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out\n", "-rw-rw-r-- 1 sam sam 58M Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out.gff\n", "-rw-rw-r-- 1 sam sam 2.4K Mar 28 08:43 Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.tbl\n", "-rw-rw-r-- 1 sam sam 0 Mar 27 09:04 stderr.err\n", "-rw-rw-r-- 1 sam sam 2.1M Mar 28 08:43 stdout.out\n" ] } ], "source": [ "%%bash\n", "ls -lh ${wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### SUMMARY TABLE" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "==================================================\n", "file name: Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa\n", "sequences: 7658\n", "total length: 557717710 bp 
(491860439 bp excl N/X-runs)\n", "GC level: 33.42 %\n", "bases masked: 160369613 bp ( 32.60 %)\n", "==================================================\n", " number of length percentage\n", " elements* occupied of sequence\n", "--------------------------------------------------\n", "Retroelements 48481 19773596 bp 4.02 %\n", " SINEs: 2498 317084 bp 0.06 %\n", " Penelope 5749 1808270 bp 0.37 %\n", " LINEs: 26463 10472676 bp 2.13 %\n", " CRE/SLACS 15 1289 bp 0.00 %\n", " L2/CR1/Rex 1712 307207 bp 0.06 %\n", " R1/LOA/Jockey 299 21470 bp 0.00 %\n", " R2/R4/NeSL 218 69735 bp 0.01 %\n", " RTE/Bov-B 8417 3631379 bp 0.74 %\n", " L1/CIN4 983 64189 bp 0.01 %\n", " LTR elements: 19520 8983836 bp 1.83 %\n", " BEL/Pao 2050 1349545 bp 0.27 %\n", " Ty1/Copia 2139 189535 bp 0.04 %\n", " Gypsy/DIRS1 11971 6501545 bp 1.32 %\n", " Retroviral 1263 69288 bp 0.01 %\n", "\n", "DNA transposons 299050 85782505 bp 17.44 %\n", " hobo-Activator 9348 2278556 bp 0.46 %\n", " Tc1-IS630-Pogo 32515 8695261 bp 1.77 %\n", " En-Spm 0 0 bp 0.00 %\n", " MuDR-IS905 0 0 bp 0.00 %\n", " PiggyBac 4136 747000 bp 0.15 %\n", " Tourist/Harbinger 11590 2828277 bp 0.58 %\n", " Other (Mirage, 232 14514 bp 0.00 %\n", " P-element, Transib)\n", "\n", "Rolling-circles 0 0 bp 0.00 %\n", "\n", "Unclassified: 109149 49075277 bp 9.98 %\n", "\n", "Total interspersed repeats: 154631378 bp 31.44 %\n", "\n", "\n", "Small RNA: 830 93282 bp 0.02 %\n", "\n", "Satellites: 2087 401812 bp 0.08 %\n", "Simple repeats: 110847 4687373 bp 0.95 %\n", "Low complexity: 16716 787611 bp 0.16 %\n", "==================================================\n", "\n", "* most repeats fragmented by insertions or deletions\n", " have been counted as one element\n", " Runs of >=20 X/Ns in query were excluded in % calcs\n", "\n", "\n", "The query species was assumed to be root \n", "RepeatMasker Combined Database: Dfam_Consensus-20170127, RepBase-20170127\n", " \n", "run with rmblastn version 2.6.0+\n", "\n" ] } ], "source": [ "%%bash\n", "cat 
${wd}/${fasta}.tbl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Delete FastA (not needed) and `rsync` to my folder on Gannet" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sending incremental file list\n", "drwxrwxr-x 4,096 2019/03/27 08:45:17 .\n", "\n", "sent 49 bytes received 64 bytes 226.00 bytes/sec\n", "total size is 0 speedup is 0.00\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "rsync: link_stat \"/home/sam/analyses/home/sam/analyses/20190327_cgig_repeatmasker_all\" failed: No such file or directory (2)\n", "rsync error: some files/attrs were not transferred (see previous errors) (code 23) at main.c(1183) [sender=3.1.1]\n", "bash: line 10: gannet:/volume2/web/Atumefaciens: No such file or directory\n" ] } ], "source": [ "%%bash\n", "cd ${wd}\n", "rm ${fasta}\n", "cd ..\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress \\\n", "--relative \\\n", "./home/sam/analyses/20190327_cgig_repeatmasker_all \n", "gannet:/volume2/web/Atumefaciens" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sending incremental file list\n", "drwxrwxr-x 4,096 2019/03/27 08:45:17 .\n", "drwxrwxr-x 4,096 2019/03/28 11:54:45 20190327_cgig_repeatmasker_all\n", "-rw-rw-r-- 255,343,202 2019/03/28 08:43:43 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.cat.gz\n", "-rw-rw-r-- 569,452,085 2019/03/28 08:43:44 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.masked\n", "-rw-rw-r-- 93,415,468 2019/03/28 08:43:44 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out\n", "-rw-rw-r-- 59,778,663 2019/03/28 08:43:43 20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out.gff\n", "-rw-rw-r-- 2,444 2019/03/28 08:43:43 
20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.tbl\n", "-rw-rw-r-- 0 2019/03/27 09:04:11 20190327_cgig_repeatmasker_all/stderr.err\n", "-rw-rw-r-- 2,119,050 2019/03/28 08:43:41 20190327_cgig_repeatmasker_all/stdout.out\n", "\n", "sent 353 bytes received 1,000 bytes 2,706.00 bytes/sec\n", "total size is 980,110,912 speedup is 724,398.31\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "rm: cannot remove 'Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa': No such file or directory\n", "bash: line 10: gannet:/volume2/web/Atumefaciens: No such file or directory\n" ] } ], "source": [ "%%bash\n", "cd ${wd}\n", "rm ${fasta}\n", "cd ..\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress \\\n", "--relative \\\n", "./20190327_cgig_repeatmasker_all \n", "gannet:/volume2/web/Atumefaciens" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Restarted notebook in case `gannet` shortcut wasn't loaded. However, the problem was most likely due to the missing line-continuation backslash after the repeatmasker directory..." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sending incremental file list\n", "./\n", "20190327_cgig_repeatmasker_all/\n", "20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.cat.gz\n", "\r", " 32,768 0% 0.00kB/s 0:00:00 \r", " 118,947,840 46% 113.41MB/s 0:00:01 \r", " 236,191,744 92% 112.67MB/s 0:00:00 \r", " 255,343,202 100% 112.72MB/s 0:00:02 (xfr#1, to-chk=6/9)\n", "20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.masked\n", "\r", " 32,768 0% 197.53kB/s 0:48:02 \r", " 98,205,696 17% 93.66MB/s 0:00:04 \r", " 215,515,136 37% 102.82MB/s 0:00:03 \r", " 332,693,504 58% 105.83MB/s 0:00:02 \r", " 449,871,872 79% 107.34MB/s 0:00:01 \r", " 567,017,472 99% 111.86MB/s 0:00:00 \r", " 569,452,085 100% 108.29MB/s 0:00:05 (xfr#2, to-chk=5/9)\n", "20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out\n", "\r", " 32,768 0% 1.56MB/s 0:00:58 \r", " 93,415,468 100% 109.04MB/s 0:00:00 (xfr#3, to-chk=4/9)\n", "20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.out.gff\n", "\r", " 32,768 0% 39.12kB/s 0:25:27 \r", " 21,397,504 35% 20.41MB/s 0:00:01 \r", " 59,778,663 100% 42.96MB/s 0:00:01 (xfr#4, to-chk=3/9)\n", "20190327_cgig_repeatmasker_all/Crassostrea_gigas.oyster_v9.dna_sm.toplevel.fa.tbl\n", "\r", " 2,444 100% 7.28kB/s 0:00:00 \r", " 2,444 100% 7.28kB/s 0:00:00 (xfr#5, to-chk=2/9)\n", "20190327_cgig_repeatmasker_all/stderr.err\n", "\r", " 0 100% 0.00kB/s 0:00:00 (xfr#6, to-chk=1/9)\n", "20190327_cgig_repeatmasker_all/stdout.out\n", "\r", " 32,768 1% 97.56kB/s 0:00:21 \r", " 2,119,050 100% 5.82MB/s 0:00:00 (xfr#7, to-chk=0/9)\n", "\n", "sent 980,350,785 bytes received 163 bytes 103,194,836.63 bytes/sec\n", "total size is 980,110,912 speedup is 1.00\n" ] } ], "source": [ "%%bash\n", "cd /home/sam/analyses/\n", "\n", "rsync \\\n", "--archive \\\n", "--verbose \\\n", "--progress 
\\\n", "--relative \\\n", "./20190327_cgig_repeatmasker_all \\\n", "gannet:/volume2/web/Atumefaciens" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }