{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Wed Dec 19 11:16:08 PST 2018\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.5 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "emu\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 16\n", "On-line CPU(s) list: 0-15\n", "Thread(s) per core: 2\n", "Core(s) per socket: 4\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 26\n", "Model name: Intel(R) Xeon(R) CPU E5520 @ 2.27GHz\n", "Stepping: 5\n", "CPU MHz: 2394.000\n", "CPU max MHz: 2394.0000\n", "CPU min MHz: 1596.0000\n", "BogoMIPS: 4521.80\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 8192K\n", "NUMA node0 CPU(s): 0-15\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology nonstop_tsc aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 sse4_2 popcnt lahf_lm tpr_shadow vnmi flexpriority ept vpid dtherm ida\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 47G 1.5G 43G 377M 2.2G 44G\n", "Swap: 11G 438M 11G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create working directories for both genome versions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "mkdir /home/sam/analyses/20181219_Pgenerosa_repeatmodeler" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: work_dir=/home/sam/analyses/20181219_Pgenerosa_repeatmodeler\n", "env: rptm=/home/shared/RepeatModeler-open-1.0.11\n", "env: Pgenerosa_v070_fasta=/home/sam/data/genomes/geoduck/Pgenerosa_v070.fa\n" ] } ], "source": [ "%env work_dir = /home/sam/analyses/20181219_Pgenerosa_repeatmodeler\n", "%env rptm = /home/shared/RepeatModeler-open-1.0.11\n", "%env Pgenerosa_v070_fasta = /home/sam/data/genomes/geoduck/Pgenerosa_v070.fa" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 2.1G\n", "-rw-rw-rw- 1 sam sam 2.1G Aug 14 11:45 Pgenerosa_v070.fa\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t1m24.224s\n", "user\t0m29.692s\n", "sys\t0m12.028s\n" ] } ], "source": [ "%%bash\n", "cd /home/sam/data/genomes/geoduck/\n", "time \\\n", "rsync \\\n", "--archive \\\n", "owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v070.fa \\\n", ".\n", "ls -lh\n", "\n", "# Uncomment following line(s) to download from web\n", "# wget http://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v070.fa" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create RepeatModeler databases" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "------------------------------------------------------------------------\n", "\n", "------------------------------------------------------------------------\n", "Building database Pgenerosa_v070:\n", " Adding /home/sam/data/genomes/geoduck/Pgenerosa_v070.fa to database\n", "Number of sequences (bp) added to database: 313649 ( 2205688688 bp )\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t1m23.003s\n", "user\t1m2.300s\n", "sys\t0m5.360s\n" ] } ], "source": [ "%%bash\n", "cd ${work_dir}\n", "\n", "time \\\n", "perl ${rptm}/BuildDatabase \\\n", "-name Pgenerosa_v070 \\\n", "-engine ncbi \\\n", "${Pgenerosa_v070_fasta} \\\n", ">& database_build_run.out\n", "\n", "sed '/^Subject:/ s/ / repeatmodeler db JOB COMPLETE/' ~/.default-subject.mail | msmtp \"$EMAIL\"\n", "\n", "echo \"------------------------------------------------------------------------\"\n", "echo \"\"\n", "echo \"------------------------------------------------------------------------\"\n", "cat ${work_dir}/database_build_run.out" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Run RepeatModeler" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "real\t2056m18.020s\n", "user\t20838m22.216s\n", "sys\t407m24.308s\n" ] } ], "source": [ "%%bash\n", "\n", "cd ${work_dir}\n", "time \\\n", "perl ${rptm}/RepeatModeler \\\n", "-database ${work_dir}/Pgenerosa_v070 \\\n", "-engine ncbi \\\n", "-pa 16 \\\n", ">& run.out\n", "\n", "sed '/^Subject:/ s/ / repeatmodeler JOB COMPLETE/' ~/.default-subject.mail | msmtp \"$EMAIL\"" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " - Saving elements to a file...\n", " - 16 elements found.\n", "Element Gathering: 00:00:00 (hh:mm:ss) Elapsed Time\n", "Refining family-1617 model...\n", " - numRounds = 7\n", " - Consensus Length = 233 ( orig = 233 )\n", " - Avg Kimura Divergence = 0.01\n", " - Unaligned sequences = 0 ( orig = 0 )\n", " Build Consensus: 0:0:1 Elapsed Time\n", "Refinement: 00:00:02 (hh:mm:ss) Elapsed Time\n", "\n", "Processing RECON family: 6568\n", " - Saving elements to a file...\n", " - 16 elements found.\n", "Element Gathering: 00:00:00 (hh:mm:ss) Elapsed Time\n", "Refining family-6568 model...\n", " - numRounds = 6\n", " - Consensus Length = 404 ( orig = 417 )\n", " - Avg Kimura Divergence = 0.01\n", " - Unaligned sequences = 3 ( orig = 3 )\n", " Build Consensus: 0:0:2 Elapsed Time\n", "Refinement: 00:00:03 (hh:mm:ss) Elapsed Time\n", "Family Refinement: 00:23:28 (hh:mm:ss) Elapsed Time\n", "Round Time: 16:14:42 (hh:mm:ss) Elapsed Time\n", "\n", "Discovery complete: 2001 families found\n", "Classifying Repeats...\n", "RepeatClassifier Version open-1.0.11\n", "===============================\n", "Search Engine = ncbi\n", " - Looking for Simple and Low Complexity sequences..\n", " - Looking for similarity to known repeat proteins..\n", " - Looking for similarity to known repeat consensi..\n", "Classification Time: 01:28:39 (hh:mm:ss) Elapsed Time\n", "Program Time: 34:16:10 (hh:mm:ss) Elapsed Time\n", "Working directory: /home/sam/analyses/20181219_Pgenerosa_repeatmodeler/RM_31027.WedDec191236402018\n", "may be deleted unless there were problems with the run.\n", "\n", "The results have been saved to:\n", " /home/sam/analyses/20181219_Pgenerosa_repeatmodeler/Pgenerosa_v070-families.fa - Consensus sequences for each family identified.\n", " /home/sam/analyses/20181219_Pgenerosa_repeatmodeler/Pgenerosa_v070-families.stk - Seed alignments for each family identified.\n", "\n", "This version of RepeatModeler can upload families directly to the\n", "open repeat database - Dfam_consensus. Please consider uploading your final\n", "curated library using the RepeatModeler \"util/dfamConsensusTool.pl\" script\n", "( details at http://www.repeatmasker.org/RepeatModeler/dfamConsensusTool ) or\n", "posting your raw (uncurated) RepeatModeler results to the TE Raw Dataset\n", "Repository ( http://www.repeatmasker.org/Dfam_consensus/#/public/repository ).\n", "\n", "\n" ] } ], "source": [ "%%bash\n", "tail -n 50 ${work_dir}/run.out" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%bash\n", "cd /home/sam/analyses/\n", "rsync --archive --relative ./20181219_Pgenerosa_repeatmodeler gannet:/volume1/web/Atumefaciens\n", "\n", "sed '/^Subject:/ s/ / rsync JOB COMPLETE/' ~/.default-subject.mail | msmtp \"$EMAIL\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 2 }