{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RepeatModeler: Pgenerosa_v074\n",
    "\n",
    "Builds a RepeatModeler database from the `Pgenerosa_v074.fa` genome FASTA and runs RepeatModeler (open-1.0.11, NCBI engine) on it. Output is written to the `work_dir` defined below and then copied to `gannet` with rsync."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TODAY'S DATE:\n",
      "Wed Jun 26 11:28:31 PDT 2019\n",
      "------------\n",
      "\n",
      "Distributor ID:\tUbuntu\n",
      "Description:\tUbuntu 16.04.6 LTS\n",
      "Release:\t16.04\n",
      "Codename:\txenial\n",
      "\n",
      "------------\n",
      "HOSTNAME: \n",
      "emu\n",
      "\n",
      "------------\n",
      "Computer Specs:\n",
      "\n",
      "Architecture: x86_64\n",
      "CPU op-mode(s): 32-bit, 64-bit\n",
      "Byte Order: Little Endian\n",
      "CPU(s): 16\n",
      "On-line CPU(s) list: 0-15\n",
      "Thread(s) per core: 2\n",
      "Core(s) per socket: 4\n",
      "Socket(s): 2\n",
      "NUMA node(s): 1\n",
      "Vendor ID: GenuineIntel\n",
      "CPU family: 6\n",
      "Model: 26\n",
      "Model name: Intel(R) Xeon(R) CPU E5520 @ 2.27GHz\n",
      "Stepping: 5\n",
      "CPU MHz: 2394.000\n",
      "CPU max MHz: 2394.0000\n",
      "CPU min MHz: 1596.0000\n",
      "BogoMIPS: 4521.81\n",
      "Virtualization: VT-x\n",
      "L1d cache: 32K\n",
      "L1i cache: 32K\n",
      "L2 cache: 256K\n",
      "L3 cache: 8192K\n",
      "NUMA node0 CPU(s): 0-15\n",
      "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology nonstop_tsc aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 sse4_2 popcnt lahf_lm tpr_shadow vnmi flexpriority ept vpid dtherm ida\n",
      "\n",
      "------------\n",
      "\n",
      "Memory Specs\n",
      "\n",
      " total used free shared buff/cache available\n",
      "Mem: 47G 1.4G 43G 130M 2.0G 45G\n",
      "Swap: 11G 0B 11G\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No LSB modules are available.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Record when and where this notebook was run (date, OS, host, CPU, memory)\n",
    "echo \"TODAY'S DATE:\"\n",
    "date\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "#Display operating system info\n",
    "lsb_release -a\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"HOSTNAME: \"; hostname \n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"Computer Specs:\"\n",
    "echo \"\"\n",
    "lscpu\n",
    "echo \"\"\n",
    "echo \"------------\"\n",
    "echo \"\"\n",
    "echo \"Memory Specs\"\n",
    "echo \"\"\n",
    "free -mh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: data_dir=/home/sam/data/genomes\n",
      "env: work_dir=/home/sam/analyses/20190626_Pgenerosa_v074_repeatmodeler\n"
     ]
    }
   ],
   "source": [
    "# Set working directories - %env useful for bash\n",
    "data_dir = \"/home/sam/data/genomes\"\n",
    "%env data_dir = /home/sam/data/genomes\n",
    "work_dir = \"/home/sam/analyses/20190626_Pgenerosa_v074_repeatmodeler\"\n",
    "%env work_dir = /home/sam/analyses/20190626_Pgenerosa_v074_repeatmodeler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: Pgenerosa_v074_fasta=/home/sam/data/genomes/Pgenerosa_v074.fa\n",
      "env: rptm_db_name=Pgenerosa_v074\n"
     ]
    }
   ],
   "source": [
    "# Set file paths/names\n",
    "%env Pgenerosa_v074_fasta = /home/sam/data/genomes/Pgenerosa_v074.fa\n",
    "%env rptm_db_name = Pgenerosa_v074"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: rptm=/home/shared/RepeatModeler-open-1.0.11\n"
     ]
    }
   ],
   "source": [
    "# Set program paths\n",
    "%env rptm = /home/shared/RepeatModeler-open-1.0.11"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "mkdir --parents \"${work_dir}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 914M\n",
      "-rw-rw-rw- 1 sam sam 914M Jun 26 08:49 Pgenerosa_v074.fa\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "real\t2m52.522s\n",
      "user\t0m14.052s\n",
      "sys\t0m5.720s\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Stop if the data directory is missing so the genome is not copied into the wrong place\n",
    "cd \"${data_dir}\" || exit 1\n",
    "time \\\n",
    "rsync \\\n",
    "--archive \\\n",
    "owl:/volume1/web/halfshell/genomic-databank/Pgenerosa_v074.fa \\\n",
    ".\n",
    "ls -lh\n",
    "\n",
    "# Uncomment following line(s) to download from web\n",
    "# wget https://owl.fish.washington.edu/halfshell/genomic-databank/Pgenerosa_v074.fa"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Create RepeatModeler database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "------------------------------------------------------------------------\n",
      "\n",
      "------------------------------------------------------------------------\n",
      "Building database Pgenerosa_v074:\n",
      " Adding /home/sam/data/genomes/Pgenerosa_v074.fa to database\n",
      "Number of sequences (bp) added to database: 18 ( 942353201 bp )\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "real\t0m25.107s\n",
      "user\t0m22.716s\n",
      "sys\t0m1.740s\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Stop if the working directory is missing; the database and log are written relative to it\n",
    "cd \"${work_dir}\" || exit 1\n",
    "\n",
    "# stdout and stderr of BuildDatabase both go to database_build_run.out\n",
    "time \\\n",
    "perl \"${rptm}/BuildDatabase\" \\\n",
    "-name \"${rptm_db_name}\" \\\n",
    "-engine ncbi \\\n",
    "\"${Pgenerosa_v074_fasta}\" \\\n",
    ">& database_build_run.out\n",
    "\n",
    "echo \"------------------------------------------------------------------------\"\n",
    "echo \"\"\n",
    "echo \"------------------------------------------------------------------------\"\n",
    "cat \"${work_dir}/database_build_run.out\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Run RepeatModeler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "real\t1974m13.579s\n",
      "user\t22777m27.440s\n",
      "sys\t44m16.452s\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "# Stop if the working directory is missing; run.out is written relative to it\n",
    "cd \"${work_dir}\" || exit 1\n",
    "# stdout and stderr of RepeatModeler both go to run.out\n",
    "time \\\n",
    "perl \"${rptm}/RepeatModeler\" \\\n",
    "-database \"${work_dir}\"/\"${rptm_db_name}\" \\\n",
    "-engine ncbi \\\n",
    "-pa 16 \\\n",
    ">& run.out\n",
    "\n",
    "# Email a notification once the command above has returned\n",
    "sed '/^Subject:/ s/ / repeatmodeler JOB COMPLETE/' ~/.default-subject.mail | msmtp \"$EMAIL\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " - Saving elements to a file...\n",
      " - 16 elements found.\n",
      "Element Gathering: 00:00:00 (hh:mm:ss) Elapsed Time\n",
      "Refining family-2168 model...\n",
      " - numRounds = 5\n",
      " - Consensus Length = 524 ( orig = 525 )\n",
      " - Avg Kimura Divergence = 0.01\n",
      " - Unaligned sequences = 2 ( orig = 2 )\n",
      " Build Consensus: 0:0:1 Elapsed Time\n",
      "Refinement: 00:00:01 (hh:mm:ss) Elapsed Time\n",
      "\n",
      "Processing RECON family: 2536\n",
      " - Saving elements to a file...\n",
      " - 16 elements found.\n",
      "Element Gathering: 00:00:00 (hh:mm:ss) Elapsed Time\n",
      "Refining family-2536 model...\n",
      " - numRounds = 5\n",
      " - Consensus Length = 426 ( orig = 430 )\n",
      " - Avg Kimura Divergence = 0.01\n",
      " - Unaligned sequences = 1 ( orig = 3 )\n",
      " Build Consensus: 0:0:1 Elapsed Time\n",
      "Refinement: 00:00:01 (hh:mm:ss) Elapsed Time\n",
      "Family Refinement: 00:22:39 (hh:mm:ss) Elapsed Time\n",
      "Round Time: 15:23:11 (hh:mm:ss) Elapsed Time\n",
      "\n",
      "Discovery complete: 2029 families found\n",
      "Classifying Repeats...\n",
      "RepeatClassifier Version open-1.0.11\n",
      "===============================\n",
      "Search Engine = ncbi\n",
      " - Looking for Simple and Low Complexity sequences..\n",
      " - Looking for similarity to known repeat proteins..\n",
      " - Looking for similarity to known repeat consensi..\n",
      "Classification Time: 01:31:26 (hh:mm:ss) Elapsed Time\n",
      "Program Time: 32:54:13 (hh:mm:ss) Elapsed Time\n",
      "Working directory: /home/sam/analyses/20190626_Pgenerosa_v074_repeatmodeler/RM_8927.WedJun261133512019\n",
      "may be deleted unless there were problems with the run.\n",
      "\n",
      "The results have been saved to:\n",
      " /home/sam/analyses/20190626_Pgenerosa_v074_repeatmodeler/Pgenerosa_v074-families.fa - Consensus sequences for each family identified.\n",
      " /home/sam/analyses/20190626_Pgenerosa_v074_repeatmodeler/Pgenerosa_v074-families.stk - Seed alignments for each family identified.\n",
      "\n",
      "This version of RepeatModeler can upload families directly to the\n",
      "open repeat database - Dfam_consensus. Please consider uploading your final\n",
      "curated library using the RepeatModeler \"util/dfamConsensusTool.pl\" script\n",
      "( details at http://www.repeatmasker.org/RepeatModeler/dfamConsensusTool ) or\n",
      "posting your raw (uncurated) RepeatModeler results to the TE Raw Dataset\n",
      "Repository ( http://www.repeatmasker.org/Dfam_consensus/#/public/repository ).\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Show the end of the RepeatModeler log\n",
    "tail -n 50 \"${work_dir}/run.out\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Stop if the analyses directory is missing so --relative resolves from the intended parent\n",
    "cd /home/sam/analyses/ || exit 1\n",
    "# NOTE(review): destination directory is named 'Atumefaciens' although this analysis is Pgenerosa - confirm the path\n",
    "rsync --archive --relative ./20190626_Pgenerosa_v074_repeatmodeler gannet:/volume2/web/Atumefaciens\n",
    "\n",
    "# Email a notification once rsync has returned\n",
    "sed '/^Subject:/ s/ / rsync JOB COMPLETE/' ~/.default-subject.mail | msmtp \"$EMAIL\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}