{ "metadata": { "name": "", "signature": "sha256:3ed6edd64bb79155e7d2d00ced42d3ed1414b26227c1988bf7eda93273165621" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Transcriptome Exploration" ] }, { "cell_type": "heading", "level": 3, "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "First, let's get set up with a working directory" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pwd" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "u'/Users/sr320/Dropbox/Steven/eimd/ipynbs'" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "cd " ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Users/sr320\n" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "ls" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\u001b[34mApplications\u001b[m\u001b[m/ \u001b[34mGeneious 5.5 Data\u001b[m\u001b[m/ \u001b[34manaconda\u001b[m\u001b[m/\r\n", "BiGo_RNAseq.ipynb HOBOwareInstall.txt \u001b[34mapollo-webstart\u001b[m\u001b[m/\r\n", "BlackAb_Annot.ipynb \u001b[34mLibrary\u001b[m\u001b[m/ asample.html\r\n", "\u001b[34mCLC_Data\u001b[m\u001b[m/ \u001b[34mMovies\u001b[m\u001b[m/ \u001b[34mblast2go\u001b[m\u001b[m/\r\n", "\u001b[34mClueGOConfiguration\u001b[m\u001b[m/ \u001b[34mMusic\u001b[m\u001b[m/ \u001b[34migv\u001b[m\u001b[m/\r\n", "\u001b[34mCytoscapeConfiguration\u001b[m\u001b[m/ \u001b[34mPictures\u001b[m\u001b[m/ outputfile\r\n", "\u001b[34mDesktop\u001b[m\u001b[m/ \u001b[34mPublic\u001b[m\u001b[m/ 
\u001b[34msqlshare-pythonclient\u001b[m\u001b[m/\r\n", "\u001b[34mDocuments\u001b[m\u001b[m/ StencylWorks.prefs \u001b[34mstencylworks\u001b[m\u001b[m/\r\n", "\u001b[34mDownloads\u001b[m\u001b[m/ TJGR_pearl.ipynb test\r\n", "\u001b[34mDropbox\u001b[m\u001b[m/ \u001b[34mVirtualBox VMs\u001b[m\u001b[m/\r\n" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "cd Desktop/" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Users/sr320/Desktop\n" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "mkdir myawesome_dir" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "ls -d my*" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\u001b[34mmyawesome_dir\u001b[m\u001b[m/\r\n" ] } ], "prompt_number": 19 }, { "cell_type": "heading", "level": 1, "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Download the fasta file" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!curl -O http://eagle.fish.washington.edu/whale/eimd_14/Phel_transcriptome_clc.fa" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\r\n", " Dload Upload Total Spent Left Speed\r\n", "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 0 43.2M 0 245k 0 0 317k 0 0:02:19 --:--:-- 0:02:19 814k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 1 43.2M 1 557k 0 0 310k 0 0:02:22 0:00:01 0:02:21 421k" ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ "\r", " 2 43.2M 2 889k 0 0 319k 0 0:02:18 0:00:02 0:02:16 385k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 2 43.2M 2 1314k 0 0 348k 0 0:02:07 0:00:03 0:02:04 398k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 3 43.2M 3 1517k 0 0 314k 0 0:02:20 0:00:04 0:02:16 348k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 3 43.2M 3 1665k 0 0 288k 0 0:02:33 0:00:05 0:02:28 283k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 4 43.2M 4 1999k 0 0 294k 0 0:02:30 0:00:06 0:02:24 289k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 6 43.2M 6 2935k 0 0 377k 0 0:01:57 0:00:07 0:01:50 409k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 8 43.2M 8 3964k 0 0 451k 0 0:01:38 0:00:08 0:01:30 529k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 11 43.2M 11 5273k 0 0 539k 0 0:01:22 0:00:09 0:01:13 758k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 15 43.2M 15 6872k 0 0 637k 0 0:01:09 0:00:10 0:00:59 1041k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 18 43.2M 18 7987k 0 0 678k 0 0:01:05 0:00:11 0:00:54 1198k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 18 43.2M 18 8277k 0 0 647k 0 0:01:08 0:00:12 0:00:56 1064k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 20 43.2M 20 8881k 0 0 644k 0 0:01:08 0:00:13 0:00:55 982k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 21 43.2M 21 9431k 0 0 638k 0 0:01:09 0:00:14 0:00:55 831k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 22 43.2M 22 9.8M 0 0 638k 0 0:01:09 0:00:15 0:00:54 638k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 24 43.2M 24 10.4M 0 0 640k 0 0:01:09 0:00:16 0:00:53 549k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 24 43.2M 24 10.8M 0 0 621k 0 0:01:11 0:00:17 
0:00:54 557k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 25 43.2M 25 10.9M 0 0 597k 0 0:01:14 0:00:18 0:00:56 467k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 26 43.2M 26 11.4M 0 0 595k 0 0:01:14 0:00:19 0:00:55 468k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 26 43.2M 26 11.6M 0 0 572k 0 0:01:17 0:00:20 0:00:57 364k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 27 43.2M 27 11.8M 0 0 555k 0 0:01:19 0:00:21 0:00:58 271k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 29 43.2M 29 12.5M 0 0 564k 0 0:01:18 0:00:22 0:00:56 361k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 31 43.2M 31 13.4M 0 0 579k 0 0:01:16 0:00:23 0:00:53 513k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 32 43.2M 32 14.2M 0 0 587k 0 0:01:15 0:00:24 0:00:51 556k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 33 43.2M 33 14.4M 0 0 574k 0 0:01:17 0:00:25 0:00:52 583k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 35 43.2M 35 15.3M 0 0 586k 0 0:01:15 0:00:26 0:00:49 721k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 36 43.2M 36 16.0M 0 0 590k 0 0:01:15 0:00:27 0:00:48 704k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 39 43.2M 39 16.8M 0 0 601k 0 0:01:13 0:00:28 0:00:45 704k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 41 43.2M 41 17.8M 0 0 612k 0 0:01:12 0:00:29 0:00:43 740k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 43 43.2M 43 18.6M 0 0 619k 0 0:01:11 0:00" ] }, { "output_type": "stream", "stream": "stdout", "text": [ ":30 0:00:41 852k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 45 43.2M 45 19.7M 0 0 635k 0 0:01:09 0:00:31 0:00:38 899k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 48 43.2M 48 20.8M 0 0 652k 0 0:01:07 0:00:32 
0:00:35 999k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 51 43.2M 51 22.1M 0 0 672k 0 0:01:05 0:00:33 0:00:32 1085k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 54 43.2M 54 23.5M 0 0 694k 0 0:01:03 0:00:34 0:00:29 1181k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 57 43.2M 57 24.7M 0 0 709k 0 0:01:02 0:00:35 0:00:27 1261k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 59 43.2M 59 25.8M 0 0 718k 0 0:01:01 0:00:36 0:00:25 1247k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 62 43.2M 62 26.9M 0 0 731k 0 0:01:00 0:00:37 0:00:23 1249k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 64 43.2M 64 28.0M 0 0 740k 0 0:00:59 0:00:38 0:00:21 1198k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 68 43.2M 68 29.4M 0 0 757k 0 0:00:58 0:00:39 0:00:19 1199k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 70 43.2M 70 30.4M 0 0 765k 0 0:00:57 0:00:40 0:00:17 1168k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 72 43.2M 72 31.3M 0 0 769k 0 0:00:57 0:00:41 0:00:16 1141k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 75 43.2M 75 32.5M 0 0 779k 0 0:00:56 0:00:42 0:00:14 1142k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 77 43.2M 77 33.7M 0 0 789k 0 0:00:56 0:00:43 0:00:13 1169k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 80 43.2M 80 34.7M 0 0 795k 0 0:00:55 0:00:44 0:00:11 1095k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 82 43.2M 82 35.8M 0 0 801k 0 0:00:55 0:00:45 0:00:10 1089k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 85 43.2M 85 37.0M 0 0 810k 0 0:00:54 0:00:46 0:00:08 1149k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 87 43.2M 87 37.8M 0 0 811k 0 0:00:54 0:00:47 0:00:07 1087k" ] }, { "output_type": "stream", 
"stream": "stdout", "text": [ "\r", " 89 43.2M 89 38.7M 0 0 814k 0 0:00:54 0:00:48 0:00:06 1030k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 90 43.2M 90 39.3M 0 0 809k 0 0:00:54 0:00:49 0:00:05 936k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 92 43.2M 92 40.0M 0 0 806k 0 0:00:54 0:00:50 0:00:04 851k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 93 43.2M 93 40.5M 0 0 801k 0 0:00:55 0:00:51 0:00:04 723k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 94 43.2M 94 40.8M 0 0 792k 0 0:00:55 0:00:52 0:00:03 612k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 95 43.2M 95 41.3M 0 0 786k 0 0:00:56 0:00:53 0:00:03 517k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 95 43.2M 95 41.4M 0 0 772k 0 0:00:57 0:00:55 0:00:02 417k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 96 43.2M 96 41.7M 0 0 764k 0 0:00:57 0:00:55 0:00:02 344k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 97 43.2M 97 42.0M 0 0 754k 0 0:00:58 0:00:57 0:00:01 291k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 98 43.2M 98 42.6M 0 0 756k 0 0:00:58 0:00:57 0:00:01 371k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 43.2M 100 43.2M 0 0 756k 0 0:00:58 0:00:58 --:--:-- 417k\r\n" ] } ], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "!head Phel_transcriptome_clc.fa" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">Phel_clc_contig_1\r\n", "CAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCC\r\n", "GCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAA\r\n", "CCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTT\r\n", "TAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGAT\r\n", 
"TTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACT\r\n", "ATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGT\r\n", "GTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTT\r\n", "GGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGA\r\n", "CTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCT\r\n" ] } ], "prompt_number": 21 }, { "cell_type": "heading", "level": 2, "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Question - How many Sequences?" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!fgrep -c \">\" Phel_transcriptome_clc.fa" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "30578\r\n" ] } ], "prompt_number": 25 }, { "cell_type": "heading", "level": 1, "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "How Long are these sequences?" ] }, { "cell_type": "code", "collapsed": false, "input": [ "Convert fasta to tabular" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!perl -e '$count=0; $len=0; while(<>) {s/\\r?\\n//; s/\\t/ /g; if (s/^>//) { if ($. != 1) {print \"\\n\"} s/ |$/\\t/; $count++; $_ .= \"\\t\";} else {s/ //g; $len += length($_)} print $_;} print \"\\n\"; warn \"\\nConverted $count FASTA records in $. 
lines to tabular format\\nTotal sequence length: $len\\n\\n\";' Phel_transcriptome_clc.fa > Phel_transcriptome_clc.tab" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "Converted 30578 FASTA records in 778053 lines to tabular format\r\n", "Total sequence length: 43945151\r\n", "\r\n" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -1 Phel_transcriptome_clc.tab" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Phel_clc_contig_1\t\tCAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC\r\n" ] } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "!perl -e '$col = 2;' -e 'while (<>) { s/\\r?\\n//; @F = split /\\t/, $_; $len = length($F[$col]); print \"$_\\t$len\\n\" } warn \"\\nAdded column with length of column $col for $. 
lines.\\n\\n\";' Phel_transcriptome_clc.tab > Phel_transcriptome_clc_length.tab\n" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "Added column with length of column 2 for 30578 lines.\r\n", "\r\n" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -1 Phel_transcriptome_clc_length.tab" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Phel_clc_contig_1\t\tCAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC\t905\r\n" ] } ], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "# Write the header row, overwriting fa.head so that re-running this cell never stacks duplicate header lines.\n", "# The table has four tab-separated columns; the second one (the FASTA description) is empty in this file.\n", "!printf \"contig\\tdescription\\tsequence\\tseq_len\\n\" > fa.head\n", "!cat fa.head Phel_transcriptome_clc_length.tab > Phel_transcriptome_clc_length2.tab" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "prompt_number": 90 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -2 Phel_transcriptome_clc_length2.tab" ], 
"language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "contig\tsequence\tseq_len\r\n", "Phel_clc_contig_1\t\tCAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC\t905\r\n" ] } ], "prompt_number": 110 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "\n", "# Read the tab-separated table (first line is the header row) into a pandas DataFrame.\n", "# read_table is called through the pd namespace so the cell works on a fresh kernel.\n", "Ph = pd.read_table(\"Phel_transcriptome_clc_length2.tab\", # name of the data file\n", "                   na_values=[\"\", \" \"]) # treat empty and single-space fields as missing values" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "prompt_number": 95 }, { "cell_type": "code", "collapsed": false, "input": [ "Ph.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n",
        "<class 'pandas.core.frame.DataFrame'>\n",
        "Index: 5 entries, Phel_clc_contig_1 to Phel_clc_contig_5\n",
        "Data columns (total 3 columns):\n",
        "contig      0  non-null values\n",
        "sequence    5  non-null values\n",
        "seq_len     5  non-null values\n",
        "dtypes: float64(1), int64(1), object(1)\n",
        "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 96, "text": [ " contig sequence \\\n", "Phel_clc_contig_1 NaN CAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCAT... \n", "Phel_clc_contig_2 NaN ATACTTCTAATTATTGTTCGCTGCAAAGGCACGTGTGAACATCTAC... \n", "Phel_clc_contig_3 NaN TGTGGATGAATTAGATACTAGCCCCCCCCCCGCCCACTGGCCGTCA... \n", "Phel_clc_contig_4 NaN AGCTTGAGCCTAATCCCTCTGTTTCCTATCAACGAAGTTTCGGATA... \n", "Phel_clc_contig_5 NaN AATAAAGTACCACTACGTGCACGCCAGTGTTACCACGCTCAGCTTG... \n", "\n", " seq_len \n", "Phel_clc_contig_1 905 \n", "Phel_clc_contig_2 735 \n", "Phel_clc_contig_3 4761 \n", "Phel_clc_contig_4 11084 \n", "Phel_clc_contig_5 3274 " ] } ], "prompt_number": 96 }, { "cell_type": "code", "collapsed": false, "input": [ "%pylab inline" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: pylab import has clobbered these variables: ['load', 'info', 'save', 'datetime', 'set_printoptions', 'unique']\n", "`%matplotlib` prevents importing * from pylab and numpy\n" ] } ], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "Ph ['seq_len'].hist(bins=1000);\n", "#Axis limits are changed using the axis([xmin, xmax, ymin, ymax]) function.\n", "plt.axis([0, 10000, 0, 5000])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 94, "text": [ "[0, 10000, 0, 5000]" ] }, { "metadata": {}, "output_type": "display_data", "png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAD9CAYAAACm2+DgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X9Q03eeP/DXx4K73VaC7NaAiXthIIBRFKqFXu/ar4oB\n6mig6y6V7UFw2Z2OnK329tsVnfl2Z3emEq+zU+1t2Z3ZpdcUryJj75DuaApWoLu1hPMHV8/oEveS\nKQkh7Ypg7MIF9P39Az++I/tBkOZDPpjnY+Yz5fMhCe88Kzz5vPIDgTFGAAAAE82L9AIAAECZUBAA\nACAJBQEAAJJQEAAAIAkFAQAAklAQAAAgaVoFodPp3CtWrPg0Ozv7XE5OThcR0cDAQILRaGxNS0vr\nyc/PbxkcHIwXL19TU7Nbr9c7MzIyLrW0tOSLx8+cObMqMzPzvF6vd+7YseNA+O8OAACEDWNsyk2n\n07muXLmSEHrs5Zdf/ud9+/b9hDFGFotl165duyyMMbpw4YJh5cqV3cFgMNblculSUlIu37x5U2CM\n0WOPPdZlt9tzGGP09NNPHzt+/HjhdL4+NmzYsGGb/W3aIybGmBC639zcbDKbzVYiIrPZbG1qaiom\nIjp69GhRaWnpodjY2FGdTudOTU29bLfbc30+X1IgEFggnoGUl5e/I14HAACUZ1oFIQgCW79+/YnV\nq1ef/s1vfvMjIiK/369Wq9V+IiK1Wu33+/1qIqK+vr7FWq3WI15Xq9V6vF6vZuJxjUbj9Xq9mvDe\nHQAACJeY6Vzo448//rukpCTfF1988YjRaGzNyMi4FPp5QRCYIAhhec+OcN0OAEA0mTjlCYdpnUEk\nJSX5iIgeeeSRL5555pn/6OrqylGr1f7+/v5EIiKfz5e0aNGiz4nGzwx6e3uXiNf1eDxarVbr0Wg0\nXo/How09rtFovFJfb6bzsvb2dlKpniIiRkSMVKp1dOLEiYjP8Way/fSnP434GpSyIQtkgSzuvsll\nyoL4y1/+8o1AILCAiOjLL798qKWlJT8zM/O8yWRqtlqtZiIiq9VqLi4ubiIiMplMzQ0NDVuCweB8\nl8uV7HQ69Tk5OV2JiYn9cXFx1+x2ey5jTKivry8TrwN/ze12R3oJioEsOGTBIQv5TTli8vv96mee\neeY/iIjGxsZinnvuuX/Lz89vWb169emSkpLGurq6Sp1O525sbCwhIjIYDI6SkpJGg8HgiImJGaut\nra0Sx0a1tbVVFRUVbw8PDz+4YcOGY4WFhTZ57x4AAMyUIOfpyUwIgsBmuqaOjg4qKnqFhoY6iIhI\npcqj997bQ3l5eeFc4qxob2+nNWvWRHoZioAsOGTBIQtOEARikXoMAmYf/uFzyIJDFhyykB8KQqHa\n29sjvQTFQBYcsuCQhfxQEAAAIAkFoVA4feaQBYcsOGQhPxQEAABIQkEoFOarHLLgkAWHLOSHggAA\nAEkoCIXCfJVDFhyy4JCF/FAQAAAgCQWhUJivcsiCQxYcspAfCgIAACShIBQK81UOWXDIgkMW8kNB\nAACAJBSEQmG+yiELDllwyEJ+KAgAAJCEglAozFc5ZMEhCw5ZyO++L4hNm75DgiCQIAgUF5cQ6eUA\nAMwZ931BDA9fIyJGRIwCgauRXs60Yb7KIQsOWXDIQn73fUEAAMDMoCAUCvNVDllwyIJDFvJDQQAA\ngCQUhEJhvsohCw5ZcMhCfigIAACQhIJQKMxXOWTBIQsOWcgPBQEAAJJQEAqF+SqHLDhkwSEL+aEg\nAABAEgpCoTBf5ZAFhyw4ZCE/FAQAAEhCQSgU5qscsuCQBYcs5IeCAAAASSgIhcJ8lUMWHLLgkIX8\nUBAAACAJBaFQmK9yyIJDFhyykB8KAgAAJKEgFArzVQ5ZcMiCQxbym1ZB3Lhx44Hs7OxzmzZtep+I\naGBgIMFoNLampaX15OfntwwODsaLl62pqdmt1+udGRkZl1paW
vLF42fOnFmVmZl5Xq/XO3fs2HEg\n/HcFAADCaVoFceDAgR0Gg8EhCAIjIrJYLNVGo7G1p6cnLS8v70OLxVJNRORwOAyHDx9+1uFwGGw2\nW2FVVVUtY0wgItq2bduv6urqKp1Op97pdOptNluhfHdr7sN8lUMWHLLgkIX8piwIj8ejPXbs2IYf\n/vCHvxV/2Dc3N5vMZrOViMhsNlubmpqKiYiOHj1aVFpaeig2NnZUp9O5U1NTL9vt9lyfz5cUCAQW\n5OTkdBERlZeXvyNeBwAAlClmqgu89NJLr7/22msvX7t2LU485vf71Wq12k9EpFar/X6/X01E1NfX\nt/jxxx/vFC+n1Wo9Xq9XExsbO6rVaj3icY1G4/V6vZrJvmZFRQXpdDoiIoqPj6esrKzb80bxt4bJ\n9sfGBomonYjWhNwi35/q+thX5r5IKeuJ1L54TCnrieT+mjVrFLWe2dwXP3a73SQrxtik2/vvv7+x\nqqrqTcYYtbW1rdm4ceP7jDGKj4+/Gnq5hQsXDjDGaPv27f9y8ODB58TjlZWVvz1y5Mjm06dPr1q/\nfn2rePyjjz56Urytidv4kmamvb2dqVRPMSLGiBhTqdYxIrq9/1VuGwBAqW79bLvrz/OZbHcdMZ06\ndeqJ5uZmU3Jysqu0tPTQyZMn15WVldWr1Wp/f39/IhGRz+dLWrRo0edE42cGvb29S8TrezwerVar\n9Wg0Gq/H49GGHtdoNF4Z+u6+MfE352iGLDhkwSEL+d21IPbu3bunt7d3icvlSm5oaNiybt26k/X1\n9WUmk6nZarWaiYisVqu5uLi4iYjIZDI1NzQ0bAkGg/NdLley0+nU5+TkdCUmJvbHxcVds9vtuYwx\nob6+vky8DgAAKNOUj0GEEp/FVF1dbSkpKWmsq6ur1Ol07sbGxhIiIoPB4CgpKWk0GAyOmJiYsdra\n2irxOrW1tVUVFRVvDw8PP7hhw4ZjhYWFtvDfnftH6Mw52iELDllwyEJ+wvj4SjkEQWAzXVNHRwcV\nFb1CQ0MdRESkUuXR0NBJIhJvTyCl3V8AgK9KEARit55lGk54JbVCYb7KIQsOWXDIQn4oCAAAkISC\nUCjMVzlkwSELDlnIDwUBAACSUBAKhfkqhyw4ZMEhC/mhIAAAQBIKQqEwX+WQBYcsOGQhPxQEAABI\nQkEoFOarHLLgkAWHLOSHggAAAEkoCIXCfJVDFhyy4JCF/FAQAAAgCQWhUJivcsiCQxYcspAfCgIA\nACShIBQK81UOWXDIgkMW8kNBAACAJBSEQmG+yiELDllwyEJ+KAgAAJCEglAozFc5ZMEhCw5ZyA8F\nAQAAklAQCoX5KocsOGTBIQv5oSAAAEASCkKhMF/lkAWHLDhkIT8UBAAASEJBKBTmqxyy4JAFhyzk\nh4IAAABJKAiFwnyVQxYcsuCQhfxQEAAAIAkFoVCYr3LIgkMWHLKQHwoCAAAkoSAUCvNVDllwyIJD\nFvJDQQAAgCQUhEJhvsohCw5ZcMhCfigIAACQhIJQKMxXOWTBIQsOWcjvrgUxMjLy9dzcXHtWVla3\nwWBw7N69u4aIaGBgIMFoNLampaX15OfntwwODsaL16mpqdmt1+udGRkZl1paWvLF42fOnFmVmZl5\nXq/XO3fs2HFAvrsEAADhcNeC+PrXvz7S1ta2tru7O+vTTz9d0dbWtvYPf/jD31sslmqj0dja09OT\nlpeX96HFYqkmInI4HIbDhw8/63A4DDabrbCqqqqWMSYQEW3btu1XdXV1lU6nU+90OvU2m61wNu7g\nXIX5KocsOGTBIQv5TTli+sY3vvEXIqJgMDj/xo0bDyxcuPBqc3OzyWw2W4mIzGaztampqZiI6OjR\no0WlpaWHYmNjR3U6nTs1NfWy3W7P9fl8SYFAYEFOTk4XEVF5efk74nUAAECZpiyImzdvzsvKyupW\nq9X+tWvXti1btuyC3+9Xq
9VqPxGRWq32+/1+NRFRX1/fYq1W6xGvq9VqPV6vVzPxuEaj8Xq9Xo0c\nd+juYkgQhNtbXFzC7C9hmjBf5ZAFhyw4ZCG/mKkuMG/evJvd3d1ZQ0NDqoKCgg/a2trWhn5eEAQm\nCAIL56IqKipIp9MREVF8fDxlZWXd/scgnlZOtj82NkhE7US0JuQWxf0xImq7dWwNBQLClLeHfexj\nH/tK2xc/drvdJCvG2LS3n//85//vtdde+7/p6emXfD5fImOM+vr6ktLT0y8xxqimpqa6pqamWrx8\nQUGBrbOzM9fn8yVmZGRcFI+/++67pc8///yvpb7G+JJmpr29nalUTzEixogYU6nWMSK6vX/nx+P7\nStXW1hbpJSgGsuCQBYcsuFs/y+7p5/l0truOmP785z9/S3yG0vDw8IOtra3G7OzscyaTqdlqtZqJ\niKxWq7m4uLiJiMhkMjU3NDRsCQaD810uV7LT6dTn5OR0JSYm9sfFxV2z2+25jDGhvr6+TLwOAAAo\n011HTD6fL8lsNltv3rw57+bNm/PKysrq8/LyPszOzj5XUlLSWFdXV6nT6dyNjY0lREQGg8FRUlLS\naDAYHDExMWO1tbVV4viptra2qqKi4u3h4eEHN2zYcKywsNA2G3dwrhJPKQFZhEIWHLKQnzB+dqIc\ngiCwma6po6ODiopeoaGhDiIiUqnyaGjoJBGJtyeEfDy+r7T7DwBwrwRBIHbrJQXhhFdSK1Tog1HR\nDllwyIJDFvJDQQAAgCQUhEJhvsohCw5ZcMhCfigIAACQhIJQKMxXOWTBIQsOWcgPBQEAAJJQEAqF\n+SqHLDhkwSEL+aEgAABAEgpCoTBf5ZAFhyw4ZCE/FAQAAEhCQSgU5qscsuCQBYcs5IeCAAAASSgI\nhcJ8lUMWHLLgkIX8UBAAACAJBaFQmK9yyIJDFhyykB8KAgAAJKEgFArzVQ5ZcMiCQxbyQ0EAAIAk\nFIRCYb7KIQsOWXDIQn4oCAAAkISCUCjMVzlkwSELDlnIDwUBAACSUBAKhfkqhyw4ZMEhC/mhIAAA\nQBIKQqEwX+WQBYcsOGQhPxQEAABIQkEoFOarHLLgkAWHLOQX5QURQ4IgkCAIFBeXEOnFAAAoSpQX\nxBgRMSJiFAhcjfRi7oD5KocsOGTBIQv5RXlBAADAZFAQCoX5KocsOGTBIQv5oSAAAEASCkKhMF/l\nkAWHLDhkIT8UBAAASEJBKBTmqxyy4JAFhyzkh4IAAABJUxZEb2/vkrVr17YtW7bswvLly//7jTfe\neJGIaGBgIMFoNLampaX15OfntwwODsaL16mpqdmt1+udGRkZl1paWvLF42fOnFmVmZl5Xq/XO3fs\n2HFAnrt0f8B8lUMWHLLgkIX8piyI2NjY0ddff/2lCxcuLOvs7Hz8zTff/MeLFy8utVgs1UajsbWn\npyctLy/vQ4vFUk1E5HA4DIcPH37W4XAYbDZbYVVVVS1jTCAi2rZt26/q6uoqnU6n3ul06m02W6Hc\ndxAAAGZmyoJITEzsz8rK6iYievjhh68vXbr0otfr1TQ3N5vMZrOViMhsNlubmpqKiYiOHj1aVFpa\neig2NnZUp9O5U1NTL9vt9lyfz5cUCAQW5OTkdBERlZeXvyNeB/4a5qscsuCQBYcs5HdPj0G43W7d\nuXPnsnNzc+1+v1+tVqv9RERqtdrv9/vVRER9fX2LtVqtR7yOVqv1eL1ezcTjGo3G6/V6NeG6IwAA\nEF4x073g9evXH968efN7Bw4c2LFgwYJA6OcEQWCCILBwLaqiooJ0Oh0REcXHx1NWVtbt3xbEueNk\n+2Njg0TUTkRrQm4xdL/91n8n7tO0bn+29sVjSllPJPe7u7tp586dillPJPf3799/T98P9/P+xO+V\nSK9nNvfFj91uN8mKMTblFgwGY/Pz8z94/fXXd4rH0tPTL/l8vkTGGPX19SWlp6dfYoxRTU1
NdU1N\nTbV4uYKCAltnZ2euz+dLzMjIuCgef/fdd0uff/75X0/8WuNLmpn29namUj3FiBgjYkylWjf+Tny3\n9u/8eOL+zL+uHNra2iK9BMVAFhyy4JAFd+vn17R+nt/LNuWIiTEmVFZW1hkMBsfOnTv3i8dNJlOz\n1Wo1ExFZrVZzcXFxk3i8oaFhSzAYnO9yuZKdTqc+JyenKzExsT8uLu6a3W7PZYwJ9fX1ZeJ14K+J\nvzEAsgiFLDhkIb8pR0wff/zx3x08ePAfVqxY8Wl2dvY5ovGnsVZXV1tKSkoa6+rqKnU6nbuxsbGE\niMhgMDhKSkoaDQaDIyYmZqy2trZKHD/V1tZWVVRUvD08PPzghg0bjhUWFtrkvXsAADBTwvjZiXII\ngsBmuqaOjg4qKnqFhoY6iIhIpcqjoaGTNP43H4iIhJCPJ+4LpKQs2tvb8RvSLciCQxYcsuAEQSB2\n6+UE4YRXUgMAgCQUhELhNyMOWXDIgkMW8kNBAACAJBSEQoU+3znaIQsOWXDIQn4oCAAAkISCUCjM\nVzlkwSELDlnIDwVxWwwJgnB7i4tLiPSCAAAiCgVx2xiNvyZifAsErkZ0NZivcsiCQxYcspAfCgIA\nACShIBQK81UOWXDIgkMW8kNBAACAJBSEQmG+yiELDllwyEJ+KAgAAJCEglAozFc5ZMEhCw5ZyA8F\nAQAAklAQCoX5KocsOGTBIQv5oSAAAEASCkKhMF/lkAWHLDhkIT8UxKRi8L5MABDVUBCT4u/NFIn3\nZcJ8lUMWHLLgkIX8UBAAACAJBaFQmK9yyIJDFhyykB8KAgAAJKEgFArzVQ5ZcMiCQxbyQ0EAAIAk\nFIRCYb7KIQsOWXDIQn4oCAAAkISCUCjMVzlkwSELDlnIDwUxLfxV1XhlNQBEC4ExFuk13EEQBDbT\nNXV0dFBR0Ss0NNRBREQqVR4NDZ2k8VdEExEJIR9P3L+3zyktNwCIXoIgEGNMCPft4gwCAAAkoSAU\nCvNVDllwyIJDFvJDQQAAgCQUhELhOd4csuCQBYcs5IeCAAAASSiIGZH/jwlhvsohCw5ZcMhCflMW\nxA9+8IO31Gq1PzMz87x4bGBgIMFoNLampaX15OfntwwODsaLn6upqdmt1+udGRkZl1paWvLF42fO\nnFmVmZl5Xq/XO3fs2HEg/HdlNkX2jwkBAMyGKQti69at/2qz2QpDj1kslmqj0dja09OTlpeX96HF\nYqkmInI4HIbDhw8/63A4DDabrbCqqqpWfG7utm3bflVXV1fpdDr1TqdTP/E24U6Yr3LIgkMWHLKQ\n35QF8eSTT/5+4cKFd/ya3NzcbDKbzVYiIrPZbG1qaiomIjp69GhRaWnpodjY2FGdTudOTU29bLfb\nc30+X1IgEFiQk5PTRURUXl7+jngdAABQppiZXMnv96vVarWfiEitVvv9fr+aiKivr2/x448/3ile\nTqvVerxeryY2NnZUq9V6xOMajcbr9Xo1k91+RUUF6XQ6IiKKj4+nrKys278tiHPHyfbHxgaJqJ2I\n1oTcYuh++63/TtynGe6Pr2G665vuvngsXLc3l/e7u7tp586dillPJPf3799/T98P9/P+xO+VSK9n\nNvfFj91uN8mKMTbl5nK5dMuXLz8v7sfHx18N/fzChQsHGGO0ffv2fzl48OBz4vHKysrfHjlyZPPp\n06dXrV+/vlU8/tFHHz25cePG96W+1viSZqa9vZ2pVE8xIsaIGFOp1o0/UHBr/86PJ+7P/HNyaGtr\nk+V25yJkwSELDllwt34OTevn+b1sM3oWk1qt9vf39ycSEfl8vqRFixZ9TjR+ZtDb27tEvJzH49Fq\ntVqPRqPxejwebehxjUbjnVGjKY48b+Qn/sYAyCIUsuCQhfxmVBAmk6nZarWaiYisVqu5uLi4STze\n0NCwJRgMzne5XMlOp1Ofk5PTlZiY2B8XF3fNbrfnMsa
E+vr6MvE6cx9/RhOe1QQA95WpTjG2bNly\nKCkpqS82Njao1Wp733rrra1XrlxJyMvLO6HX63uMRmPL1atX48XLv/rqq3tSUlIup6enX7LZbAXi\n8dOnT69avnz5+ZSUlMsvvPDCG5N9PZqDI6aJnwsHnD5zyIJDFhyy4EimEdOUD1IfOnSoVOr4iRMn\n1ksd37Nnz949e/bsnXh81apVZ86fP585vdoCAIBIwyupFQrzVQ5ZcMiCQxbyQ0EAAIAkFETYhed9\nmkKf7xztkAWHLDhkIb8ZvVAO7kZ8VhNRIBD2vwAIADBrcAahUJivcsiCQxYcspAfCgIAACShIBQK\n81UOWXDIgkMW8kNBAACAJBSErO58nyZBmD/tZzhhvsohCw5ZcMhCfngWk6z4M5rGCYRnOAHAXIEz\nCIXCfJVDFhyy4JCF/FAQAAAgCQWhUJivcsiCQxYcspAfCiJiZv4ANgDAbEBBRMydf2iIaPT2x4HA\nVcxXQyALDllwyEJ+KAgAAJCEglAozFc5ZMEhCw5ZyA8FoUh3Pj6BxyQAIBJQEIo0RkRtFPqYRDTD\nrJlDFhyykB8KAgAAJKEgFGtNyMfh+St1cxVmzRyy4JCF/FAQcwJ/SmwgEMDjEwAwK1AQitU+yfE7\nXz8RDY9PYNbMIQsOWcgPBTHnRff4CQDkg7f7Vqw107wcf0vx+/UtxDFr5pAFhyzkhzOI+wpePwEA\n4YOCUKz2GVxn4uMTAck3A5xr5YFZM4csOGQhP4yY7muhf9FOoNC/bne/jqMAIHxwBqFYa2S+/bnz\n4DZmzRyy4JCF/FAQUQuvrQCAu0NBKFb7LH4tZT92gVkzhyw4ZCE/FARICC2MUZq8PHC2AXA/Q0Eo\n1ppIL2ASk59tyFUWmDVzyIJDFvJDQcBXNPljGfg72wBzGwpCsdojvYAZuNvf2Z68PKZ6nAOzZg5Z\ncMhCfrNeEDabrTAjI+OSXq937tu3b9dsf/25ozvSCwizycvj7o9zzKe1a9dOq1ii4Yylu/t++3cx\nc8hCfrNaEDdu3Hhg+/btv7TZbIUOh8Nw6NCh0osXLy6dzTXMHYORXkAETXyQ/Kc0nWIJ1xmLkg0O\nRvO/izshC/nNakF0dXXlpKamXtbpdO7Y2NjRLVu2NBw9erRoNtcA0WLmZyzTLZaZns3ExSXcF2UF\n979ZLQiv16tZsmRJr7iv1Wo9Xq9XE67bnzdvHo2MnCeVahOpVJtoZOS/wnXTEeCO9AIUxC3z7U/+\ntN7pnrHcy9nM+N/wmFlZ/exnP5OtyGb7c3crzumUqlQWKNjwmtX3YhIEgU19KSJB+GrvE/S///u7\nibc4ycdK/pz11qaEtdztc7PxNeZKFlPlNN3riUbvsj/3PxcIXJ30e/1un5vpbcK9m9WC0Gg03t7e\n3iXifm9v7xKtVusJvQxjDP93AQAUYFZHTKtXrz7tdDr1brdbFwwG5x8+fPhZk8nUPJtrAACA6ZnV\nM4iYmJixX/7yl9sLCgo+uHHjxgOVlZV1S5cuvTibawAAgGlijCliO378eGF6evql1NRUp8Vi2RXp\n9YR7++yzz5asWbOmzWAwXFi2bNl/Hzhw4EXGGF25ciVh/fr1rXq9vsdoNLZcvXo1XrzO3r17d6em\npjrT09MvffDBB/ni8dOnT69avnz5+dTUVOeLL754INL3babb2NjYA1lZWec2btz4fjRncfXq1fjN\nmzcfycjIuLh06VJHZ2dnbrRmsXfv3t0Gg+HC8uXLz5eWlr47MjLytWjJYuvWrW8tWrTIv3z58vPi\nsXDe95GRka+VlJQcTk1Ndebm5na63e6/mWpNEQ+FsfEfFCkpKZddLpcuGAzGrly5stvhcCyN9LrC\nufl8vsRz585lMcYoEAg8nJaW9keHw7H05Zdf/ud9+/b9hDFGFotl165duyyMMbpw4YJh5cqV3cFg\nMNblculSUlIu37x
5U2CM0WOPPdZlt9tzGGP09NNPHzt+/HhhpO/fTLZf/OIX//T973//3zZt2tTM\nGKNozaK8vNxaV1f3A8YYjY6OxgwODqqiMQuXy6VLTk7+n5GRka8xxqikpOTw22+/bY6WLD766KMn\nz549mx1aEOG872+++WbVtm3bahlj1NDQ8Oyzzz7bMNWaIh4KY4xOnTr1twUFBTZxv6amprqmpqY6\n0uuScysqKmpqbW1dn56efqm/v1/N2HiJpKenX2Js/LeD0DOpgoIC2yeffPJ4X19fUkZGxkXx+KFD\nh7Y8//zzv470/bnXrbe3V5uXl3fi5MmTa8UziGjMYnBwUJWcnPw/E49HYxZXrlxJSEtL++PAwMDC\n0dHRmI0bN77f0tJijKYsXC6XLrQgwnnfCwoKbJ2dnbmMjf8i8q1vfeuLqdajiPdikvv1EUrjdrt1\n586dy87NzbX7/X61Wq32ExGp1Wq/3+9XExH19fUtDn2Gl5jJxOMajcY7F7N66aWXXn/ttddenjdv\n3k3xWDRm4XK5kh955JEvtm7d+q+PPvro2R/96Ee/+fLLLx+KxiwSEhIGfvzjH//i29/+9meLFy/u\ni4+PHzQaja3RmIUonPc99OdsTEzMmEqlGhoYGLjrC0cUURDTfX3E/eD69esPb968+b0DBw7sWLBg\nQSD0c4IgsGjI4ne/+93GRYsWfZ6dnX2OTfK05mjJYmxsLObs2bOPVlVV1Z49e/bRhx566EuLxVId\neployeJPf/pTyv79+3e63W5dX1/f4uvXrz988ODBfwi9TLRkISUS910RBTGd10fcD0ZHR2M3b978\nXllZWX1xcXET0fhvBf39/YlERD6fL2nRokWfE/11Jh6PR6vVaj0ajcbr8Xi0occ1Go13tu/LV3Hq\n1KknmpubTcnJya7S0tJDJ0+eXFdWVlYfjVlotVqPVqv1PPbYY/9JRPTd7373yNmzZx9NTEzsj7Ys\nTp8+vfqJJ5449c1vfvNKTEzM2He+851//+STT/42GrMQheN7QvxZqtFovJ999tm3icZ/MRkaGlIl\nJCQM3O3rK6IgouH1EYwxobKyss5gMDh27ty5XzxuMpmarVarmYjIarWaxeIwmUzNDQ0NW4LB4HyX\ny5XsdDr1OTk5XYmJif1xcXHX7HZ7LmNMqK+vLxOvM1fs3bt3T29v7xKXy5Xc0NCwZd26dSfr6+vL\nojGLxMTE/iVLlvT29PSkERGdOHFi/bJlyy5s2rTp/WjLIiMj41JnZ+fjw8PDDzLGhBMnTqw3GAyO\naMxCFI5WI+QZAAABDUlEQVTviaKioqMTb+vIkSPfzcvL+3DKBUT6QRlxO3bs2NNpaWl/TElJubx3\n797dkV5PuLff//73fy8Iws2VK1d2Z2VlncvKyjp3/PjxwitXriTk5eWdkHoa26uvvronJSXlcnp6\n+iWbzVYgHhefxpaSknL5hRdeeCPS9+2rbO3t7f9HfBZTtGbR3d29cvXq1f+5YsWK/3rmmWf+fXBw\nUBWtWezbt+8n4tNcy8vLrcFgMDZastiyZcuhpKSkvtjY2KBWq+196623tobzvo+MjHzte9/7XqP4\nNFeXy6Wbak0CY1E5zgMAgCkoYsQEAADKg4IAAABJKAgAAJCEggAAAEkoCAAAkISCAAAASf8fO21y\nJoztDIgAAAAASUVORK5CYII=\n", "text": [ "" ] } ], "prompt_number": 94 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Other tricks" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!head -2 Phel_transcriptome_clc_length2.tab" ], "language": "python", "metadata": { 
"slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "contig\tsequence\tseq_len\r\n", "Phel_clc_contig_1\t\tCAAATATATGAACGGTTGATTGTCAACGATTAGTACATGTTTTCATTGTTCCCCACGCCCGCCCCCCCCCACTCAAACATTTAAAGTGTGAAATATTATTTATCCACAAATTTCCTTAAACCTGCAAACTTGTCTGCTGTCTCTTATTGGAAGTTATGAAAAAGAACAACGGGTTTTCTTTAAAGGGTCTGCGTGCGATTTTCAACCTTTTGAGTAATAGCAGTTATTTTGATAACCGATTTTTTTCAAAGCTCAACAGCTTTTTAAAATAAGGAATCCTATAATGGCCAAACGAATACTATAAAAATAAGGGTTCTCTTAATTGTATAAAACGTATAATTTTATCAATTTTGGGACCGTGTAATTTTTTAAAGACCACAAGAATGTTACATACAACAAATAGACGAAACTCGTAGCTTTGGAAACTACGTCATGGGCGTTTGGTCAAAAGCTGGAGAGAAAGAGAGGTGGGGTGCCAGACTTAAGTAGTCACGTGATCTGACCAACGCACATCGGAAGCTCGATCGGATGAAATCTTCTCTATCGTTCTTGCGTCTATACGTGCTACGAAGAGCTGACAGAAGTTTGGACTTGTTTACTTCTTGCACCTGTTGATGGAACGGCCACGGACCTTGTCGCACGCACACCTGGAGCCAGTGCTCGGATCGACGCAACGGATGTACTGTCTTCCCCTTCCGCGTTTCTCAAGTAGGTACTCAAAGTCGTCCGCGTCGAAGTTGGCCTCGGCGTCCCTCTTCTCCAGCTCCTCCATGTCCTCCTCTGTGTAGTACGGGGTGACGAGCACCACCAGGGCGGCCACAATGGCCAGTGCTAGAAGACACTTCGTATTCATTCTGCTGGTGGTTGGATGTGCGCAAACAAGACAGGAGAGACTTATTAGAATC\t905\r\n" ] } ], "prompt_number": 101 }, { "cell_type": "code", "collapsed": false, "input": [ "!awk '{print \">\"$1,$2}' /Users/sr320/Desktop/Phel_transcriptome_clc_length2.tab | sort -g -r" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!fgrep \"Phel_clc_contig_2671\" Phel_transcriptome_clc_length2.tab | pbcopy" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 113 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Another way to get some basic statistics" ] }, { "cell_type": "code", "collapsed": false, "input": [ "mkdir scripts" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 75 }, { 
"cell_type": "code", "collapsed": false, "input": [ "cd scripts" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Users/sr320/Desktop/scripts\n" ] } ], "prompt_number": 76 }, { "cell_type": "code", "collapsed": false, "input": [ "!curl -O https://raw.githubusercontent.com/sr320/eimd/master/scripts/count_fasta.pl" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\r\n", " Dload Upload Total Spent Left Speed\r\n", "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 909 0 909 0 0 2081 0 --:--:-- --:--:-- --:--:-- 2094" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 15426 0 15426 0 0 22283 0 --:--:-- --:--:-- --:--:-- 22356\r\n" ] } ], "prompt_number": 79 }, { "cell_type": "code", "collapsed": false, "input": [ "cd .." 
], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Users/sr320/Desktop\n" ] } ], "prompt_number": 80 }, { "cell_type": "code", "collapsed": false, "input": [ "!perl /Users/sr320/Dropbox/Steven/eimd/scripts/count_fasta.pl -i 10000 Phel_transcriptome_clc.fa" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "0:9999 \t30486\r\n", "10000:19999 \t58\r\n", "20000:29999 \t11\r\n", "30000:39999 \t9\r\n", "40000:49999 \t7\r\n", "50000:59999 \t3\r\n", "60000:69999 \t2\r\n", "70000:79999 \t1\r\n", "80000:89999 \t0\r\n", "90000:99999 \t0\r\n", "100000:109999 \t1\r\n", "\r\n", "Total length of sequence:\t43945151 bp\r\n", "Total number of sequences:\t30578\r\n", "N25 stats:\t\t\t25% of total sequence length is contained in the 1930 sequences >= 3393 bp\r\n", "N50 stats:\t\t\t50% of total sequence length is contained in the 6425 sequences >= 1848 bp\r\n", "N75 stats:\t\t\t75% of total sequence length is contained in the 14670 sequences >= 977 bp\r\n", "Total GC count:\t\t\t18236282 bp\r\n", "GC %:\t\t\t\t41.50 %\r\n", "\r\n" ] } ], "prompt_number": 111 }, { "cell_type": "code", "collapsed": false, "input": [ "!perl /Users/sr320/Dropbox/Steven/eimd/scripts/count_fasta.pl -i 1000 ../data/Phel_transcriptome_clc_v3.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "0:999 \t15505\r\n", "1000:1999 \t8559\r\n", "2000:2999 \t3003\r\n", "3000:3999 \t1240\r\n", "4000:4999 \t580\r\n", "5000:5999 \t312\r\n", "6000:6999 \t129\r\n", "7000:7999 \t72\r\n", "8000:8999 \t31\r\n", "9000:9999 \t19\r\n", "10000:10999 \t9\r\n", "11000:11999 \t7\r\n", "12000:12999 \t2\r\n", "13000:13999 \t4\r\n", "14000:14999 \t3\r\n", "15000:15999 \t0\r\n", "16000:16999 \t1\r\n", "\r\n", "Total length of sequence:\t40747496 bp\r\n", 
"Total number of sequences:\t29476\r\n", "N25 stats:\t\t\t25% of total sequence length is contained in the 2260 sequences >= 3085 bp\r\n", "N50 stats:\t\t\t50% of total sequence length is contained in the 6715 sequences >= 1757 bp\r\n", "N75 stats:\t\t\t75% of total sequence length is contained in the 14612 sequences >= 959 bp\r\n", "Total GC count:\t\t\t16459121 bp\r\n", "GC %:\t\t\t\t40.39 %\r\n", "\r\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Sequence Similarity Search" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "BLAST - let's get comfortable with the command line; maybe add it to our PATH" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Making a database" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Download the FASTA file from http://www.uniprot.org/downloads" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "BIG Seqs" ] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }