{ "metadata": { "name": "", "signature": "sha256:813ccbb0d395bff71eecd0d0f92b28a4049bc7a014053bc72064b5243d792519" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Determing CpG ratio per Gene Function" ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Step 1: Obtain GO slim information (gene function)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#from 00 notebook\n", "!head ../data/Piura_v1_GOslim.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Column1,GOSlim_bin\r", "\r\n", "PiuraChilensis_v1_contig_7798,protein metabolism\r", "\r\n", "PiuraChilensis_v1_contig_3751,other metabolic processes\r", "\r\n", "PiuraChilensis_v1_contig_11851,other metabolic processes\r", "\r\n", "PiuraChilensis_v1_contig_12118,protein metabolism\r", "\r\n", "PiuraChilensis_v1_contig_12118,transport\r", "\r\n", "PiuraChilensis_v1_contig_12118,cell organization and biogenesis\r", "\r\n", "PiuraChilensis_v1_contig_12118,signal transduction\r", "\r\n", "PiuraChilensis_v1_contig_12118,developmental processes\r", "\r\n", "PiuraChilensis_v1_contig_3855,developmental processes\r", "\r\n" ] } ], "prompt_number": 1 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Step 2: Determine CpG ratio for each gene " ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Step 2a: Convert fasta to tab-delimited" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!perl -e '$count=0; $len=0; while(<>) {s/\\r?\\n//; s/\\t/ /g; if (s/^>//) { if ($. != 1) {print \"\\n\"} s/ |$/\\t/; $count++; $_ .= \"\\t\";} else {s/ //g; $len += length($_)} print $_;} print \"\\n\"; warn \"\\nConverted $count FASTA records in $. lines to tabular format\\nTotal sequence length: $len\\n\\n\";' \\\n", "../data/Piura_v1_contigs.fa > ../data/Piura_v1_contigs.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "Converted 15022 FASTA records in 384556 lines to tabular format\r\n", "Total sequence length: 21729367\r\n", "\r\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -1 ../data/Piura_v1_contigs.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "PiuraChilensis_v1_contig_1\t\tATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC\r\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "#temp replace name so c or g will not confound\n", "!sed 's/PiuraChilensis_v1_contig/999999/g' <../data/Piura_v1_contigs.tab> ../data/Piura_v1-99_contigs.tab\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -1 ../data/Piura_v1-99_contigs.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "999999_1\t\tATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC\r\n" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "#add column with length of sequence\n", "!perl -e '$col = 2;' -e 'while (<>) { s/\\r?\\n//; @F = split /\\t/, $_; $len = length($F[$col]); print \"$_\\t$len\\n\" } warn \"\\nAdded column with length of column $col for $. lines.\\n\\n\";' \\\n", "../data/Piura_v1-99_contigs.tab > ../data/Piura_v1-99-l_contigs.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "Added column with length of column 2 for 15022 lines.\r\n", "\r\n" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -1 ../data/Piura_v1-99-l_contigs.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "999999_1\t\tATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC\t1168\r\n" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "!awk -F\\CG '{print NF-1}' ../data/Piura_v1-99-l_contigs.tab > ../data/Piura_v1-99-l_contigs__CG.tab\n", "!awk -F\\C '{print NF-1}' ../data/Piura_v1-99-l_contigs.tab > ../data/Piura_v1-99-l_contigs__C.tab\n", "!awk -F\\G '{print NF-1}' ../data/Piura_v1-99-l_contigs.tab > ../data/Piura_v1-99-l_contigs__G.tab\n", "!paste ../data/Piura_v1-99-l_contigs.tab \\\n", "../data/Piura_v1-99-l_contigs__CG.tab \\\n", "../data/Piura_v1-99-l_contigs__C.tab \\\n", "../data/Piura_v1-99-l_contigs__G.tab \\\n", "> ../data/Piura_v1-99-l_contigs__C-G.tab\n", "!head -1 ../data/Piura_v1-99-l_contigs__C-G.tab\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "999999_1\t\tATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC\t1168\t15\t203\t202\r\n" ] } ], "prompt_number": 28 }, { "cell_type": "markdown", "metadata": {}, "source": [ "![](http://eagle.fish.washington.edu/cnidarian/skitch/BMC_Genomics___Full_text___DNA_methylation_patterns_provide_insight_into_epigenetic_regulation_in_the_Pacific_oyster__Crassostrea_gigas__1A0683A5.png)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!awk '{print $1, \"\\t\", (($4)/($5*$6))*(($3**2)/($3-1))}' \\\n", "../data/Piura_v1-99-l_contigs__C-G.tab \\\n", "| sed 's/999999/PiuraChilensis_v1_contig/g' > ../data/Piura_v1_CpG.tab\n", "!head ../data/Piura_v1_CpG.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "PiuraChilensis_v1_contig_1 \t 0.427621\r\n", "PiuraChilensis_v1_contig_2 \t 0.600881\r\n", "PiuraChilensis_v1_contig_3 \t 0.750945\r\n", "PiuraChilensis_v1_contig_4 \t 1.22757\r\n", "PiuraChilensis_v1_contig_5 \t 0.848172\r\n", "PiuraChilensis_v1_contig_6 \t 0.86292\r\n", "PiuraChilensis_v1_contig_7 \t 0.718647\r\n", "PiuraChilensis_v1_contig_8 \t 1.03897\r\n", "PiuraChilensis_v1_contig_9 \t 0.97759\r\n", "PiuraChilensis_v1_contig_10 \t 0.817866\r\n" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "!ls ../data" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Cgigas-HS-count.txt\r\n", "PiuraC_Coq_Trinity.fasta\r\n", "PiuraC_Val_Trinity.fasta\r\n", "PiuraC_Val_Trinity_2ndhalf.fasta\r\n", "PiuraC_Val_Trinity_uniprot_sprot.tab\r\n", "PiuraC_Val_Trinity_uniprot_sprot_2ndhalf.tab\r\n", "Piura_counts.txt\r\n", "Piura_v1-99-l_contigs.tab\r\n", "Piura_v1-99-l_contigs__C-G.tab\r\n", "Piura_v1-99-l_contigs__C.tab\r\n", "Piura_v1-99-l_contigs__CG.tab\r\n", "Piura_v1-99-l_contigs__G.tab\r\n", "Piura_v1-99_contigs.tab\r\n", "Piura_v1_CpG.tab\r\n", "Piura_v1_GOslim.csv\r\n", "Piura_v1_contigs.fa\r\n", "Piura_v1_contigs.tab\r\n", "Piura_v1_uniprot_sprot.tab\r\n", "Piura_v1_uniprot_sprot_sql.tab\r\n", "\u001b[34mwd\u001b[m\u001b[m\r\n" ] } ], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "!rm ../data/Piura_v1-99*" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 31 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Join with GO Slim" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!head ../data/Piura_v1_GOslim.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Column1,GOSlim_bin\r", "\r\n", "PiuraChilensis_v1_contig_7798,protein metabolism\r", "\r\n", "PiuraChilensis_v1_contig_3751,other metabolic processes\r", "\r\n", "PiuraChilensis_v1_contig_11851,other metabolic processes\r", "\r\n", "PiuraChilensis_v1_contig_12118,protein metabolism\r", "\r\n", "PiuraChilensis_v1_contig_12118,transport\r", "\r\n", "PiuraChilensis_v1_contig_12118,cell organization and biogenesis\r", "\r\n", "PiuraChilensis_v1_contig_12118,signal transduction\r", "\r\n", "PiuraChilensis_v1_contig_12118,developmental processes\r", "\r\n", "PiuraChilensis_v1_contig_3855,developmental processes\r", "\r\n" ] } ], "prompt_number": 32 }, { "cell_type": "code", "collapsed": false, "input": [ "!tr ',' \"\\t\" <../data/Piura_v1_GOslim.csv> ../data/Piura_v1_GOslim.tab" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "!sort ../data/Piura_v1_GOslim.tab | tail -n +2 > ../data/Piura_v1_GOslim.sorted\n", "!awk -F $'\\t' '{print $1, \"\\t\", $2}' ../data/Piura_v1_GOslim.sorted > ../data/Piura_v1_GOslim.sortedtab\n", "!head ../data/Piura_v1_GOslim.sortedtab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "PiuraChilensis_v1_contig_100 \t death\r", "\r\n", "PiuraChilensis_v1_contig_100 \t other biological processes\r", "\r\n", "PiuraChilensis_v1_contig_100 \t signal transduction\r", "\r\n", "PiuraChilensis_v1_contig_100 \t stress response\r", "\r\n", "PiuraChilensis_v1_contig_100 \t transport\r", "\r\n", "PiuraChilensis_v1_contig_1000 \t other biological processes\r", "\r\n", "PiuraChilensis_v1_contig_1000 \t other metabolic processes\r", "\r\n", "PiuraChilensis_v1_contig_10003 \t RNA metabolism\r", "\r\n", "PiuraChilensis_v1_contig_10003 \t other biological processes\r", "\r\n", "PiuraChilensis_v1_contig_10003 \t protein metabolism\r", "\r\n" ] } ], "prompt_number": 151 }, { "cell_type": "code", "collapsed": false, "input": [ "!sort ../data/Piura_v1_CpG.tab > ../data/Piura_v1_CpG.sorted\n", "!awk -F $'\\t' '{print $1, \"\\t\", $2}' ../data/Piura_v1_CpG.sorted > ../data/Piura_v1_CpG.sortedtab\n", "!head ../data/Piura_v1_CpG.sortedtab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "PiuraChilensis_v1_contig_1 \t 0.427621\r\n", "PiuraChilensis_v1_contig_10 \t 0.817866\r\n", "PiuraChilensis_v1_contig_100 \t 0.914473\r\n", "PiuraChilensis_v1_contig_1000 \t 0.792597\r\n", "PiuraChilensis_v1_contig_10000 \t 0.947282\r\n", "PiuraChilensis_v1_contig_10001 \t 0.691634\r\n", "PiuraChilensis_v1_contig_10002 \t 0.936689\r\n", "PiuraChilensis_v1_contig_10003 \t 0.371111\r\n", "PiuraChilensis_v1_contig_10004 \t 0.89432\r\n", "PiuraChilensis_v1_contig_10005 \t 0.873687\r\n" ] } ], "prompt_number": 152 }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "SELECT * FROM [sr320@washington.edu].[Piura_v1_CpG.sorted]cpg\n", " left join\n", "[sr320@washington.edu].[Piura_v1_GOslim.sorted\u200b]go\n", " on\n", " cpg.Column1=go.Column1\n", "``` \n" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!head ../data/Piura_v1_CpG-GOslim.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Column1,Column2,Column1,Column2\r", "\r\n", "PiuraChilensis_v1_contig_1,0.427621,,\r", "\r\n", "PiuraChilensis_v1_contig_10,0.817866,,\r", "\r\n", "PiuraChilensis_v1_contig_100,0.914473,PiuraChilensis_v1_contig_100,death\r", "\r\n", "PiuraChilensis_v1_contig_100,0.914473,PiuraChilensis_v1_contig_100,other biological processes\r", "\r\n", "PiuraChilensis_v1_contig_100,0.914473,PiuraChilensis_v1_contig_100,signal transduction\r", "\r\n", "PiuraChilensis_v1_contig_100,0.914473,PiuraChilensis_v1_contig_100,stress response\r", "\r\n", "PiuraChilensis_v1_contig_100,0.914473,PiuraChilensis_v1_contig_100,transport\r", "\r\n", "PiuraChilensis_v1_contig_1000,0.792597,PiuraChilensis_v1_contig_1000,other biological processes\r", "\r\n", "PiuraChilensis_v1_contig_1000,0.792597,PiuraChilensis_v1_contig_1000,other metabolic processes\r", "\r\n" ] } ], "prompt_number": 167 }, { "cell_type": "code", "collapsed": false, "input": [ "!tr ',' \"\\t\" <../data/Piura_v1_CpG-GOslim.csv> ../data/Piura_v1_CpG-GOslim.tab" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 168 }, { "cell_type": "code", "collapsed": false, "input": [ "!head ../data/Piura_v1_CpG-GOslim.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Column1\tColumn2\tColumn1\tColumn2\r", "\r\n", "PiuraChilensis_v1_contig_1\t0.427621\t\t\r", "\r\n", "PiuraChilensis_v1_contig_10\t0.817866\t\t\r", "\r\n", "PiuraChilensis_v1_contig_100\t0.914473\tPiuraChilensis_v1_contig_100\tdeath\r", "\r\n", "PiuraChilensis_v1_contig_100\t0.914473\tPiuraChilensis_v1_contig_100\tother biological processes\r", "\r\n", "PiuraChilensis_v1_contig_100\t0.914473\tPiuraChilensis_v1_contig_100\tsignal transduction\r", "\r\n", "PiuraChilensis_v1_contig_100\t0.914473\tPiuraChilensis_v1_contig_100\tstress response\r", "\r\n", "PiuraChilensis_v1_contig_100\t0.914473\tPiuraChilensis_v1_contig_100\ttransport\r", "\r\n", "PiuraChilensis_v1_contig_1000\t0.792597\tPiuraChilensis_v1_contig_1000\tother biological processes\r", "\r\n", "PiuraChilensis_v1_contig_1000\t0.792597\tPiuraChilensis_v1_contig_1000\tother metabolic processes\r", "\r\n" ] } ], "prompt_number": 169 }, { "cell_type": "code", "collapsed": false, "input": [ "!awk -F $'\\t' '{print $1, \"\\t\", $2,\" \\t\", $4}' ../data/Piura_v1_CpG-GOslim.tab | tail -n +2 > ../data/Piura_v1_CpG-slim.tab\n", "!head ../data/Piura_v1_CpG-slim.tab\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "PiuraChilensis_v1_contig_1 \t 0.427621 \t \r", "\r\n", "PiuraChilensis_v1_contig_10 \t 0.817866 \t \r", "\r\n", "PiuraChilensis_v1_contig_100 \t 0.914473 \t death\r", "\r\n", "PiuraChilensis_v1_contig_100 \t 0.914473 \t other biological processes\r", "\r\n", "PiuraChilensis_v1_contig_100 \t 0.914473 \t signal transduction\r", "\r\n", "PiuraChilensis_v1_contig_100 \t 0.914473 \t stress response\r", "\r\n", "PiuraChilensis_v1_contig_100 \t 0.914473 \t transport\r", "\r\n", "PiuraChilensis_v1_contig_1000 \t 0.792597 \t other biological processes\r", "\r\n", "PiuraChilensis_v1_contig_1000 \t 0.792597 \t other metabolic processes\r", "\r\n", "PiuraChilensis_v1_contig_10000 \t 0.947282 \t \r", "\r\n" ] } ], "prompt_number": 170 }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 171 }, { "cell_type": "code", "collapsed": false, "input": [ "Piura = pd.read_table('../data/Piura_v1_CpG-slim.tab', header=None)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 172 }, { "cell_type": "code", "collapsed": false, "input": [ "Piura" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "<class 'pandas.core.frame.DataFrame'>\n", "Int64Index: 27267 entries, 0 to 27266\n", "Data columns (total 3 columns):\n", "0 27267 non-null values\n", "1 27267 non-null values\n", "2 27267 non-null values\n", "dtypes: float64(1), object(2)\n", "" ], "metadata": {}, "output_type": "pyout", "prompt_number": 173, "text": [ "