{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Re-defining canonical C gigas Genome Tracks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "via Ensembl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"IGV_1AA0C920.png\"/" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "scaffold44098\tdust\trepeat_region\t518076\t518099\t.\t.\t.\tName=dust;class=dust;type=Dust\r\n", "scaffold44098\tdust\trepeat_region\t519261\t519281\t.\t.\t.\tName=dust;class=dust;type=Dust\r\n", "scaffold44098\ttrf\trepeat_region\t519261\t519281\t.\t.\t.\tName=trf;class=trf;repeat_consensus=AT;type=Tandem repeats\r\n" ] } ], "source": [ "!tail -3 /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "186890 CDS\r\n", " 5 RNA\r\n", "189468 exon\r\n", "26114 gene\r\n", " 28 miRNA\r\n", " 28 miRNA_gene\r\n", "1410 pseudogenic_tRNA\r\n", " 13 rRNA\r\n", " 13 rRNA_gene\r\n", "875275 repeat_region\r\n", " 47 snRNA\r\n", " 47 snRNA_gene\r\n", " 20 snoRNA\r\n", " 20 snoRNA_gene\r\n", " 994 tRNA_gene\r\n", "28523 transcript\r\n" ] } ], "source": [ "!cut -f 3 \\\n", "/Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5 EnsemblGenomes\tRNA\r\n", "2530 EnsemblGenomes\texon\r\n", " 13 EnsemblGenomes\tgene\r\n", " 28 EnsemblGenomes\tmiRNA\r\n", " 28 EnsemblGenomes\tmiRNA_gene\r\n", "1410 EnsemblGenomes\tpseudogenic_tRNA\r\n", " 13 EnsemblGenomes\trRNA\r\n", " 13 EnsemblGenomes\trRNA_gene\r\n", " 47 EnsemblGenomes\tsnRNA\r\n", " 47 EnsemblGenomes\tsnRNA_gene\r\n", " 20 EnsemblGenomes\tsnoRNA\r\n", " 20 EnsemblGenomes\tsnoRNA_gene\r\n", " 994 EnsemblGenomes\ttRNA_gene\r\n", "2422 EnsemblGenomes\ttranscript\r\n", "186890 GigaDB\tCDS\r\n", "186938 GigaDB\texon\r\n", "26101 GigaDB\tgene\r\n", "26101 GigaDB\ttranscript\r\n", "650376 dust\trepeat_region\r\n", "224899 trf\trepeat_region\r\n" ] } ], "source": [ "!cut -f 2,3 \\\n", "/Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "scaffold44098\tprotein_coding\tCDS\t509746\t510288\t.\t-\t0\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"2\"; protein_id \"EKC17988\";\r\n", "scaffold44098\tprotein_coding\texon\t514550\t514690\t.\t-\t.\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"1\"; seqedit \"false\";\r\n", "scaffold44098\tprotein_coding\tCDS\t514550\t514690\t.\t-\t0\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"1\"; protein_id \"EKC17988\";\r\n", "scaffold44098\tprotein_coding\tstart_codon\t514688\t514690\t.\t-\t0\t gene_id \"CGI_10017729\"; gene_version \"1\"; transcript_id \"EKC17988\"; transcript_version \"1\"; exon_number \"1\";\r\n", "scaffold44098\tprotein_coding\texon\t514859\t515511\t.\t-\t.\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"2\"; seqedit \"false\";\r\n", "scaffold44098\tprotein_coding\tstop_codon\t514859\t514861\t.\t-\t0\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"2\";\r\n", "scaffold44098\tprotein_coding\tCDS\t514862\t515511\t.\t-\t2\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"2\"; protein_id \"EKC17989\";\r\n", "scaffold44098\tprotein_coding\texon\t515871\t515877\t.\t-\t.\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"1\"; seqedit \"false\";\r\n", "scaffold44098\tprotein_coding\tCDS\t515871\t515877\t.\t-\t0\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"1\"; protein_id \"EKC17989\";\r\n", "scaffold44098\tprotein_coding\tstart_codon\t515875\t515877\t.\t-\t0\t gene_id \"CGI_10017730\"; gene_version \"1\"; transcript_id \"EKC17989\"; transcript_version \"1\"; exon_number \"1\";\r\n" ] } ], "source": [ "!tail /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gtf" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2 RNase_MRP_RNA\texon\r\n", " 1 RNase_P_RNA\texon\r\n", " 10 SRP_RNA\texon\r\n", " 28 miRNA\texon\r\n", " 5 misc_RNA\texon\r\n", " 48 nontranslating_CDS\texon\r\n", "186890 protein_coding\tCDS\r\n", "186890 protein_coding\texon\r\n", "25587 protein_coding\tstart_codon\r\n", "26087 protein_coding\tstop_codon\r\n", " 13 rRNA\texon\r\n", " 47 snRNA\texon\r\n", " 20 snoRNA\texon\r\n", " 994 tRNA\texon\r\n", "1410 tRNA_pseudogene\texon\r\n" ] } ], "source": [ "!cut -f 2,3 \\\n", "/Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gtf \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# via GigaDB aka version9" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff <==\r\n", "scaffold999\tflankbed\tpromoter\t99703\t100702\t.\t-\t.\tID=CGI_10006972;\r", "\r\n", "scaffold999\tflankbed\tpromoter\t106744\t107743\t.\t+\t.\tID=CGI_10006973;\r", "\r\n", "\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_CG.gff <==\r\n", "scaffold38980\tfuzznuc\tnucleotide_motif\t63903\t63904\t2\t+\t.\tID=scaffold38980.744;note=*pat pattern:CG\r\n", "scaffold38980\tfuzznuc\tnucleotide_motif\t64051\t64052\t2\t+\t.\tID=scaffold38980.745;note=*pat pattern:CG\r\n", "\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-TANDEMREPEAT.gff <==\r\n", "scaffold999\tTRF\tTandem_Repeat\t153009\t153196\t189\t+\t.\t.\r\n", "scaffold999\tTRF\tTandem_Repeat\t166754\t166792\t69\t+\t.\t.\r\n", "\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff <==\r\n", "scaffold1009\tWUBlastX\tDNA_TcMar-Tc2\t1790325\t1790603\t20\t+\t.\t.\r\n", "scaffold983\tWUBlastX\tDNA_TcMar-Tc1\t369636\t369770\t26\t-\t.\t.\r\n", "\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE.gff <==\r\n", "scaffold1009\tWUBlastX\tDNA_TcMar-Tc2\t1790325\t1790603\t20\t+\t.\t.\r\n", "scaffold983\tWUBlastX\tDNA_TcMar-Tc1\t369636\t369770\t26\t-\t.\t.\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TEx.gff <==\r\n", "scaffold1009\tWUBlastX\tDNA_TcMar-Tc2\t1790325\t1790603\t20\t+\t.\t.\r\n", "scaffold983\tWUBlastX\tDNA_TcMar-Tc1\t369636\t369770\t26\t-\t.\t.\r\n", "\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_exon.gff <==\r\n", "scaffold22\tGLEAN\tCDS\t1870289\t1870360\t.\t-\t0\tParent=CGI_10028939;\r\n", "scaffold22\tGLEAN\tCDS\t1869336\t1869428\t.\t-\t0\tParent=CGI_10028939;\r\n", "\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff <==\r\n", "scaffold22\tGLEAN\tmRNA\t1863760\t1864161\t0.544455\t+\t.\tID=CGI_10028938;\r\n", "scaffold22\tGLEAN\tmRNA\t1869336\t1885890\t0.999933\t-\t.\tID=CGI_10028939;\r\n", "\r\n", "==> /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_intron.gff <==\r\n", "scaffold999\tsubtractBed\tintrn\t124997\t126011\t.\t+\t.\tParent=CGI_10006973;\r", "\r\n", "scaffold999\tsubtractBed\tintrn\t126144\t126616\t.\t+\t.\tParent=CGI_10006973;\r", "\r\n" ] } ], "source": [ "!tail -2 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_*.gff" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 28023 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff\n", " 10035701 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_CG.gff\n", " 61319 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-TANDEMREPEAT.gff\n", " 58468 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff\n", " 119786 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE.gff\n", " 58468 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TEx.gff\n", " 196691 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_exon.gff\n", " 28027 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff\n", " 176049 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_intron.gff\n", " 10762532 total\n" ] } ], "source": [ "!wc -l /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_*.gff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"IGV_1AA0D065.png\"/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Comparison" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"Array-feature-overlap-02_1AA0D233.png\"/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Lets see if can take all array and intersect with Ensembl gff" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1373 GigaDB\tCDS\r\n", "1373 GigaDB\texon\r\n", "8468 GigaDB\tgene\r\n", "8468 GigaDB\ttranscript\r\n", "1240 dust\trepeat_region\r\n", " 975 trf\trepeat_region\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.2M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n", "| cut -f 6,7 \\\n", "| sort | uniq -c | sed '/#/d'\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2 EnsemblGenomes\texon\r\n", " 1 EnsemblGenomes\tpseudogenic_tRNA\r\n", " 1 EnsemblGenomes\ttRNA_gene\r\n", " 2 EnsemblGenomes\ttranscript\r\n", "1177 GigaDB\tCDS\r\n", "1177 GigaDB\texon\r\n", "8491 GigaDB\tgene\r\n", "8491 GigaDB\ttranscript\r\n", "1320 dust\trepeat_region\r\n", " 873 trf\trepeat_region\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.4M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n", "| cut -f 6,7 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1 EnsemblGenomes\texon\r\n", " 1 EnsemblGenomes\tsnRNA\r\n", " 1 EnsemblGenomes\tsnRNA_gene\r\n", " 947 GigaDB\tCDS\r\n", " 948 GigaDB\texon\r\n", "9689 GigaDB\tgene\r\n", "9689 GigaDB\ttranscript\r\n", "1591 dust\trepeat_region\r\n", " 864 trf\trepeat_region\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.6M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n", "| cut -f 6,7 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 5 EnsemblGenomes\tRNA\r\n", " 444 EnsemblGenomes\texon\r\n", " 6 EnsemblGenomes\tgene\r\n", " 2 EnsemblGenomes\tmiRNA\r\n", " 2 EnsemblGenomes\tmiRNA_gene\r\n", " 259 EnsemblGenomes\tpseudogenic_tRNA\r\n", " 14 EnsemblGenomes\tsnRNA\r\n", " 14 EnsemblGenomes\tsnRNA_gene\r\n", " 6 EnsemblGenomes\tsnoRNA\r\n", " 6 EnsemblGenomes\tsnoRNA_gene\r\n", " 152 EnsemblGenomes\ttRNA_gene\r\n", " 422 EnsemblGenomes\ttranscript\r\n", "157279 GigaDB\tCDS\r\n", "157307 GigaDB\texon\r\n", "600445 GigaDB\tgene\r\n", "600445 GigaDB\ttranscript\r\n", "56210 dust\trepeat_region\r\n", "42390 trf\trepeat_region\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a /Users/sr320/git-repos/paper-Temp-stress/ipynb/data/array-design/OID40453_probe_locations.gff \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.25.sorted.gff3 \\\n", "| cut -f 11,12 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TEs" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 383 WUBlastX\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.2M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n", "| cut -f 6 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 254 WUBlastX\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.4M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n", "| cut -f 6 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 168 WUBlastX\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.6M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n", "| cut -f 6 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10322 WUBlastX\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a /Users/sr320/git-repos/paper-Temp-stress/ipynb/data/array-design/OID40453_probe_locations.gff \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_TE-WUBLASTX.gff \\\n", "| cut -f 11 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Promoters" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 976 flankbed\tpromoter\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.2M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n", "| cut -f 6,7 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 992 flankbed\tpromoter\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.4M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n", "| cut -f 6,7 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1248 flankbed\tpromoter\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a ./data/2014.07.02.colson/genomeBrowserTracks/logFC_HS-preHS/2014.07.02.6M_sig.bedGraph \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n", "| cut -f 6,7 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "66368 flankbed\r\n" ] } ], "source": [ "!intersectbed \\\n", "-wb \\\n", "-a /Users/sr320/git-repos/paper-Temp-stress/ipynb/data/array-design/OID40453_probe_locations.gff \\\n", "-b /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_1k5p_gene_promoter.gff \\\n", "| cut -f 11 \\\n", "| sort | uniq -c | sed '/#/d'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Plot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"Book_1AA0DEB2.png\"/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "oys2\toys4\toys6\tProbes\n", "gene\t8468\t8491\t9689\t600445\n", "exon\t1373\t1177\t948\t157307\n", "intron\t7095\t7314\t8741\t443138\n", "dust repeat\t1240\t1320\t1591\t56210\n", "trf repeat\t975\t873\t864\t42390\n", "TE-blast\t383\t254\t168\t10322\n", "promoter\t976\t992\t1248\t66368\n", "```" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "# Analysis of one proportion" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://nbviewer.ipython.org/github/thomas-haslwanter/statsintro/blob/master/ipynb/70_compGroups.ipynb" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "source": [ "%pylab inline\n", "import scipy.stats as stats" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ONE PROPORTION\n", "The confidence interval for the given sample is 0.224 to 0.226\n" ] } ], "source": [ "# Get the data Probes exon\n", "numTotal = 697753\n", "numPositive = 157307\n", "\n", "# Calculate the confidence intervals\n", "p = float(numPositive)/numTotal\n", "se = sqrt(p*(1-p)/numTotal)\n", "td = stats.t(numTotal-1)\n", "ci = p + array([-1,1])*td.isf(0.025)*se\n", "\n", "# Print them\n", "print('ONE PROPORTION')\n", "print('The confidence interval for the given sample is {0:5.3f} to {1:5.3f}'.format(\n", " ci[0], ci[1]))\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Chi-square test to a 2x2 table\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CHI SQUARE\n", "The corrected chi2 value is 47.663, with p=0.000\n", "The uncorrected chi2 value is 47.772, with p=0.000\n" ] } ], "source": [ "# Enter the data comparing Oyster 2; probes at intron\n", "obs = array([[7095, 10028], [443138, 697753]])\n", "\n", "# Calculate the chi-square test\n", "chi2_corrected = stats.chi2_contingency(obs, correction=True)\n", "chi2_uncorrected = stats.chi2_contingency(obs, correction=False)\n", "\n", "# Print the result\n", "print('CHI SQUARE')\n", "print('The corrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_corrected[0], chi2_corrected[1]))\n", "print('The uncorrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_uncorrected[0], chi2_uncorrected[1]))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CHI SQUARE\n", "The corrected chi2 value is 1.597, with p=0.206\n", "The uncorrected chi2 value is 1.616, with p=0.204\n" ] } ], "source": [ "# Enter the data comparing Oyster 2; probes at gene\n", "obs = array([[8468, 10028], [600445, 697753]])\n", "\n", "# Calculate the chi-square test\n", "chi2_corrected = stats.chi2_contingency(obs, correction=True)\n", "chi2_uncorrected = stats.chi2_contingency(obs, correction=False)\n", "\n", "# Print the result\n", "print('CHI SQUARE')\n", "print('The corrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_corrected[0], chi2_corrected[1]))\n", "print('The uncorrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(chi2_uncorrected[0], chi2_uncorrected[1]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }