{
 "cells": [
  { "cell_type": "markdown", "metadata": {}, "source": [ "# Defining RNA-seq (gene function) based Tracks" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [
    "**tldr** 4 \"new\" tracks\n",
    "\"IGV_and_Directory_Listing_of__halfshell_2015-02-hs-bedgraph__1AA51F1B.png\"/\n",
    "```\n",
    "/Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf\n",
    "/Users/sr320/data-genomic/tentacle/rebuilt.gtf\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-housekeeping.gff\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-env-response.gff\n",
    "```" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "### Diff Exp Genes" ] },
  { "cell_type": "code", "execution_count": 96, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "scaffold992\tCufflinks\tCDS\t9669\t9825\t.\t+\t.\tgene_id XLOC_036046; tss_id \"TSS54716\"; nearest_ref \"EKC31279\"; oId \"CUFF.28745.1\"; exon_number \"2\"; class_code \"x\"; gene_name \"CGI_10007500\"; transcript_id \"TCONS_00075736\"\r",
    "\r\n",
    "scaffold992\tCufflinks\texon\t10327\t10415\t.\t+\t.\tgene_id XLOC_036046; tss_id \"TSS54716\"; nearest_ref \"EKC31279\"; oId \"CUFF.28745.1\"; exon_number \"3\"; class_code \"x\"; gene_name \"CGI_10007500\"; transcript_id \"TCONS_00075736\"\r",
    "\r\n",
    "scaffold992\tCufflinks\tCDS\t10327\t10415\t.\t+\t.\tgene_id XLOC_036046; tss_id \"TSS54716\"; nearest_ref \"EKC31279\"; oId \"CUFF.28745.1\"; exon_number \"3\"; class_code \"x\"; gene_name \"CGI_10007500\"; transcript_id \"TCONS_00075736\"\r",
    "\r\n" ] } ],
   "source": [
    "#Track with DEGs defined by Cuffdiff\n",
    "#how derived = {RNA-seq-Gene-ID}\n",
    "!tail -3 /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf" ] },
  { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    " 122038 /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf\r\n" ] } ],
   "source": [ "!wc -l /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "### New GTF from Cuffdiff" ] },
  { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "scrolled": true },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "C12764\tCufflinks\texon\t28\t201\t.\t.\t.\tgene_id XLOC_000001; tss_id \"TSS1\"; oId \"CUFF.1.1\"; exon_number \"1\"; class_code \"u\"; transcript_id \"TCONS_00000001\"\r\n",
    "C12764\tCufflinks\tCDS\t28\t201\t.\t.\t.\tgene_id XLOC_000001; tss_id \"TSS1\"; oId \"CUFF.1.1\"; exon_number \"1\"; class_code \"u\"; transcript_id \"TCONS_00000001\"\r\n",
    "C12768\tCufflinks\texon\t4\t189\t.\t.\t.\tgene_id XLOC_000002; tss_id \"TSS2\"; oId \"CUFF.2.1\"; exon_number \"1\"; class_code \"u\"; transcript_id \"TCONS_00000002\"\r\n" ] } ],
   "source": [
    "#GTF produced from Cuffdiff \n",
    "#see /Volumes/web/halfshell/BS-heat/Cuffdiff2_heat-b-2014-12-20-22-27-15.4\n",
    "!head -3 /Users/sr320/data-genomic/tentacle/rebuilt.gtf" ] },
  { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    " 1347244 /Users/sr320/data-genomic/tentacle/rebuilt.gtf\r\n" ] } ],
   "source": [ "!wc -l /Users/sr320/data-genomic/tentacle/rebuilt.gtf" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "### GigaDB gene tracks - Isolated Housekeeping and Environment Stress Genes " ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "\"sh_1AA50F63.png\"/" ] },
  { "cell_type": "markdown", "metadata": {}, "source": [ "_Based on annotation from 10.3389/fphys.2011.00116 (see image above)_" ] },
  { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\r\n",
    "C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\r\n",
    "C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\r\n" ] } ],
   "source": [ "!head -3 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff" ] },
  { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    " 28027 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff\r\n" ] } ],
   "source": [ "!wc -l /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff" ] },
  { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "CGI_10000001\r\n",
    "CGI_10000002\r\n",
    "CGI_10000003\r\n",
    "CGI_10000004\r\n",
    "CGI_10000005\r\n",
    "CGI_10000009\r\n",
    "CGI_10000010\r\n",
    "CGI_10000011\r\n",
    "CGI_10000012\r\n",
    "CGI_10000013\r\n" ] } ],
   "source": [
    "#adding extra CGI column to join GO info on\n",
    "#column 9 of the GFF looks like 'ID=CGI_10000001;' -- keep only the bare gene ID, one per row.\n",
    "#The field separator is quoted so the shell cannot glob-expand it, and the sed expressions are\n",
    "#anchored so only a leading 'ID=' and a trailing ';' are ever removed.\n",
    "!awk -F'\\t' '{print $9}' /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff \\\n",
    "| sed -e 's/^ID=//' -e 's/;$//' > \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi\n",
    "!head /Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi" ] },
  { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    " 28027 /Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi\r\n" ] } ],
   "source": [
    "#sanity check: the ID file must have the same number of lines as the source GFF (28027)\n",
    "!wc -l /Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi" ] },
  { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true },
   "outputs": [],
   "source": [
    "#append the bare gene ID to each GFF row as a 10th tab-separated column\n",
    "!paste /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi \\\n",
    "> /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab" ] },
  { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\r\n",
    "C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\r\n",
    "C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\r\n",
    "C17476\tGLEAN\tmRNA\t34\t257\t0.998947\t-\t.\tID=CGI_10000004;\tCGI_10000004\r\n",
    "C17998\tGLEAN\tmRNA\t196\t387\t1\t-\t.\tID=CGI_10000005;\tCGI_10000005\r\n",
    "C18346\tGLEAN\tmRNA\t174\t551\t1\t+\t.\tID=CGI_10000009;\tCGI_10000009\r\n",
    "C18428\tGLEAN\tmRNA\t286\t546\t0.555898\t-\t.\tID=CGI_10000010;\tCGI_10000010\r\n",
    "C18964\tGLEAN\tmRNA\t203\t658\t0.999572\t-\t.\tID=CGI_10000011;\tCGI_10000011\r\n",
    "C18980\tGLEAN\tmRNA\t30\t674\t0.555898\t+\t.\tID=CGI_10000012;\tCGI_10000012\r\n",
    "C19100\tGLEAN\tmRNA\t160\t681\t0.999955\t-\t.\tID=CGI_10000013;\tCGI_10000013\r\n" ] } ],
   "source": [ "!head /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab" ] },
  { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true },
   "outputs": [],
   "source": [ "sqls=\"/Applications/bioinfo/sqlshare-pythonclient/tools/\"" ] },
  { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "processing chunk line 0 to 28027 (0.00476694107056 s elapsed)\n",
    "pushing /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab...\n",
    "parsing 0863C50E...\n",
    "finished Cgigas_v9_gene--ID\n" ] } ],
   "source": [
    "!python {sqls}singleupload.py \\\n",
    "-d Cgigas_v9_gene--ID \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab" ] },
  { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "#left join GOslim bins onto the gene table by gene ID: one output row per gene/GOslim pair,\n",
    "#and genes without any GOslim annotation are kept with empty GO columns\n",
    "!python {sqls}fetchdata.py \\\n",
    "-s \"SELECT * \\\n",
    "FROM [sr320@washington.edu].[Cgigas_v9_gene--ID]md \\\n",
    "left join \\\n",
    "[sr320@washington.edu].[qDOD_Cgigas_GOslim_DISTINCT]go on md.Column10=go.CGI_ID\" \\\n",
    "-f tsv \\\n",
    "-o /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab" ] },
  { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "Column1\tColumn2\tColumn3\tColumn4\tColumn5\tColumn6\tColumn7\tColumn8\tColumn9\tColumn10\tCGI_ID\tGOslim_bin\taspect\r",
    "\r\n",
    "C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\tCGI_10000001\tother biological processes\tP\r",
    "\r\n",
    "C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\tCGI_10000001\tother cellular component\tC\r",
    "\r\n",
    "C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\tCGI_10000001\tother molecular function\tF\r",
    "\r\n",
    "C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\tCGI_10000002\tcytoskeleton\tC\r",
    "\r\n",
    "C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\tCGI_10000002\tother cellular component\tC\r",
    "\r\n",
    "C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\tCGI_10000002\tother molecular function\tF\r",
    "\r\n",
    "C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tnon-structural extracellular\tC\r",
    "\r\n",
    "C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tsignal transduction\tP\r",
    "\r\n",
    "C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tsignal transduction activity\tF\r",
    "\r\n" ] } ],
   "source": [ "!head /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab" ] },
  { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "scaffold117\tGLEAN\tmRNA\t381424\t383832\t0.991601\t+\t.\tID=CGI_10016969;\tCGI_10016969\t\t\t\r",
    "\r\n",
    "scaffold315\tGLEAN\tmRNA\t483272\t485643\t0.994136\t-\t.\tID=CGI_10020430;\tCGI_10020430\t\t\t\r",
    "\r\n",
    "scaffold588\tGLEAN\tmRNA\t248679\t258989\t0.997309\t-\t.\tID=CGI_10016218;\tCGI_10016218\t\t\t\r",
    "\r\n" ] } ],
   "source": [ "!tail -3 /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab" ] },
  { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "C18964\tGLEAN\tmRNA\t203\t658\t0.999572\t-\t.\tID=CGI_10000011;\tCGI_10000011\tCGI_10000011\tprotein metabolism\tP\r",
    "\r\n",
    "C18980\tGLEAN\tmRNA\t30\t674\t0.555898\t+\t.\tID=CGI_10000012;\tCGI_10000012\tCGI_10000012\tprotein metabolism\tP\r",
    "\r\n",
    "C18980\tGLEAN\tmRNA\t30\t674\t0.555898\t+\t.\tID=CGI_10000012;\tCGI_10000012\tCGI_10000012\tRNA metabolism\tP\r",
    "\r\n",
    "C19100\tGLEAN\tmRNA\t160\t681\t0.999955\t-\t.\tID=CGI_10000013;\tCGI_10000013\tCGI_10000013\tprotein metabolism\tP\r",
    "\r\n",
    "C19392\tGLEAN\tmRNA\t46\t610\t1\t+\t.\tID=CGI_10000015;\tCGI_10000015\tCGI_10000015\tprotein metabolism\tP\r",
    "\r\n",
    "C19392\tGLEAN\tmRNA\t46\t610\t1\t+\t.\tID=CGI_10000015;\tCGI_10000015\tCGI_10000015\tRNA metabolism\tP\r",
    "\r\n",
    "C20188\tGLEAN\tmRNA\t437\t967\t0.999572\t-\t.\tID=CGI_10000024;\tCGI_10000024\tCGI_10000024\tprotein metabolism\tP\r",
    "\r\n",
    "C20462\tGLEAN\tmRNA\t3\t871\t1\t+\t.\tID=CGI_10000030;\tCGI_10000030\tCGI_10000030\tRNA metabolism\tP\r",
    "\r\n",
    "C20524\tGLEAN\tmRNA\t6\t1100\t1\t-\t.\tID=CGI_10000033;\tCGI_10000033\tCGI_10000033\tprotein metabolism\tP\r",
    "\r\n",
    "C20582\tGLEAN\tmRNA\t75\t980\t0.555898\t+\t.\tID=CGI_10000035;\tCGI_10000035\tCGI_10000035\tRNA metabolism\tP\r",
    "\r\n" ] } ],
   "source": [
    "!grep 'DNA metabolism\\|RNA metabolism\\|protein metabolism' \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab | head " ] },
  { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tsignal transduction\tP\r\n",
    "C20480\tGLEAN\tmRNA\t367\t1037\t0.999572\t-\t.\tID=CGI_10000032;\tCGI_10000032\tCGI_10000032\tsignal transduction\tP\r\n",
    "C20578\tGLEAN\tmRNA\t699\t950\t0.555898\t+\t.\tID=CGI_10000034;\tCGI_10000034\tCGI_10000034\tsignal transduction\tP\r\n",
    "C22046\tGLEAN\tmRNA\t98\t1281\t1\t+\t.\tID=CGI_10000069;\tCGI_10000069\tCGI_10000069\tcell adhesion\tP\r\n",
    "C22046\tGLEAN\tmRNA\t98\t1281\t1\t+\t.\tID=CGI_10000069;\tCGI_10000069\tCGI_10000069\tsignal transduction\tP\r\n",
    "C22798\tGLEAN\tmRNA\t433\t1785\t1\t+\t.\tID=CGI_10000088;\tCGI_10000088\tCGI_10000088\tsignal transduction\tP\r\n",
    "C23676\tGLEAN\tmRNA\t34\t2210\t1\t+\t.\tID=CGI_10000145;\tCGI_10000145\tCGI_10000145\tsignal transduction\tP\r\n",
    "scaffold1370\tGLEAN\tmRNA\t642\t1238\t1\t-\t.\tID=CGI_10000165;\tCGI_10000165\tCGI_10000165\tsignal transduction\tP\r\n",
    "scaffold1370\tGLEAN\tmRNA\t1243\t2469\t0.999414\t-\t.\tID=CGI_10000166;\tCGI_10000166\tCGI_10000166\tsignal transduction\tP\r\n",
    "C24232\tGLEAN\tmRNA\t589\t2415\t1\t-\t.\tID=CGI_10000183;\tCGI_10000183\tCGI_10000183\tsignal transduction\tP\r\n" ] } ],
   "source": [
    "%%bash\n",
    "grep --color 'cell-cell signaling\\|signal transduction\\|cell adhesion' \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
    "| grep -v \"signal transduction activity\tF\" \\\n",
    "| head" ] },
  { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    " 666 DNA metabolism\r\n",
    "2452 RNA metabolism\r\n",
    "3737 protein metabolism\r\n" ] } ],
   "source": [
    "#QC\n",
    "!grep 'DNA metabolism\\|RNA metabolism\\|protein metabolism' \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
    "| cut -f 12 | sort | uniq -c " ] },
  { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": false },
   "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
    "1069 cell adhesion\n",
    " 478 cell-cell signaling\n",
    "3001 signal transduction\n" ] } ],
   "source": [
    "#QC\n",
    "!grep 'cell-cell signaling\\|signal transduction\\|cell adhesion' \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
    "| grep -v \"signal transduction activity\tF\" \\\n",
    "| cut -f 12 | sort | uniq -c " ] },
  { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Housekeeping track: GFF columns 1-9 of every row matching the DNA, RNA or protein\n",
    "# metabolism GOslim bins. The joined table has one row per gene/GOslim pair, so a gene in\n",
    "# two of these bins (e.g. CGI_10000012: protein metabolism + RNA metabolism) would be\n",
    "# written twice; the awk filter keeps the first copy of each feature line and preserves\n",
    "# file order. Run under %%bash so IPython does not try to expand $0.\n",
    "grep -E 'DNA metabolism|RNA metabolism|protein metabolism' \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
    "| cut -f 1-9 | awk '!seen[$0]++' > /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-housekeeping.gff" ] },
  { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": true },
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Environment-response track: rows matching cell-cell signaling, signal transduction or\n",
    "# cell adhesion, minus the 'signal transduction activity' (aspect F) rows that the second\n",
    "# pattern also matches. Genes in more than one of these bins (e.g. CGI_10000069: cell\n",
    "# adhesion + signal transduction) are collapsed to a single feature line, in file order.\n",
    "grep -E 'cell-cell signaling|signal transduction|cell adhesion' \\\n",
    "/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
    "| grep -v \"signal transduction activity\tF\" \\\n",
    "| cut -f 1-9 | awk '!seen[$0]++' > /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-env-response.gff" ] },
  { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}