{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Defining RNA-seq (gene function) based Tracks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**TL;DR:** this notebook defines four \"new\" tracks:\n",
"\n",
"```\n",
"/Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf\n",
"/Users/sr320/data-genomic/tentacle/rebuilt.gtf\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-housekeeping.gff\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-env-response.gff\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Diff Exp Genes"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scaffold992\tCufflinks\tCDS\t9669\t9825\t.\t+\t.\tgene_id XLOC_036046; tss_id \"TSS54716\"; nearest_ref \"EKC31279\"; oId \"CUFF.28745.1\"; exon_number \"2\"; class_code \"x\"; gene_name \"CGI_10007500\"; transcript_id \"TCONS_00075736\"\r",
"\r\n",
"scaffold992\tCufflinks\texon\t10327\t10415\t.\t+\t.\tgene_id XLOC_036046; tss_id \"TSS54716\"; nearest_ref \"EKC31279\"; oId \"CUFF.28745.1\"; exon_number \"3\"; class_code \"x\"; gene_name \"CGI_10007500\"; transcript_id \"TCONS_00075736\"\r",
"\r\n",
"scaffold992\tCufflinks\tCDS\t10327\t10415\t.\t+\t.\tgene_id XLOC_036046; tss_id \"TSS54716\"; nearest_ref \"EKC31279\"; oId \"CUFF.28745.1\"; exon_number \"3\"; class_code \"x\"; gene_name \"CGI_10007500\"; transcript_id \"TCONS_00075736\"\r",
"\r\n"
]
}
],
"source": [
"# Track with differentially expressed genes (DEGs) defined by Cuffdiff.\n",
"# How derived = {RNA-seq-Gene-ID}  (NOTE(review): presumably the notebook that built this file; confirm)\n",
"# Show the last three records of the track.\n",
"!tail -3 /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 122038 /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf\r\n"
]
}
],
"source": [
"# Count the lines in the Cuffdiff significant-gene-expression GTF.\n",
"!wc -l /Users/sr320/data-genomic/tentacle/Cuffdiff_geneexp.sig.gtf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### New GTF from Cuffdiff"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C12764\tCufflinks\texon\t28\t201\t.\t.\t.\tgene_id XLOC_000001; tss_id \"TSS1\"; oId \"CUFF.1.1\"; exon_number \"1\"; class_code \"u\"; transcript_id \"TCONS_00000001\"\r\n",
"C12764\tCufflinks\tCDS\t28\t201\t.\t.\t.\tgene_id XLOC_000001; tss_id \"TSS1\"; oId \"CUFF.1.1\"; exon_number \"1\"; class_code \"u\"; transcript_id \"TCONS_00000001\"\r\n",
"C12768\tCufflinks\texon\t4\t189\t.\t.\t.\tgene_id XLOC_000002; tss_id \"TSS2\"; oId \"CUFF.2.1\"; exon_number \"1\"; class_code \"u\"; transcript_id \"TCONS_00000002\"\r\n"
]
}
],
"source": [
"# GTF produced from Cuffdiff;\n",
"# see /Volumes/web/halfshell/BS-heat/Cuffdiff2_heat-b-2014-12-20-22-27-15.4\n",
"# Show the first three records of the file.\n",
"!head -3 /Users/sr320/data-genomic/tentacle/rebuilt.gtf"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1347244 /Users/sr320/data-genomic/tentacle/rebuilt.gtf\r\n"
]
}
],
"source": [
"# Count the lines in rebuilt.gtf.\n",
"!wc -l /Users/sr320/data-genomic/tentacle/rebuilt.gtf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### GigaDB gene tracks - Isolated Housekeeping and Environment Stress Genes "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"_Based on annotation from [doi:10.3389/fphys.2011.00116](https://doi.org/10.3389/fphys.2011.00116) (see image above)_"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\r\n",
"C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\r\n",
"C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\r\n"
]
}
],
"source": [
"# Preview the first three records of the Cgigas v9 gene track.\n",
"# In the records shown, column 9 carries the gene ID in the form ID=CGI_...;\n",
"!head -3 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 28027 /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff\r\n"
]
}
],
"source": [
"# Count the lines in the Cgigas v9 gene track.\n",
"!wc -l /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CGI_10000001\r\n",
"CGI_10000002\r\n",
"CGI_10000003\r\n",
"CGI_10000004\r\n",
"CGI_10000005\r\n",
"CGI_10000009\r\n",
"CGI_10000010\r\n",
"CGI_10000011\r\n",
"CGI_10000012\r\n",
"CGI_10000013\r\n"
]
}
],
"source": [
"%%bash\n",
"# Build a one-column file of bare gene IDs (CGI_...) to use as the key for joining GO info.\n",
"# Column 9 of the GFF looks like \"ID=CGI_10000001;\": strip the leading \"ID=\" and the\n",
"# trailing \";\" explicitly rather than dropping whatever the last character happens to be.\n",
"# One ID is printed per input line, so the output stays row-aligned with the GFF.\n",
"# Run under %%bash so IPython does not try to expand the awk braces or $-fields.\n",
"set -euo pipefail\n",
"awk -F'\\t' '{ id = $9; sub(/^ID=/, \"\", id); sub(/;$/, \"\", id); print id }' \\\n",
"/Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff \\\n",
"> /Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi\n",
"# Preview the first ten IDs.\n",
"head /Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 28027 /Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi\r\n"
]
}
],
"source": [
"# Count the lines in the gene-ID list.\n",
"!wc -l /Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Append the bare gene ID as an extra tab-separated column by pasting the GFF and the\n",
"# ID list together line by line; this relies on both files having the same number of\n",
"# lines in the same order.\n",
"!paste /Volumes/web-1/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_gene.gff \\\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_cgi \\\n",
"> /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\r\n",
"C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\r\n",
"C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\r\n",
"C17476\tGLEAN\tmRNA\t34\t257\t0.998947\t-\t.\tID=CGI_10000004;\tCGI_10000004\r\n",
"C17998\tGLEAN\tmRNA\t196\t387\t1\t-\t.\tID=CGI_10000005;\tCGI_10000005\r\n",
"C18346\tGLEAN\tmRNA\t174\t551\t1\t+\t.\tID=CGI_10000009;\tCGI_10000009\r\n",
"C18428\tGLEAN\tmRNA\t286\t546\t0.555898\t-\t.\tID=CGI_10000010;\tCGI_10000010\r\n",
"C18964\tGLEAN\tmRNA\t203\t658\t0.999572\t-\t.\tID=CGI_10000011;\tCGI_10000011\r\n",
"C18980\tGLEAN\tmRNA\t30\t674\t0.555898\t+\t.\tID=CGI_10000012;\tCGI_10000012\r\n",
"C19100\tGLEAN\tmRNA\t160\t681\t0.999955\t-\t.\tID=CGI_10000013;\tCGI_10000013\r\n"
]
}
],
"source": [
"# Preview the first ten rows of the GFF with the appended gene-ID column.\n",
"!head /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Directory holding the SQLShare Python client scripts; interpolated as {sqls} in the\n",
"# `!python {sqls}singleupload.py` and `!python {sqls}fetchdata.py` commands.\n",
"sqls = '/Applications/bioinfo/sqlshare-pythonclient/tools/'"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"processing chunk line 0 to 28027 (0.00476694107056 s elapsed)\n",
"pushing /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab...\n",
"parsing 0863C50E...\n",
"finished Cgigas_v9_gene--ID\n"
]
}
],
"source": [
"# Upload the gene/ID table with the SQLShare client's singleupload.py.\n",
"# -d presumably sets the dataset name (the run reports \"finished Cgigas_v9_gene--ID\"); confirm against the client docs.\n",
"!python {sqls}singleupload.py \\\n",
"-d Cgigas_v9_gene--ID \\\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID.tab"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Left-join the uploaded gene table (alias md) to the GOslim table (alias go) on the\n",
"# bare gene ID (md.Column10 = go.CGI_ID) and save the result as a TSV file.\n",
"# Because it is a LEFT JOIN, genes without a GOslim match are kept with empty GO columns,\n",
"# and genes with several GOslim terms yield one output row per term.\n",
"!python {sqls}fetchdata.py \\\n",
"-s \"SELECT * \\\n",
"FROM [sr320@washington.edu].[Cgigas_v9_gene--ID]md \\\n",
"left join \\\n",
"[sr320@washington.edu].[qDOD_Cgigas_GOslim_DISTINCT]go on md.Column10=go.CGI_ID\" \\\n",
"-f tsv \\\n",
"-o /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column1\tColumn2\tColumn3\tColumn4\tColumn5\tColumn6\tColumn7\tColumn8\tColumn9\tColumn10\tCGI_ID\tGOslim_bin\taspect\r",
"\r\n",
"C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\tCGI_10000001\tother biological processes\tP\r",
"\r\n",
"C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\tCGI_10000001\tother cellular component\tC\r",
"\r\n",
"C16582\tGLEAN\tmRNA\t35\t385\t0.555898\t-\t.\tID=CGI_10000001;\tCGI_10000001\tCGI_10000001\tother molecular function\tF\r",
"\r\n",
"C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\tCGI_10000002\tcytoskeleton\tC\r",
"\r\n",
"C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\tCGI_10000002\tother cellular component\tC\r",
"\r\n",
"C17212\tGLEAN\tmRNA\t31\t363\t0.999572\t+\t.\tID=CGI_10000002;\tCGI_10000002\tCGI_10000002\tother molecular function\tF\r",
"\r\n",
"C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tnon-structural extracellular\tC\r",
"\r\n",
"C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tsignal transduction\tP\r",
"\r\n",
"C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tsignal transduction activity\tF\r",
"\r\n"
]
}
],
"source": [
"# Preview the first ten lines (including the header row) of the gene/GOslim join.\n",
"!head /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scaffold117\tGLEAN\tmRNA\t381424\t383832\t0.991601\t+\t.\tID=CGI_10016969;\tCGI_10016969\t\t\t\r",
"\r\n",
"scaffold315\tGLEAN\tmRNA\t483272\t485643\t0.994136\t-\t.\tID=CGI_10020430;\tCGI_10020430\t\t\t\r",
"\r\n",
"scaffold588\tGLEAN\tmRNA\t248679\t258989\t0.997309\t-\t.\tID=CGI_10016218;\tCGI_10016218\t\t\t\r",
"\r\n"
]
}
],
"source": [
"# Show the last three rows of the gene/GOslim join.\n",
"!tail -3 /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C18964\tGLEAN\tmRNA\t203\t658\t0.999572\t-\t.\tID=CGI_10000011;\tCGI_10000011\tCGI_10000011\tprotein metabolism\tP\r",
"\r\n",
"C18980\tGLEAN\tmRNA\t30\t674\t0.555898\t+\t.\tID=CGI_10000012;\tCGI_10000012\tCGI_10000012\tprotein metabolism\tP\r",
"\r\n",
"C18980\tGLEAN\tmRNA\t30\t674\t0.555898\t+\t.\tID=CGI_10000012;\tCGI_10000012\tCGI_10000012\tRNA metabolism\tP\r",
"\r\n",
"C19100\tGLEAN\tmRNA\t160\t681\t0.999955\t-\t.\tID=CGI_10000013;\tCGI_10000013\tCGI_10000013\tprotein metabolism\tP\r",
"\r\n",
"C19392\tGLEAN\tmRNA\t46\t610\t1\t+\t.\tID=CGI_10000015;\tCGI_10000015\tCGI_10000015\tprotein metabolism\tP\r",
"\r\n",
"C19392\tGLEAN\tmRNA\t46\t610\t1\t+\t.\tID=CGI_10000015;\tCGI_10000015\tCGI_10000015\tRNA metabolism\tP\r",
"\r\n",
"C20188\tGLEAN\tmRNA\t437\t967\t0.999572\t-\t.\tID=CGI_10000024;\tCGI_10000024\tCGI_10000024\tprotein metabolism\tP\r",
"\r\n",
"C20462\tGLEAN\tmRNA\t3\t871\t1\t+\t.\tID=CGI_10000030;\tCGI_10000030\tCGI_10000030\tRNA metabolism\tP\r",
"\r\n",
"C20524\tGLEAN\tmRNA\t6\t1100\t1\t-\t.\tID=CGI_10000033;\tCGI_10000033\tCGI_10000033\tprotein metabolism\tP\r",
"\r\n",
"C20582\tGLEAN\tmRNA\t75\t980\t0.555898\t+\t.\tID=CGI_10000035;\tCGI_10000035\tCGI_10000035\tRNA metabolism\tP\r",
"\r\n"
]
}
],
"source": [
"# Preview the first ten rows that mention DNA, RNA or protein metabolism.\n",
"!grep -e 'DNA metabolism' -e 'RNA metabolism' -e 'protein metabolism' \\\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
"| head"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C17316\tGLEAN\tmRNA\t30\t257\t0.555898\t+\t.\tID=CGI_10000003;\tCGI_10000003\tCGI_10000003\tsignal transduction\tP\r\n",
"C20480\tGLEAN\tmRNA\t367\t1037\t0.999572\t-\t.\tID=CGI_10000032;\tCGI_10000032\tCGI_10000032\tsignal transduction\tP\r\n",
"C20578\tGLEAN\tmRNA\t699\t950\t0.555898\t+\t.\tID=CGI_10000034;\tCGI_10000034\tCGI_10000034\tsignal transduction\tP\r\n",
"C22046\tGLEAN\tmRNA\t98\t1281\t1\t+\t.\tID=CGI_10000069;\tCGI_10000069\tCGI_10000069\tcell adhesion\tP\r\n",
"C22046\tGLEAN\tmRNA\t98\t1281\t1\t+\t.\tID=CGI_10000069;\tCGI_10000069\tCGI_10000069\tsignal transduction\tP\r\n",
"C22798\tGLEAN\tmRNA\t433\t1785\t1\t+\t.\tID=CGI_10000088;\tCGI_10000088\tCGI_10000088\tsignal transduction\tP\r\n",
"C23676\tGLEAN\tmRNA\t34\t2210\t1\t+\t.\tID=CGI_10000145;\tCGI_10000145\tCGI_10000145\tsignal transduction\tP\r\n",
"scaffold1370\tGLEAN\tmRNA\t642\t1238\t1\t-\t.\tID=CGI_10000165;\tCGI_10000165\tCGI_10000165\tsignal transduction\tP\r\n",
"scaffold1370\tGLEAN\tmRNA\t1243\t2469\t0.999414\t-\t.\tID=CGI_10000166;\tCGI_10000166\tCGI_10000166\tsignal transduction\tP\r\n",
"C24232\tGLEAN\tmRNA\t589\t2415\t1\t-\t.\tID=CGI_10000183;\tCGI_10000183\tCGI_10000183\tsignal transduction\tP\r\n"
]
}
],
"source": [
"%%bash\n",
"# Preview the first ten rows that mention cell-cell signaling, signal transduction or\n",
"# cell adhesion. The 'signal transduction' pattern also hits 'signal transduction\n",
"# activity' rows (aspect F), so those are filtered back out.\n",
"grep --color -e 'cell-cell signaling' -e 'signal transduction' -e 'cell adhesion' \\\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
"| grep -v \"signal transduction activity\tF\" \\\n",
"| head"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 666 DNA metabolism\r\n",
"2452 RNA metabolism\r\n",
"3737 protein metabolism\r\n"
]
}
],
"source": [
"# QC: number of matching rows per GOslim term (column 12).\n",
"!grep -e 'DNA metabolism' -e 'RNA metabolism' -e 'protein metabolism' \\\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
"| cut -f 12 \\\n",
"| sort \\\n",
"| uniq -c"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1069 cell adhesion\n",
" 478 cell-cell signaling\n",
"3001 signal transduction\n"
]
}
],
"source": [
"# QC: number of matching rows per GOslim term (column 12), after dropping the\n",
"# 'signal transduction activity' (aspect F) rows that the broader pattern also hits.\n",
"!grep -e 'cell-cell signaling' -e 'signal transduction' -e 'cell adhesion' \\\n",
"/Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
"| grep -v \"signal transduction activity\tF\" \\\n",
"| cut -f 12 \\\n",
"| sort \\\n",
"| uniq -c"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%bash\n",
"# Housekeeping track: genes whose GOslim bin (column 12) is exactly DNA, RNA or protein\n",
"# metabolism, written as the original nine GFF columns.\n",
"# The joined table has one row per (gene, GOslim term), so a gene annotated with more than\n",
"# one of these terms would otherwise be written several times as an identical feature;\n",
"# only the first copy of each feature is kept, preserving input order.\n",
"# Run under %%bash so IPython does not try to expand the awk braces or $-fields.\n",
"set -eu\n",
"awk -F'\\t' -v OFS='\\t' '\n",
"  $12 == \"DNA metabolism\" || $12 == \"RNA metabolism\" || $12 == \"protein metabolism\" {\n",
"    feature = $1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9\n",
"    if (!seen[feature]++) print feature\n",
"  }\n",
"' /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
"> /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-housekeeping.gff"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%bash\n",
"# Environmental-response track: genes whose GOslim bin (column 12) is exactly cell-cell\n",
"# signaling, signal transduction or cell adhesion, written as the original nine GFF columns.\n",
"# Matching column 12 exactly (instead of grepping the whole line) already excludes the\n",
"# 'signal transduction activity' (aspect F) rows, so no follow-up grep -v is needed.\n",
"# The joined table has one row per (gene, GOslim term), so a gene annotated with more than\n",
"# one of these terms would otherwise be written several times as an identical feature;\n",
"# only the first copy of each feature is kept, preserving input order.\n",
"# Run under %%bash so IPython does not try to expand the awk braces or $-fields.\n",
"set -eu\n",
"awk -F'\\t' -v OFS='\\t' '\n",
"  $12 == \"cell-cell signaling\" || $12 == \"signal transduction\" || $12 == \"cell adhesion\" {\n",
"    feature = $1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9\n",
"    if (!seen[feature]++) print feature\n",
"  }\n",
"' /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene--ID--GOslim.tab \\\n",
"> /Users/sr320/data-genomic/tentacle/Cgigas_v9_gene-env-response.gff"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}