{ "metadata": { "name": "", "signature": "sha256:68ad69cf7f9dd05d22c51f4c12993711a6ab1b504ffa2fe5094d6088471d78cd" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Annotation of Genes associated with Differential Methylation - E2 Exposure Trial" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cd Volumes/web/cnidarian/" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/web/cnidarian\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "!curl -O http://eagle.fish.washington.edu/bivalvia/array/genesWithDiffMeth.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\r\n", " Dload Upload Total Spent Left Speed\r\n", "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 244k 100 244k 0 0 1723k 0 --:--:-- --:--:-- --:--:-- 1835k\r\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "!wc /Volumes/web/cnidarian/genesWithDiffMeth.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 166 166 250002 /Volumes/web/cnidarian/genesWithDiffMeth.fasta\r\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "!fgrep -c \">\" /Volumes/web/cnidarian/genesWithDiffMeth.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "83\r\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!blastn -h" ], "language": "python", "metadata": {}, "outputs": [ {
"output_type": "stream", "stream": "stdout", "text": [ "USAGE\r\n", " blastn [-h] [-help] [-import_search_strategy filename]\r\n", " [-export_search_strategy filename] [-task task_name] [-db database_name]\r\n", " [-dbsize num_letters] [-gilist filename] [-seqidlist filename]\r\n", " [-negative_gilist filename] [-entrez_query entrez_query]\r\n", " [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]\r\n", " [-subject subject_input_file] [-subject_loc range] [-query input_file]\r\n", " [-out output_file] [-evalue evalue] [-word_size int_value]\r\n", " [-gapopen open_penalty] [-gapextend extend_penalty]\r\n", " [-perc_identity float_value] [-xdrop_ungap float_value]\r\n", " [-xdrop_gap float_value] [-xdrop_gap_final float_value]\r\n", " [-searchsp int_value] [-max_hsps_per_subject int_value] [-penalty penalty]\r\n", " [-reward reward] [-no_greedy] [-min_raw_gapped_score int_value]\r\n", " [-template_type type] [-template_length int_value] [-dust DUST_options]\r\n", " [-filtering_db filtering_database]\r\n", " [-window_masker_taxid window_masker_taxid]\r\n", " [-window_masker_db window_masker_db] [-soft_masking soft_masking]\r\n", " [-ungapped] [-culling_limit int_value] [-best_hit_overhang float_value]\r\n", " [-best_hit_score_edge float_value] [-window_size int_value]\r\n", " [-off_diagonal_range int_value] [-use_index boolean] [-index_name string]\r\n", " [-lcase_masking] [-query_loc range] [-strand strand] [-parse_deflines]\r\n", " [-outfmt format] [-show_gis] [-num_descriptions int_value]\r\n", " [-num_alignments int_value] [-html] [-max_target_seqs num_sequences]\r\n", " [-num_threads int_value] [-remote] [-version]\r\n", "\r\n", "DESCRIPTION\r\n", " Nucleotide-Nucleotide BLAST 2.2.28+\r\n", "\r\n", "Use '-help' to print detailed descriptions of command line arguments\r\n" ] } ], "prompt_number": 8 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "RefSeqGene" ] }, { "cell_type": "code", "collapsed": false, "input": [ 
"#Blastn refseqgene default output\n", "!blastn -task blastn -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/refseqgene -out /Volumes/web/cnidarian/_MGarray.x.refseqgene -num_threads 2" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "from IPython.display import HTML\n", "HTML('')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "" ], "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "" ] } ], "prompt_number": 12 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "nt" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#nt default\n", "!blastn -task blastn -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/nt -out /Volumes/web/cnidarian/_MGarray.x.nt -num_threads 2" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "from IPython.display import HTML\n", "HTML('')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "" ], "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ "" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "#nt tabular output\n", "!blastn -task blastn -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/nt -out /Volumes/web/cnidarian/_MGarray.x.nt2 -outfmt 6 -max_target_seqs 1 -num_threads 2" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "!head /Volumes/web/cnidarian/_MGarray.x.nt2" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CGI_10003380\tgi|524888538|ref|XM_005100782.1|\t71.46\t445\t121\t4\t574\t1015\t792\t1233\t2e-52\t 217\r\n", 
"CGI_10004132\tgi|524891254|ref|XM_005102106.1|\t67.60\t392\t120\t3\t16\t402\t10\t399\t4e-25\t 127\r\n", "CGI_10004132\tgi|524891254|ref|XM_005102106.1|\t88.89\t36\t4\t0\t1594\t1629\t1729\t1764\t0.31\t48.2\r\n", "CGI_10004278\tgi|542204616|ref|XM_003457422.2|\t65.40\t1312\t400\t18\t65\t1352\t99\t1380\t3e-73\t 288\r\n", "CGI_10004344\tgi|291232514|ref|XM_002736163.1|\t65.36\t1045\t315\t22\t94\t1116\t130\t1149\t8e-46\t 196\r\n", "CGI_10004344\tgi|291232514|ref|XM_002736163.1|\t73.28\t116\t31\t0\t1594\t1709\t1684\t1799\t8e-08\t69.8\r\n", "CGI_10004940\tgi|524909158|ref|XM_005109573.1|\t68.64\t1977\t560\t29\t202\t2154\t346\t2286\t0.0\t 693\r\n", "CGI_10005087\tgi|260802603|ref|XM_002596136.1|\t68.74\t803\t247\t4\t1543\t2343\t133\t933\t1e-77\t 302\r\n", "CGI_10005249\tgi|524889131|ref|XM_005101070.1|\t69.92\t4402\t1227\t50\t742\t5090\t1555\t5912\t0.0\t1831\r\n", "CGI_10005249\tgi|524889131|ref|XM_005101070.1|\t68.33\t3653\t1008\t58\t5254\t8815\t6094\t9688\t0.0\t1245\r\n" ] } ], "prompt_number": 16 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Delta-Blast" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!/Users/Shared/Apps/ncbi-blast_28/bin/deltablast -h" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "USAGE\r\n", " deltablast [-h] [-help] [-import_search_strategy filename]\r\n", " [-export_search_strategy filename] [-db database_name]\r\n", " [-dbsize num_letters] [-gilist filename] [-seqidlist filename]\r\n", " [-negative_gilist filename] [-subject subject_input_file]\r\n", " [-subject_loc range] [-query input_file] [-out output_file]\r\n", " [-evalue evalue] [-word_size int_value] [-gapopen open_penalty]\r\n", " [-gapextend extend_penalty] [-xdrop_ungap float_value]\r\n", " [-xdrop_gap float_value] [-xdrop_gap_final float_value]\r\n", " [-searchsp int_value] [-max_hsps_per_subject int_value] [-seg SEG_options]\r\n", " [-soft_masking soft_masking] [-matrix 
matrix_name]\r\n", " [-threshold float_value] [-culling_limit int_value]\r\n", " [-best_hit_overhang float_value] [-best_hit_score_edge float_value]\r\n", " [-window_size int_value] [-lcase_masking] [-query_loc range]\r\n", " [-parse_deflines] [-outfmt format] [-show_gis]\r\n", " [-num_descriptions int_value] [-num_alignments int_value] [-html]\r\n", " [-max_target_seqs num_sequences] [-num_threads int_value] [-remote]\r\n", " [-comp_based_stats compo] [-use_sw_tback] [-gap_trigger float_value]\r\n", " [-num_iterations int_value] [-out_pssm checkpoint_file]\r\n", " [-out_ascii_pssm ascii_mtx_file] [-pseudocount pseudocount]\r\n", " [-domain_inclusion_ethresh ethresh] [-inclusion_ethresh ethresh]\r\n", " [-rpsdb database_name] [-show_domain_hits] [-version]\r\n", "\r\n", "DESCRIPTION\r\n", " Domain enhanced lookup time accelarated BLAST 2.2.28+\r\n", "\r\n", "Use '-help' to print detailed descriptions of command line arguments\r\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "!/Users/Shared/Apps/ncbi-blast_28/bin/deltablast -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/cdd_delta -out /Volumes/web/cnidarian/_MGarray.x.cdd_delta -num_threads 14" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "BLAST Database error: No alias or index file found for protein database [cdd_delta] in search path [/Users/sr320/Dropbox/Steven/ipython_nb::]\r\n" ] } ], "prompt_number": 8 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Swiss-Prot" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!blastx -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/uniprot_sprot -out /Volumes/web/cnidarian/_MGarray.x.swissprot -outfmt 6 -max_target_seqs 1 -num_threads 14" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": 
[ "!head /Volumes/web/cnidarian/_MGarray.x.swissprot" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CGI_10003380\tsp|P51650|SSDH_RAT\t54.51\t488\t196\t2\t211\t1668\t60\t523\t0.0\t 530\r\n", "CGI_10004132\tsp|Q8CHW4|EI2BE_MOUSE\t39.73\t672\t376\t11\t58\t1998\t39\t706\t1e-160\t 486\r\n", "CGI_10004278\tsp|Q5F3K4|WDR48_CHICK\t57.08\t678\t239\t16\t55\t2007\t17\t669\t0.0\t 704\r\n", "CGI_10004344\tsp|Q58EN8|VP33B_DANRE\t45.57\t621\t273\t10\t25\t1743\t12\t615\t1e-180\t 531\r\n", "CGI_10004940\tsp|Q28BL6|AACS_XENTR\t61.79\t683\t242\t7\t118\t2151\t4\t672\t0.0\t 884\r\n", "CGI_10005087\tsp|Q9VJ79|PDE11_DROME\t57.32\t717\t265\t5\t376\t2412\t387\t1100\t0.0\t 805\r\n", "CGI_10005087\tsp|Q9VJ79|PDE11_DROME\t33.80\t213\t123\t4\t841\t1437\t385\t593\t3e-27\t 123\r\n", "CGI_10005087\tsp|Q9VJ79|PDE11_DROME\t49.55\t111\t51\t2\t52\t369\t195\t305\t5e-17\t90.5\r\n", "CGI_10005249\tsp|Q9Y4A5|TRRAP_HUMAN\t63.96\t2775\t918\t23\t721\t8847\t541\t3299\t0.0\t3579\r\n", "CGI_10005249\tsp|Q9Y4A5|TRRAP_HUMAN\t76.72\t580\t131\t1\t9046\t10773\t3280\t3859\t0.0\t 894\r\n" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Annotating only the differentially methylated regions (DMRs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Need to extract the FASTA sequence of each DMR from the genome, using the DMR coordinate file (GFF) as the interval list." ] }, { "cell_type": "code", "collapsed": false, "input": [ "!tail /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "scaffold459\tMBD_ChIP\tHYPER\t186321\t186568\t.\t.\t.\tHYPER\r\n", "scaffold59\tMBD_ChIP\tHYPER\t225189\t225453\t.\t.\t.\tHYPER\r\n", "scaffold601\tMBD_ChIP\tHYPER\t1116073\t1116588\t.\t.\t.\tHYPER\r\n", "scaffold733\tMBD_ChIP\tHYPER\t26797\t27176\t.\t.\t.\tHYPER\r\n", "scaffold733\tMBD_ChIP\tHYPER\t27741\t29238\t.\t.\t.\tHYPER\r\n",
"scaffold748\tMBD_ChIP\tHYPER\t187113\t187500\t.\t.\t.\tHYPER\r\n", "scaffold759\tMBD_ChIP\tHYPER\t29417\t29782\t.\t.\t.\tHYPER\r\n", "scaffold759\tMBD_ChIP\tHYPER\t32132\t32777\t.\t.\t.\tHYPER\r\n", "scaffold801\tMBD_ChIP\tHYPER\t257945\t258188\t.\t.\t.\tHYPER\r\n", "scaffold82\tMBD_ChIP\tHYPER\t242904\t243288\t.\t.\t.\tHYPER" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "!wc /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 48 441 2561 /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff\r\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "!fastaFromBed -fi /Volumes/web/cnidarian/oyster.v9.fa -bed /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff -fo /Volumes/web/cnidarian/mgDMRonly.fa" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "!cp /Volumes/web/cnidarian/mgDMRonly.fa /Volumes/web/cnidarian/mgDMRonly/query.fa" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "cd /Volumes/web/cnidarian/" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/web/cnidarian\n" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "!mkdir mgDMRonly_mouse" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "#Setting Working Directory\n", "wd=\"/Volumes/web/cnidarian/mgDMRonly_mouse/\"\n", "#Setting directory of Blast Databases !!! 
make sure you have last '/'\n", "#dbd=\"/Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/db/\"\n", "dbd=\"/Volumes/Bay3/CLC_blastdatabases/\"\n", "#Database name\n", "dbn=\"nt\"\n", "#Blast algorithm: complete path to the executable\n", "ba=\"/Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/bin/blastn\"\n", "#Location of SQLShare python tools: leave this empty (\"\") if the tools are in PATH !!! otherwise make sure the path ends with '/'\n", "#spd=\"/Users/Mackenzie/sqlshare-pythonclient/tools/\"\n", "spd=\"/Users/sr320/sqlshare-pythonclient/tools/\"" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "cd {wd}\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/web/cnidarian/mgDMRonly_mouse\n" ] } ], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "!{ba} -query query.fa -db {dbd}{dbn} -out {dbn}_blast_out.tab -evalue 1E-10 -task blastn -num_threads 4 -max_target_seqs 1 -outfmt 6\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -1 {dbn}_blast_out.tab\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "scaffold39990:18800-19176\tgi|289063368|tpg|BK007044.1|\t67.65\t340\t102\t5\t31\t366\t376\t711\t2e-18\t 102\r\n" ] } ], "prompt_number": 50 }, { "cell_type": "code", "collapsed": false, "input": [ "!tr '|' \"\\t\" <{dbn}_blast_out.tab> {dbn}_blast_out2.tab\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 42 }, { "cell_type": "code", "collapsed": false, "input": [ "\n", "#Uploads the formatted blast table to SQLShare; the dataset currently has a generic name and is meant to be temporary. Warning: an existing dataset with this name will be overwritten.\n", "!python {spd}singleupload.py -d scratchblast_out {dbn}_blast_out2.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream":
"stdout", "text": [ "processing chunk line 0 to 7 (0.000303983688354 s elapsed)\r\n", "pushing uniprot-mouse-reference_blast_out2.tab...\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "parsing 19E9D90A...\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "finished scratchblast_out\r\n" ] } ], "prompt_number": 43 }, { "cell_type": "code", "collapsed": false, "input": [ "!python {spd}fetchdata.py -s \"SELECT * FROM [sr320@washington.edu].[scratchblast_out]blast Left Join [sr320@washington.edu].[uniprot-reviewed_wGO_010714]unp ON blast.Column3 = unp.Entry Left Join [sr320@washington.edu].[SPID and GO Numbers]go ON unp.Entry = go.SPID Left Join [sr320@washington.edu].[GO_to_GOslim]slim ON slim.GO_id = go.GOID\" -f tsv -o {dbn}_join2goslim.txt" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 44 }, { "cell_type": "code", "collapsed": false, "input": [ "!head -2 {dbn}_join2goslim.txt\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Column1\tColumn2\tColumn3\tColumn4\tColumn5\tColumn6\tColumn7\tColumn8\tColumn9\tColumn10\tColumn11\tColumn12\tColumn13\tColumn14\tEntry\tEntry name\tGene ontology IDs\tInteracts with\tCross-reference (GO)\tGene ontology (GO)\tStatus\tInterPro\tPathway\tProtein names\tGene names\tOrganism\tLength\tSPID\tGOID\tGO_id\tterm\tGOSlim_bin\taspect\r", "\r\n", "scaffold1174:585060-585688\tsp\tO08848\tRO60_MOUSE\t41.87\t203\t106\t5\t612\t4\t144\t334\t2E-27\t110\tO08848\tRO60_MOUSE\tGO:0003723; GO:0060271; GO:0005737; GO:0046872; GO:0030529\t\t\tRNA binding; cilium morphogenesis; cytoplasm; metal ion binding; ribonucleoprotein complex\treviewed\tIPR008858;\t\t60 kDa SS-A/Ro ribonucleoprotein (60 kDa Ro protein) (60 kDa ribonucleoprotein Ro) (RoRNP) (TROVE domain family member 2)\tTrove2 Ssa2\tMus musculus (Mouse)\t538\tO08848\tGO:0003723\tGO:0003723\tRNA binding\tnucleic acid binding activity\tF\r", "\r\n" ] } ], 
"prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }