{
"metadata": {
"name": "",
"signature": "sha256:68ad69cf7f9dd05d22c51f4c12993711a6ab1b504ffa2fe5094d6088471d78cd"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Annotation of Genes Associated with Differential Methylation - E2 Exposure Trial"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Absolute path so the cell does not depend on the kernel's starting directory\n",
"%cd /Volumes/web/cnidarian/"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Volumes/web/cnidarian\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!curl -O http://eagle.fish.washington.edu/bivalvia/array/genesWithDiffMeth.fasta"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\r\n",
" Dload Upload Total Spent Left Speed\r\n",
"\r",
" 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r",
"100 244k 100 244k 0 0 1723k 0 --:--:-- --:--:-- --:--:-- 1835k\r\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!wc /Volumes/web/cnidarian/genesWithDiffMeth.fasta"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 166 166 250002 /Volumes/web/cnidarian/genesWithDiffMeth.fasta\r\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!fgrep -c \">\" /Volumes/web/cnidarian/genesWithDiffMeth.fasta"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"83\r\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!blastn -h"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"USAGE\r\n",
" blastn [-h] [-help] [-import_search_strategy filename]\r\n",
" [-export_search_strategy filename] [-task task_name] [-db database_name]\r\n",
" [-dbsize num_letters] [-gilist filename] [-seqidlist filename]\r\n",
" [-negative_gilist filename] [-entrez_query entrez_query]\r\n",
" [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]\r\n",
" [-subject subject_input_file] [-subject_loc range] [-query input_file]\r\n",
" [-out output_file] [-evalue evalue] [-word_size int_value]\r\n",
" [-gapopen open_penalty] [-gapextend extend_penalty]\r\n",
" [-perc_identity float_value] [-xdrop_ungap float_value]\r\n",
" [-xdrop_gap float_value] [-xdrop_gap_final float_value]\r\n",
" [-searchsp int_value] [-max_hsps_per_subject int_value] [-penalty penalty]\r\n",
" [-reward reward] [-no_greedy] [-min_raw_gapped_score int_value]\r\n",
" [-template_type type] [-template_length int_value] [-dust DUST_options]\r\n",
" [-filtering_db filtering_database]\r\n",
" [-window_masker_taxid window_masker_taxid]\r\n",
" [-window_masker_db window_masker_db] [-soft_masking soft_masking]\r\n",
" [-ungapped] [-culling_limit int_value] [-best_hit_overhang float_value]\r\n",
" [-best_hit_score_edge float_value] [-window_size int_value]\r\n",
" [-off_diagonal_range int_value] [-use_index boolean] [-index_name string]\r\n",
" [-lcase_masking] [-query_loc range] [-strand strand] [-parse_deflines]\r\n",
" [-outfmt format] [-show_gis] [-num_descriptions int_value]\r\n",
" [-num_alignments int_value] [-html] [-max_target_seqs num_sequences]\r\n",
" [-num_threads int_value] [-remote] [-version]\r\n",
"\r\n",
"DESCRIPTION\r\n",
" Nucleotide-Nucleotide BLAST 2.2.28+\r\n",
"\r\n",
"Use '-help' to print detailed descriptions of command line arguments\r\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"RefSeqGene"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Blastn refseqgene default output\n",
"!blastn -task blastn -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/refseqgene -out /Volumes/web/cnidarian/_MGarray.x.refseqgene -num_threads 2"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.display import HTML\n",
"HTML('')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
""
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
""
]
}
],
"prompt_number": 12
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"nt"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#nt default\n",
"!blastn -task blastn -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/nt -out /Volumes/web/cnidarian/_MGarray.x.nt -num_threads 2"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.display import HTML\n",
"HTML('')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
""
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
""
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#nt tabular output\n",
"!blastn -task blastn -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/nt -out /Volumes/web/cnidarian/_MGarray.x.nt2 -outfmt 6 -max_target_seqs 1 -num_threads 2"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/web/cnidarian/_MGarray.x.nt2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"CGI_10003380\tgi|524888538|ref|XM_005100782.1|\t71.46\t445\t121\t4\t574\t1015\t792\t1233\t2e-52\t 217\r\n",
"CGI_10004132\tgi|524891254|ref|XM_005102106.1|\t67.60\t392\t120\t3\t16\t402\t10\t399\t4e-25\t 127\r\n",
"CGI_10004132\tgi|524891254|ref|XM_005102106.1|\t88.89\t36\t4\t0\t1594\t1629\t1729\t1764\t0.31\t48.2\r\n",
"CGI_10004278\tgi|542204616|ref|XM_003457422.2|\t65.40\t1312\t400\t18\t65\t1352\t99\t1380\t3e-73\t 288\r\n",
"CGI_10004344\tgi|291232514|ref|XM_002736163.1|\t65.36\t1045\t315\t22\t94\t1116\t130\t1149\t8e-46\t 196\r\n",
"CGI_10004344\tgi|291232514|ref|XM_002736163.1|\t73.28\t116\t31\t0\t1594\t1709\t1684\t1799\t8e-08\t69.8\r\n",
"CGI_10004940\tgi|524909158|ref|XM_005109573.1|\t68.64\t1977\t560\t29\t202\t2154\t346\t2286\t0.0\t 693\r\n",
"CGI_10005087\tgi|260802603|ref|XM_002596136.1|\t68.74\t803\t247\t4\t1543\t2343\t133\t933\t1e-77\t 302\r\n",
"CGI_10005249\tgi|524889131|ref|XM_005101070.1|\t69.92\t4402\t1227\t50\t742\t5090\t1555\t5912\t0.0\t1831\r\n",
"CGI_10005249\tgi|524889131|ref|XM_005101070.1|\t68.33\t3653\t1008\t58\t5254\t8815\t6094\t9688\t0.0\t1245\r\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Delta-Blast"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!/Users/Shared/Apps/ncbi-blast_28/bin/deltablast -h"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"USAGE\r\n",
" deltablast [-h] [-help] [-import_search_strategy filename]\r\n",
" [-export_search_strategy filename] [-db database_name]\r\n",
" [-dbsize num_letters] [-gilist filename] [-seqidlist filename]\r\n",
" [-negative_gilist filename] [-subject subject_input_file]\r\n",
" [-subject_loc range] [-query input_file] [-out output_file]\r\n",
" [-evalue evalue] [-word_size int_value] [-gapopen open_penalty]\r\n",
" [-gapextend extend_penalty] [-xdrop_ungap float_value]\r\n",
" [-xdrop_gap float_value] [-xdrop_gap_final float_value]\r\n",
" [-searchsp int_value] [-max_hsps_per_subject int_value] [-seg SEG_options]\r\n",
" [-soft_masking soft_masking] [-matrix matrix_name]\r\n",
" [-threshold float_value] [-culling_limit int_value]\r\n",
" [-best_hit_overhang float_value] [-best_hit_score_edge float_value]\r\n",
" [-window_size int_value] [-lcase_masking] [-query_loc range]\r\n",
" [-parse_deflines] [-outfmt format] [-show_gis]\r\n",
" [-num_descriptions int_value] [-num_alignments int_value] [-html]\r\n",
" [-max_target_seqs num_sequences] [-num_threads int_value] [-remote]\r\n",
" [-comp_based_stats compo] [-use_sw_tback] [-gap_trigger float_value]\r\n",
" [-num_iterations int_value] [-out_pssm checkpoint_file]\r\n",
" [-out_ascii_pssm ascii_mtx_file] [-pseudocount pseudocount]\r\n",
" [-domain_inclusion_ethresh ethresh] [-inclusion_ethresh ethresh]\r\n",
" [-rpsdb database_name] [-show_domain_hits] [-version]\r\n",
"\r\n",
"DESCRIPTION\r\n",
" Domain enhanced lookup time accelarated BLAST 2.2.28+\r\n",
"\r\n",
"Use '-help' to print detailed descriptions of command line arguments\r\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!/Users/Shared/Apps/ncbi-blast_28/bin/deltablast -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/cdd_delta -out /Volumes/web/cnidarian/_MGarray.x.cdd_delta -num_threads 14"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"BLAST Database error: No alias or index file found for protein database [cdd_delta] in search path [/Users/sr320/Dropbox/Steven/ipython_nb::]\r\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Swiss-Prot"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!blastx -query /Volumes/web/cnidarian/genesWithDiffMeth.fasta -db /Volumes/web/whale/blast/db/uniprot_sprot -out /Volumes/web/cnidarian/_MGarray.x.swissprot -outfmt 6 -max_target_seqs 1 -num_threads 14"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/web/cnidarian/_MGarray.x.swissprot"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"CGI_10003380\tsp|P51650|SSDH_RAT\t54.51\t488\t196\t2\t211\t1668\t60\t523\t0.0\t 530\r\n",
"CGI_10004132\tsp|Q8CHW4|EI2BE_MOUSE\t39.73\t672\t376\t11\t58\t1998\t39\t706\t1e-160\t 486\r\n",
"CGI_10004278\tsp|Q5F3K4|WDR48_CHICK\t57.08\t678\t239\t16\t55\t2007\t17\t669\t0.0\t 704\r\n",
"CGI_10004344\tsp|Q58EN8|VP33B_DANRE\t45.57\t621\t273\t10\t25\t1743\t12\t615\t1e-180\t 531\r\n",
"CGI_10004940\tsp|Q28BL6|AACS_XENTR\t61.79\t683\t242\t7\t118\t2151\t4\t672\t0.0\t 884\r\n",
"CGI_10005087\tsp|Q9VJ79|PDE11_DROME\t57.32\t717\t265\t5\t376\t2412\t387\t1100\t0.0\t 805\r\n",
"CGI_10005087\tsp|Q9VJ79|PDE11_DROME\t33.80\t213\t123\t4\t841\t1437\t385\t593\t3e-27\t 123\r\n",
"CGI_10005087\tsp|Q9VJ79|PDE11_DROME\t49.55\t111\t51\t2\t52\t369\t195\t305\t5e-17\t90.5\r\n",
"CGI_10005249\tsp|Q9Y4A5|TRRAP_HUMAN\t63.96\t2775\t918\t23\t721\t8847\t541\t3299\t0.0\t3579\r\n",
"CGI_10005249\tsp|Q9Y4A5|TRRAP_HUMAN\t76.72\t580\t131\t1\t9046\t10773\t3280\t3859\t0.0\t 894\r\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Annotating Only the Differentially Methylated Regions (DMRs)"
]
},
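{
"cell_type": "markdown",
"metadata": {},
"source": [
"Extract the FASTA sequences of the DMRs from the genome: the DMR coordinates are in a GFF file, which is passed to `fastaFromBed` as its `-bed` input."
]
},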
{
"cell_type": "code",
"collapsed": false,
"input": [
"!tail /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"scaffold459\tMBD_ChIP\tHYPER\t186321\t186568\t.\t.\t.\tHYPER\r\n",
"scaffold59\tMBD_ChIP\tHYPER\t225189\t225453\t.\t.\t.\tHYPER\r\n",
"scaffold601\tMBD_ChIP\tHYPER\t1116073\t1116588\t.\t.\t.\tHYPER\r\n",
"scaffold733\tMBD_ChIP\tHYPER\t26797\t27176\t.\t.\t.\tHYPER\r\n",
"scaffold733\tMBD_ChIP\tHYPER\t27741\t29238\t.\t.\t.\tHYPER\r\n",
"scaffold748\tMBD_ChIP\tHYPER\t187113\t187500\t.\t.\t.\tHYPER\r\n",
"scaffold759\tMBD_ChIP\tHYPER\t29417\t29782\t.\t.\t.\tHYPER\r\n",
"scaffold759\tMBD_ChIP\tHYPER\t32132\t32777\t.\t.\t.\tHYPER\r\n",
"scaffold801\tMBD_ChIP\tHYPER\t257945\t258188\t.\t.\t.\tHYPER\r\n",
"scaffold82\tMBD_ChIP\tHYPER\t242904\t243288\t.\t.\t.\tHYPER"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!wc /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 48 441 2561 /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff\r\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!fastaFromBed -fi /Volumes/web/cnidarian/oyster.v9.fa -bed /Volumes/web/bivalvia/array/2013.11.22.mgavery/mgaveryDMRs_112212.gff -fo /Volumes/web/cnidarian/mgDMRonly.fa"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!cp /Volumes/web/cnidarian/mgDMRonly.fa /Volumes/web/cnidarian/mgDMRonly/query.fa"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cd /Volumes/web/cnidarian/"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Volumes/web/cnidarian\n"
]
}
],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Create the working directory (-p: no error if it already exists on a re-run) and stage\n",
"# the DMR FASTA in it as query.fa, the file the blast cell below reads from that directory.\n",
"!mkdir -p mgDMRonly_mouse\n",
"!cp mgDMRonly.fa mgDMRonly_mouse/query.fa"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 35
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Setting Working Directory\n",
"wd=\"/Volumes/web/cnidarian/mgDMRonly_mouse/\"\n",
"#Directory of the Blast databases !!! make sure to keep the trailing '/'\n",
"#dbd=\"/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db/\"\n",
"dbd=\"/Volumes/Bay3/CLC_blastdatabases/\"\n",
"#Database name\n",
"#NOTE(review): the SQLShare join near the end of this notebook matches Column3 of the split blast table against UniProt entries;\n",
"#with dbn=\"nt\" Column3 holds the GI number instead (see the head -1 output of the blast table) -- confirm the database before running the join.\n",
"dbn=\"nt\"\n",
"#Blast algorithm complete path (no backslash before '+': the shell does not need it and it is an invalid escape sequence in a Python string)\n",
"ba=\"/Volumes/Bay3/Software/ncbi-blast-2.2.29+/bin/blastn\"\n",
"#Location of SQLShare python tools: can be left empty (\"\") if the tools are in PATH !!! make sure to keep the trailing '/'\n",
"#spd=\"/Users/Mackenzie/sqlshare-pythonclient/tools/\"\n",
"spd=\"/Users/sr320/sqlshare-pythonclient/tools/\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 46
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cd {wd}\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Volumes/web/cnidarian/mgDMRonly_mouse\n"
]
}
],
"prompt_number": 47
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!{ba} -query query.fa -db {dbd}{dbn} -out {dbn}_blast_out.tab -evalue 1E-10 -task blastn -num_threads 4 -max_target_seqs 1 -outfmt 6\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 49
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head -1 {dbn}_blast_out.tab\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"scaffold39990:18800-19176\tgi|289063368|tpg|BK007044.1|\t67.65\t340\t102\t5\t31\t366\t376\t711\t2e-18\t 102\r\n"
]
}
],
"prompt_number": 50
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!tr '|' \"\\t\" <{dbn}_blast_out.tab> {dbn}_blast_out2.tab\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\n",
"#Uploads formatted blast table to SQLshare; currently has generic name and meant to be temporary: Warning will overwrite.\n",
"!python {spd}singleupload.py -d scratchblast_out {dbn}_blast_out2.tab"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"processing chunk line 0 to 7 (0.000303983688354 s elapsed)\r\n",
"pushing uniprot-mouse-reference_blast_out2.tab...\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"parsing 19E9D90A...\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"finished scratchblast_out\r\n"
]
}
],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!python {spd}fetchdata.py -s \"SELECT * FROM [sr320@washington.edu].[scratchblast_out]blast Left Join [sr320@washington.edu].[uniprot-reviewed_wGO_010714]unp ON blast.Column3 = unp.Entry Left Join [sr320@washington.edu].[SPID and GO Numbers]go ON unp.Entry = go.SPID Left Join [sr320@washington.edu].[GO_to_GOslim]slim ON slim.GO_id = go.GOID\" -f tsv -o {dbn}_join2goslim.txt"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head -2 {dbn}_join2goslim.txt\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Column1\tColumn2\tColumn3\tColumn4\tColumn5\tColumn6\tColumn7\tColumn8\tColumn9\tColumn10\tColumn11\tColumn12\tColumn13\tColumn14\tEntry\tEntry name\tGene ontology IDs\tInteracts with\tCross-reference (GO)\tGene ontology (GO)\tStatus\tInterPro\tPathway\tProtein names\tGene names\tOrganism\tLength\tSPID\tGOID\tGO_id\tterm\tGOSlim_bin\taspect\r",
"\r\n",
"scaffold1174:585060-585688\tsp\tO08848\tRO60_MOUSE\t41.87\t203\t106\t5\t612\t4\t144\t334\t2E-27\t110\tO08848\tRO60_MOUSE\tGO:0003723; GO:0060271; GO:0005737; GO:0046872; GO:0030529\t\t\tRNA binding; cilium morphogenesis; cytoplasm; metal ion binding; ribonucleoprotein complex\treviewed\tIPR008858;\t\t60 kDa SS-A/Ro ribonucleoprotein (60 kDa Ro protein) (60 kDa ribonucleoprotein Ro) (RoRNP) (TROVE domain family member 2)\tTrove2 Ssa2\tMus musculus (Mouse)\t538\tO08848\tGO:0003723\tGO:0003723\tRNA binding\tnucleic acid binding activity\tF\r",
"\r\n"
]
}
],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}