{
"metadata": {
"name": "",
"signature": "sha256:e2d633104dc8de88f53bc9d31f4669eb055e32dfb4cbc0a6e9e1cc64e6ccdce2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"BLAST"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Screenshot of the BLAST page at NCBI.\n",
""
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Download Stand-alone BLAST"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/`"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Unpack the archive. tar flags: -x extract, -z decompress with gzip, -v list each file verbosely, -f read the named archive file\n",
"!tar -xzvf ncbi-blast-2.2.29+-universal-macosx.tar.gz"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"x ncbi-blast-2.2.29+/\r\n",
"x ncbi-blast-2.2.29+/bin/\r\n",
"x ncbi-blast-2.2.29+/bin/makembindex"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/tblastn"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/psiblast"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/rpsblast"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/legacy_blast.pl\r\n",
"x ncbi-blast-2.2.29+/bin/blastdbcmd"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/makeblastdb"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/tblastx"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/blastn"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/blastp"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/segmasker"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/dustmasker"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/blastx"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/blast_formatter"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/windowmasker"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/blastdb_aliastool"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/convert2blastmask"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/update_blastdb.pl\r\n",
"x ncbi-blast-2.2.29+/bin/deltablast"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/blastdbcheck"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/rpstblastn"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/bin/makeprofiledb"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"x ncbi-blast-2.2.29+/doc/\r\n",
"x ncbi-blast-2.2.29+/doc/README.txt\r\n",
"x ncbi-blast-2.2.29+/README\r\n",
"x ncbi-blast-2.2.29+/ncbi_package_info\r\n",
"x ncbi-blast-2.2.29+/LICENSE\r\n",
"x ncbi-blast-2.2.29+/ChangeLog\r\n"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cd ncbi-blast-2.2.29+/bin"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Volumes/Bay3/Software/ncbi-blast-2.2.29+/bin\n"
]
}
],
"prompt_number": 31
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ls -1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\u001b[31mblast_formatter\u001b[m\u001b[m*\r\n",
"\u001b[31mblastdb_aliastool\u001b[m\u001b[m*\r\n",
"\u001b[31mblastdbcheck\u001b[m\u001b[m*\r\n",
"\u001b[31mblastdbcmd\u001b[m\u001b[m*\r\n",
"\u001b[31mblastn\u001b[m\u001b[m*\r\n",
"\u001b[31mblastp\u001b[m\u001b[m*\r\n",
"\u001b[31mblastx\u001b[m\u001b[m*\r\n",
"\u001b[31mconvert2blastmask\u001b[m\u001b[m*\r\n",
"\u001b[31mdeltablast\u001b[m\u001b[m*\r\n",
"\u001b[31mdustmasker\u001b[m\u001b[m*\r\n",
"\u001b[31mlegacy_blast.pl\u001b[m\u001b[m*\r\n",
"\u001b[31mmakeblastdb\u001b[m\u001b[m*\r\n",
"\u001b[31mmakembindex\u001b[m\u001b[m*\r\n",
"\u001b[31mmakeprofiledb\u001b[m\u001b[m*\r\n",
"\u001b[31mpsiblast\u001b[m\u001b[m*\r\n",
"\u001b[31mrpsblast\u001b[m\u001b[m*\r\n",
"\u001b[31mrpstblastn\u001b[m\u001b[m*\r\n",
"\u001b[31msegmasker\u001b[m\u001b[m*\r\n",
"\u001b[31mtblastn\u001b[m\u001b[m*\r\n",
"\u001b[31mtblastx\u001b[m\u001b[m*\r\n",
"\u001b[31mupdate_blastdb.pl\u001b[m\u001b[m*\r\n",
"\u001b[31mwindowmasker\u001b[m\u001b[m*\r\n"
]
}
],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
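"# Check that blastx runs. Note: a bare `blastx` is looked up via PATH, and the output below reports 2.2.28+ rather than the 2.2.29+ just unpacked; use ./blastx to test the copy in this directory.\n",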
"!blastx -h"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"USAGE\r\n",
" blastx [-h] [-help] [-import_search_strategy filename]\r\n",
" [-export_search_strategy filename] [-db database_name]\r\n",
" [-dbsize num_letters] [-gilist filename] [-seqidlist filename]\r\n",
" [-negative_gilist filename] [-entrez_query entrez_query]\r\n",
" [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]\r\n",
" [-subject subject_input_file] [-subject_loc range] [-query input_file]\r\n",
" [-out output_file] [-evalue evalue] [-word_size int_value]\r\n",
" [-gapopen open_penalty] [-gapextend extend_penalty]\r\n",
" [-xdrop_ungap float_value] [-xdrop_gap float_value]\r\n",
" [-xdrop_gap_final float_value] [-searchsp int_value]\r\n",
" [-max_hsps_per_subject int_value] [-max_intron_length length]\r\n",
" [-seg SEG_options] [-soft_masking soft_masking] [-matrix matrix_name]\r\n",
" [-threshold float_value] [-culling_limit int_value]\r\n",
" [-best_hit_overhang float_value] [-best_hit_score_edge float_value]\r\n",
" [-window_size int_value] [-ungapped] [-lcase_masking] [-query_loc range]\r\n",
" [-strand strand] [-parse_deflines] [-query_gencode int_value]\r\n",
" [-outfmt format] [-show_gis] [-num_descriptions int_value]\r\n",
" [-num_alignments int_value] [-html] [-max_target_seqs num_sequences]\r\n",
" [-num_threads int_value] [-remote] [-comp_based_stats compo]\r\n",
" [-use_sw_tback] [-version]\r\n",
"\r\n",
"DESCRIPTION\r\n",
" Translated Query-Protein Subject BLAST 2.2.28+\r\n",
"\r\n",
"Use '-help' to print detailed descriptions of command line arguments\r\n"
]
}
],
"prompt_number": 35
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Create a BLAST Database"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I would like to make a BLAST database from the UniProt/Swiss-Prot protein sequences."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Screenshot:\n",
"\n",
""
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cd ncbi-blast-2.2.29+/"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[Errno 2] No such file or directory: 'ncbi-blast-2.2.29+/'\n",
"/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db\n"
]
}
],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cd db"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[Errno 2] No such file or directory: 'db'\n",
"/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db\n"
]
}
],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ls"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uniprot_sprot.fasta.gz\r\n"
]
}
],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!gzip -d uniprot_sprot.fasta.gz"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ls"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uniprot_sprot.fasta\r\n"
]
}
],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pwd"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 65,
"text": [
"u'/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db'"
]
}
],
"prompt_number": 65
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Note: I am working in the db directory, so bare file names are enough here. Most of the time you would give the complete path.\n",
"!makeblastdb -in uniprot_sprot.fasta -dbtype prot -out uniprot_sprot_r2013_12 "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"\r\n",
"Building a new DB, current time: 01/08/2014 11:34:36\r\n",
"New DB name: uniprot_sprot_r2013_12\r\n",
"New DB title: uniprot_sprot.fasta\r\n",
"Sequence type: Protein\r\n",
"Keep Linkouts: T\r\n",
"Keep MBits: T\r\n",
"Maximum file size: 1000000000B\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Adding sequences from FASTA; added 541954 sequences in 53.9535 seconds.\r\n"
]
}
],
"prompt_number": 66
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Get a Query Sequence"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# confirm the current directory before creating a new one for the query sequences\n",
"!pwd"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db\r\n"
]
}
],
"prompt_number": 79
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cd .."
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Volumes/Bay3/Software/ncbi-blast-2.2.29+\n"
]
}
],
"prompt_number": 80
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!mkdir query"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 83
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"cd query/"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Volumes/Bay3/Software/ncbi-blast-2.2.29+/query\n"
]
}
],
"prompt_number": 85
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# download the query file from the URL into the current directory\n",
"# (curl -O with the same URL also works)\n",
"!wget http://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"--2014-01-08 11:40:14-- http://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa\r\n",
"Resolving eagle.fish.washington.edu... "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"128.95.149.81\r\n",
"Connecting to eagle.fish.washington.edu|128.95.149.81|:80... connected.\r\n",
"HTTP request sent, awaiting response... 200 OK\r\n",
"Length: 2030182 (1.9M) [text/plain]\r\n",
"Saving to: `Ab_4denovo_CLC6_a.fa'\r\n",
"\r\n",
"\r",
" 0% [ ] 0 --.-K/s \r",
"100%[======================================>] 2,030,182 --.-K/s in 0.03s \r\n",
"\r\n",
"2014-01-08 11:40:14 (68.2 MB/s) - `Ab_4denovo_CLC6_a.fa' saved [2030182/2030182]\r\n",
"\r\n"
]
}
],
"prompt_number": 86
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# let's preview the first 10 lines\n",
"!head Ab_4denovo_CLC6_a.fa"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
">solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1\r\n",
"ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC\r\n",
"TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC\r\n",
"TGAAACTCCGGTATCCGAGTTATTTTGTTCCCAGTAAAATGGCATCAACAAAAGTAGGTC\r\n",
"TGGATTAACGAACCAATGTTGCTGCGTAATATCCCATTGACATATCTTGTCGATTCCTAC\r\n",
"CAGGATCCGGACTGACGAGATTTCACTGTACGTTTATGCAAGTCATTTCCATATATAAAA\r\n",
"TTGGATCTTATTTGCACAGTTAAATGTCTCTATGCTTATTTATAAATCAATGCCCGTAAG\r\n",
"CTCCTAATATTTCTCTTTTCGTCCGACGAGCAAACAGTGAGTTTACTGTGGCCTTCAGCA\r\n",
"AAAGTATTGATGTTGTAAATCTCAGTTGTGATTGAACAATTTGCCTCACTAGAAGTAGCC\r\n",
"TTC\r\n"
]
}
],
"prompt_number": 87
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# count lines, words, and bytes in the file\n",
"!wc Ab_4denovo_CLC6_a.fa"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 35092 35092 2030182 Ab_4denovo_CLC6_a.fa\r\n"
]
}
],
"prompt_number": 88
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# How many sequences? Let's count the lines containing \">\", since each contig has exactly one header line.\n",
"!grep -c \">\" Ab_4denovo_CLC6_a.fa"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"5490\r\n"
]
}
],
"prompt_number": 90
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Run BLAST"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
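"# Use full paths so the command works from any directory. Note: the out/ directory is not created anywhere in this notebook; create it first (mkdir) if it does not exist.\n",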
"!blastx \\\n",
"-query /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/query/Ab_4denovo_CLC6_a.fa \\\n",
"-db /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/db/uniprot_sprot_r2013_12 \\\n",
"-out /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab \\\n",
"-evalue 1E-20 \\\n",
"-max_target_seqs 1 \\\n",
"-outfmt 6"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3\tsp|O42248|GBLP_DANRE\t82.46\t171\t30\t0\t1\t513\t35\t205\t1e-101\t 301\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5\tsp|Q08013|SSRG_RAT\t75.38\t65\t16\t0\t3\t197\t121\t185\t1e-27\t 104\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_6\tsp|P12234|MPCP_BOVIN\t76.62\t77\t18\t0\t2\t232\t286\t362\t2e-23\t98.6\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_9\tsp|Q41629|ADT1_WHEAT\t82.26\t62\t11\t0\t3\t188\t170\t231\t3e-27\t 104\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_13\tsp|Q32NG4|PDDC1_XENLA\t54.44\t90\t40\t1\t1\t270\t140\t228\t1e-27\t 106\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_23\tsp|Q9GNE2|RL23_AEDAE\t97.22\t72\t2\t0\t67\t282\t14\t85\t1e-42\t 142\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_31\tsp|Q3V1H3|HPHL1_MOUSE\t53.38\t133\t59\t1\t2\t391\t23\t155\t5e-42\t 153\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_32\tsp|Q641Y2|NDUS2_RAT\t88.03\t117\t14\t0\t2\t352\t334\t450\t1e-70\t 224\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_37\tsp|Q9D3D9|ATPD_MOUSE\t56.10\t123\t54\t0\t2\t370\t46\t168\t7e-42\t 144\r\n",
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_39\tsp|Q39613|CYPH_CATRO\t75.00\t120\t23\t1\t55\t393\t1\t120\t7e-49\t 160\r\n"
]
}
],
"prompt_number": 94
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!wc /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 664 7968 84910 /Volumes/Bay3/Software/ncbi-blast-2.2.29+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab\r\n"
]
}
],
"prompt_number": 95
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}