{ "metadata": { "name": "", "signature": "sha256:e2d633104dc8de88f53bc9d31f4669eb055e32dfb4cbc0a6e9e1cc64e6ccdce2" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "BLAST" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Screenshot of Blast page at NCBI. \n", "\"blast_187C98FA_png\"/" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Download Stand-alone BLAST" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"National_Center_for_Biotechnology_Information_187C9A28_png\"/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/`" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#unzipping [-]x --extract --get; -v, --verbose; -z, --gzip; -f, --file F\n", "!tar -xzvf ncbi-blast-2.2.29+-universal-macosx.tar.gz" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "x ncbi-blast-2.2.29+/\r\n", "x ncbi-blast-2.2.29+/bin/\r\n", "x ncbi-blast-2.2.29+/bin/makembindex" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/tblastn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/psiblast" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/rpsblast" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/legacy_blast.pl\r\n", "x ncbi-blast-2.2.29+/bin/blastdbcmd" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/makeblastdb" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/tblastx" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/blastn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", 
"x ncbi-blast-2.2.29+/bin/blastp" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/segmasker" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/dustmasker" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/blastx" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/blast_formatter" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/windowmasker" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/blastdb_aliastool" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/convert2blastmask" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/update_blastdb.pl\r\n", "x ncbi-blast-2.2.29+/bin/deltablast" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/blastdbcheck" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/rpstblastn" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/bin/makeprofiledb" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "x ncbi-blast-2.2.29+/doc/\r\n", "x ncbi-blast-2.2.29+/doc/README.txt\r\n", "x ncbi-blast-2.2.29+/README\r\n", "x ncbi-blast-2.2.29+/ncbi_package_info\r\n", "x ncbi-blast-2.2.29+/LICENSE\r\n", "x ncbi-blast-2.2.29+/ChangeLog\r\n" ] } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "cd ncbi-blast-2.2.29+/bin" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/Bay3/Software/ncbi-blast-2.2.29+/bin\n" ] } ], "prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, 
"outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "ls -1" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\u001b[31mblast_formatter\u001b[m\u001b[m*\r\n", "\u001b[31mblastdb_aliastool\u001b[m\u001b[m*\r\n", "\u001b[31mblastdbcheck\u001b[m\u001b[m*\r\n", "\u001b[31mblastdbcmd\u001b[m\u001b[m*\r\n", "\u001b[31mblastn\u001b[m\u001b[m*\r\n", "\u001b[31mblastp\u001b[m\u001b[m*\r\n", "\u001b[31mblastx\u001b[m\u001b[m*\r\n", "\u001b[31mconvert2blastmask\u001b[m\u001b[m*\r\n", "\u001b[31mdeltablast\u001b[m\u001b[m*\r\n", "\u001b[31mdustmasker\u001b[m\u001b[m*\r\n", "\u001b[31mlegacy_blast.pl\u001b[m\u001b[m*\r\n", "\u001b[31mmakeblastdb\u001b[m\u001b[m*\r\n", "\u001b[31mmakembindex\u001b[m\u001b[m*\r\n", "\u001b[31mmakeprofiledb\u001b[m\u001b[m*\r\n", "\u001b[31mpsiblast\u001b[m\u001b[m*\r\n", "\u001b[31mrpsblast\u001b[m\u001b[m*\r\n", "\u001b[31mrpstblastn\u001b[m\u001b[m*\r\n", "\u001b[31msegmasker\u001b[m\u001b[m*\r\n", "\u001b[31mtblastn\u001b[m\u001b[m*\r\n", "\u001b[31mtblastx\u001b[m\u001b[m*\r\n", "\u001b[31mupdate_blastdb.pl\u001b[m\u001b[m*\r\n", "\u001b[31mwindowmasker\u001b[m\u001b[m*\r\n" ] } ], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "#check to see if \"works\"\n", "!blastx -h" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "USAGE\r\n", " blastx [-h] [-help] [-import_search_strategy filename]\r\n", " [-export_search_strategy filename] [-db database_name]\r\n", " [-dbsize num_letters] [-gilist filename] [-seqidlist filename]\r\n", " [-negative_gilist filename] [-entrez_query entrez_query]\r\n", " [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]\r\n", " [-subject subject_input_file] [-subject_loc range] [-query input_file]\r\n", " [-out output_file] [-evalue evalue] [-word_size int_value]\r\n", " [-gapopen open_penalty] [-gapextend extend_penalty]\r\n", " 
[-xdrop_ungap float_value] [-xdrop_gap float_value]\r\n", " [-xdrop_gap_final float_value] [-searchsp int_value]\r\n", " [-max_hsps_per_subject int_value] [-max_intron_length length]\r\n", " [-seg SEG_options] [-soft_masking soft_masking] [-matrix matrix_name]\r\n", " [-threshold float_value] [-culling_limit int_value]\r\n", " [-best_hit_overhang float_value] [-best_hit_score_edge float_value]\r\n", " [-window_size int_value] [-ungapped] [-lcase_masking] [-query_loc range]\r\n", " [-strand strand] [-parse_deflines] [-query_gencode int_value]\r\n", " [-outfmt format] [-show_gis] [-num_descriptions int_value]\r\n", " [-num_alignments int_value] [-html] [-max_target_seqs num_sequences]\r\n", " [-num_threads int_value] [-remote] [-comp_based_stats compo]\r\n", " [-use_sw_tback] [-version]\r\n", "\r\n", "DESCRIPTION\r\n", " Translated Query-Protein Subject BLAST 2.2.28+\r\n", "\r\n", "Use '-help' to print detailed descriptions of command line arguments\r\n" ] } ], "prompt_number": 35 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Create a Blast Database" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I would like to make a BLAST database from UniProt/Swiss-Prot." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Screenshot:\n", "\n", "\"Download_187DDB0F.png\"" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cd ncbi-blast-2.2.29+/" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[Errno 2] No such file or directory: 'ncbi-blast-2.2.29+/'\n", "/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db\n" ] } ], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "cd db" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[Errno 2] No such file or directory: 'db'\n", "/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db\n" ] } ], "prompt_number": 54 }, { "cell_type": "code", "collapsed": false, "input": [ "ls" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "uniprot_sprot.fasta.gz\r\n" ] } ], "prompt_number": 56 }, { "cell_type": "code", "collapsed": false, "input": [ "!gzip -d uniprot_sprot.fasta.gz" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 62 }, { "cell_type": "code", "collapsed": false, "input": [ "ls" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "uniprot_sprot.fasta\r\n" ] } ], "prompt_number": 63 }, { "cell_type": "code", "collapsed": false, "input": [ "pwd" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 65, "text": [ "u'/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db'" ] } ], "prompt_number": 65 }, { "cell_type": "code", "collapsed": false, "input": [ "#note I am working in dir db, thus can just use file names. 
In most cases you would use the complete path.\n", "!makeblastdb -in uniprot_sprot.fasta -dbtype prot -out uniprot_sprot_r2013_12 " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "\r\n", "Building a new DB, current time: 01/08/2014 11:34:36\r\n", "New DB name: uniprot_sprot_r2013_12\r\n", "New DB title: uniprot_sprot.fasta\r\n", "Sequence type: Protein\r\n", "Keep Linkouts: T\r\n", "Keep MBits: T\r\n", "Maximum file size: 1000000000B\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Adding sequences from FASTA; added 541954 sequences in 53.9535 seconds.\r\n" ] } ], "prompt_number": 66 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Get a Query Sequence" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#check the current location before creating a new directory for the query\n", "!pwd" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/Bay3/Software/ncbi-blast-2.2.29+/db\r\n" ] } ], "prompt_number": 79 }, { "cell_type": "code", "collapsed": false, "input": [ "cd .." 
], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/Bay3/Software/ncbi-blast-2.2.29+\n" ] } ], "prompt_number": 80 }, { "cell_type": "code", "collapsed": false, "input": [ "!mkdir query" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 83 }, { "cell_type": "code", "collapsed": false, "input": [ "cd query/" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/Volumes/Bay3/Software/ncbi-blast-2.2.29+/query\n" ] } ], "prompt_number": 85 }, { "cell_type": "code", "collapsed": false, "input": [ "#getting file from url to local location\n", "#also curl -O works\n", "!wget http://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "--2014-01-08 11:40:14-- http://eagle.fish.washington.edu/cnidarian/Ab_4denovo_CLC6_a.fa\r\n", "Resolving eagle.fish.washington.edu... " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "128.95.149.81\r\n", "Connecting to eagle.fish.washington.edu|128.95.149.81|:80... connected.\r\n", "HTTP request sent, awaiting response... 
200 OK\r\n", "Length: 2030182 (1.9M) [text/plain]\r\n", "Saving to: `Ab_4denovo_CLC6_a.fa'\r\n", "\r\n", "\r", " 0% [ ] 0 --.-K/s \r", "100%[======================================>] 2,030,182 --.-K/s in 0.03s \r\n", "\r\n", "2014-01-08 11:40:14 (68.2 MB/s) - `Ab_4denovo_CLC6_a.fa' saved [2030182/2030182]\r\n", "\r\n" ] } ], "prompt_number": 86 }, { "cell_type": "code", "collapsed": false, "input": [ "#lets get a preview\n", "!head Ab_4denovo_CLC6_a.fa" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ ">solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_1\r\n", "ACACCCCACCCCAACGCACCCTCACCCCCACCCCAACAATCCATGATTGAATACTTCATC\r\n", "TATCCAAGACAAACTCCTCCTACAATCCATGATAGAATTCCTCCAAAAATAATTTCACAC\r\n", "TGAAACTCCGGTATCCGAGTTATTTTGTTCCCAGTAAAATGGCATCAACAAAAGTAGGTC\r\n", "TGGATTAACGAACCAATGTTGCTGCGTAATATCCCATTGACATATCTTGTCGATTCCTAC\r\n", "CAGGATCCGGACTGACGAGATTTCACTGTACGTTTATGCAAGTCATTTCCATATATAAAA\r\n", "TTGGATCTTATTTGCACAGTTAAATGTCTCTATGCTTATTTATAAATCAATGCCCGTAAG\r\n", "CTCCTAATATTTCTCTTTTCGTCCGACGAGCAAACAGTGAGTTTACTGTGGCCTTCAGCA\r\n", "AAAGTATTGATGTTGTAAATCTCAGTTGTGATTGAACAATTTGCCTCACTAGAAGTAGCC\r\n", "TTC\r\n" ] } ], "prompt_number": 87 }, { "cell_type": "code", "collapsed": false, "input": [ "#word count\n", "!wc Ab_4denovo_CLC6_a.fa" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 35092 35092 2030182 Ab_4denovo_CLC6_a.fa\r\n" ] } ], "prompt_number": 88 }, { "cell_type": "code", "collapsed": false, "input": [ "#how many sequences? 
let's count \">\" since each contig has exactly one header line\n", "!grep -c \">\" Ab_4denovo_CLC6_a.fa" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "5490\r\n" ] } ], "prompt_number": 90 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Run Blast" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#will use full paths; make sure the out/ directory exists first (e.g. mkdir out)\n", "!blastx \\\n", "-query /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/query/Ab_4denovo_CLC6_a.fa \\\n", "-db /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/db/uniprot_sprot_r2013_12 \\\n", "-out /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab \\\n", "-evalue 1E-20 \\\n", "-max_target_seqs 1 \\\n", "-outfmt 6" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "!head /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_3\tsp|O42248|GBLP_DANRE\t82.46\t171\t30\t0\t1\t513\t35\t205\t1e-101\t 301\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_5\tsp|Q08013|SSRG_RAT\t75.38\t65\t16\t0\t3\t197\t121\t185\t1e-27\t 104\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_6\tsp|P12234|MPCP_BOVIN\t76.62\t77\t18\t0\t2\t232\t286\t362\t2e-23\t98.6\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_9\tsp|Q41629|ADT1_WHEAT\t82.26\t62\t11\t0\t3\t188\t170\t231\t3e-27\t 104\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_13\tsp|Q32NG4|PDDC1_XENLA\t54.44\t90\t40\t1\t1\t270\t140\t228\t1e-27\t 106\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_23\tsp|Q9GNE2|RL23_AEDAE\t97.22\t72\t2\t0\t67\t282\t14\t85\t1e-42\t 142\r\n", 
"solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_31\tsp|Q3V1H3|HPHL1_MOUSE\t53.38\t133\t59\t1\t2\t391\t23\t155\t5e-42\t 153\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_32\tsp|Q641Y2|NDUS2_RAT\t88.03\t117\t14\t0\t2\t352\t334\t450\t1e-70\t 224\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_37\tsp|Q9D3D9|ATPD_MOUSE\t56.10\t123\t54\t0\t2\t370\t46\t168\t7e-42\t 144\r\n", "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed_contig_39\tsp|Q39613|CYPH_CATRO\t75.00\t120\t23\t1\t55\t393\t1\t120\t7e-49\t 160\r\n" ] } ], "prompt_number": 94 }, { "cell_type": "code", "collapsed": false, "input": [ "!wc /Volumes/Bay3/Software/ncbi-blast-2.2.29\\+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 664 7968 84910 /Volumes/Bay3/Software/ncbi-blast-2.2.29+/out/Ab_4denovo_CLC6_a_uniprot_blastx.tab\r\n" ] } ], "prompt_number": 95 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }