{ "metadata": { "name": "using-blastparser" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# Explore the parser's object model: print the names, coordinates and e-value of one match\n", "# (first match of the first hit of the first record), then dir() its attributes.\n", "import blastparser\n", "\n", "# 'with' closes the input file even if parsing raises.\n", "with open('sample-blast.txt') as fp:\n", "    for record in blastparser.parse_fp(fp):\n", "        for hit in record.hits:\n", "            for match in hit.matches:\n", "                print record.query_name, hit.subject_name\n", "                print match.subject_start, match.query_start\n", "                print match.subject_end, match.query_end\n", "                print match.expect\n", "                print dir(match)\n", "                break  # one match is enough\n", "            break  # ... from one hit\n", "        break  # ... of one record" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "occulta.ctb3.1 JGI_294397\n", "1 81\n", "1307 4050\n", "1e-300\n", "['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'expect', 'frame1', 'frame2', 'query_end', 'query_sequence', 'query_start', 'subject_end', 'subject_sequence', 'subject_start']\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Collect match.expect for every match of every hit of every record in the file.\n", "evalues = []\n", "\n", "# 'with' closes the input file once the whole report has been read.\n", "with open('sample-blast.txt') as fp:\n", "    for record in blastparser.parse_fp(fp):\n", "        for hit in record.hits:\n", "            for match in hit.matches:\n", "                evalues.append(match.expect)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# Histogram of the collected e-values (100 bins, normalised).\n", "# NOTE(review): hist() and show() are never imported in this notebook; presumably the kernel\n", "# was started in pylab mode -- confirm before running on a fresh kernel.\n", "hist(evalues, bins=100, normed=True)\n", "show()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "display_data", "png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAEPCAYAAAC5sYRSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFVRJREFUeJzt3X9MVefhx/HPcVhT46TSy1yaMlImGRDFYlGkCt61FKiM\n/qGkgm00uD8uLoF2CW7rH62FLEvTbRmOVodb1jnBv0ozbdZpoMnlapRflQZbf2UVNDHtVmS7c2Hd\nbuF8//Ar7g6498L9hY/vV0LC5Tye8/iEvLl9ejhatm3bAgAYZUG8JwAAiDziDgAGIu4AYCDiDgAG\nIu4AYCDiDgAGikncd+3apeXLl2vVqlVBx3o8Hq1Zs0YLFy5Ue3v7lOP/+Mc/9PDDD6u2tjYaUwUA\nI8Qk7tXV1Tp+/HhIY1NTU3Xo0CFt37592uMvv/yyNm3aFMnpAYBxYhL3goICLVu2zO9r169f1549\ne5Sfn6+dO3dqaGhI0q24r1q1SgsWTJ3aBx98oL/+9a8qLi6OxbQB4K4Vtz33V155RZWVlTpz5oy2\nbdum119/PeD4iYkJ1dfX6+c//3mMZggAd6+EeFzU5/Ppvffe09mzZ0P+M/v379fmzZv10EMPiScm\nAEBgcYn7xMSEFixYoO7ubi1atGjGcZZlTX7e3d2tkydPav/+/frnP/+p//znP/rqV7+qn/zkJ7GY\nMgDcVeKyLbNo0SJt3rxZBw4c0Pj4uGzb1uDgoN8Y27b93qG3trbq6tWrGhoa0s9+9jPt2LGDsAPA\nDILGPdhtjG1tbVq9erVWr16t7du36/Lly1PGVFVV6fHHH9elS5eUkpKit956Sw0NDfrss8+Um5ur\nlStX6tixY5Kkvr4+paSk6O2335bL5Zrxuv/9rh4A4M8K9sjfkydPasmSJdqxY4fOnTs35fiZM2eU\nlZWlxMREHTp0SJ2dnTp8+HDUJgwACC5o3CVpeHhY5eXl08b9v42MjGjNmjW6du1axCYIAJi9iO65\nHzx4UOXl5ZE8JQBgDiJ2t0xnZ6daW1t1+vTpaY+zRw4AczOX278j8s59cHBQNTU1OnbsmB544IEZ\nx92+A+Ze/9i7d2/c5zBfPlgL1oK1CPwxV2HH/dq1a9q6dava2tq0YsWKcE8HAIiAoNsyVVVV6urq\n0sjIiFJSUtTQ0CCfzydJcrlcamxs1OjoqGpqaiRJCxcuVG9vb3RnDQAIKKS7ZSJyIcsK6z8xTOJ2\nu+V0OuM9jXmBtbiDtbiDtbhjru0k7gAwj821nfxLTABgIOIOAAYi7gBgIOIOAAYi7gBgIOIOAAYi\n7gBgIOIOAAYi7gBgIOIOAAYi7gBgIOIOAAYi7gBgIOIOAAYi7gBgIOIOAAYi7gBgIOIOAAYi7gBg\nIOIOAAaKadwty5JlWVq6NCmWlwWAe45lz+Wf1Z7LhSxL0u1Lze1f8waAe41lza2XbMsAgIGIOwAY\niLgDgIGIOwAYiLgDgIECxn3Xrl1avny5Vq1aNeOYl156SWlpaXrsscd08eLFiE8QADB7AeNeXV2t\n48ePz3i8t7dXJ0+eVH9/v+rr61VfXx/xCQIAZi9g3AsKCrRs2bIZj/f09KiiokJJSUmqqqrShQsX\nIj5BAMDshbXn3tvbq6ysrMnXycnJ+uSTT8KeFAAgPAnh/GHbtqf85tSt30SdyauTn7ndbjmdznAu\nDwDGcbvdcrvdYZ8n6OMHhoeHVV5ernPnzk051tzcrC+//FLf//73JUnf/OY3Z3znzuMHAGD24vL4\ngby8PLW3t+vGjRs6cuSIMjMzwzkdACBCAm7LVFVVqaurSyMjI0pJSVFDQ4N8Pp8kyeVyad26ddq4\ncaNyc3OVlJSk1tbWmEwaABAYT4UEgHmMp0ICACYRdwAwEHEHAAMRdwAwEHEHAAMRdwAwEHEHAAMR\ndwAwEHEHAAMRdwAwEHEHAAMRdwAwEHEHAAMRdwAwEHEHAAMRd
wAwEHEHAAMRdwAwEHEHAAMRdwAw\nEHEHAAMRdwAwEHEHAAMRdwAwEHEHAAMRdwAwEHEHAAMRdwAwUNC4ezweZWZmKj09Xc3NzVOO/+tf\n/9LOnTuVk5OjTZs26ejRo1GZKAAgdJZt23agATk5Odq3b59SU1NVUlKiU6dOyeFwTB7/1a9+pcHB\nQe3fv19Xr17VE088oT//+c+yLMv/QpYl6falLAW5LABAt9o5l14GfOfu9XolSYWFhUpNTVVxcbF6\nenr8xiQmJurmzZvy+XwaHR3V4sWLp4QdABBbAePe19enjIyMyddZWVnq7u72G1NVVaXx8XE5HA5t\n3LhRbW1t0ZkpACBkCeGe4I033lBCQoI+/fRTnTt3TmVlZbp69aoWLJju58ark5+53W45nc5wLw8A\nRnG73XK73WGfJ+Ceu9frldPp1MDAgCSptrZWpaWlKisrmxzz7LPP6rvf/a5KSkokSXl5eTp06JDf\nO36JPXcAmIuo7LknJiZKunXHzPDwsDo6OpSXl+c35sknn9S7776riYkJXblyRaOjo1PCDgCIraDb\nMk1NTXK5XPL5fKqrq5PD4VBLS4skyeVyqbKyUufPn1dubq6Sk5O1b9++qE8aABBY0FshI3YhtmUA\nYNaisi0DALg7EXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcA\nMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBx\nBwADEXcAMBBxBwADEXcAMFDQuHs8HmVmZio9PV3Nzc3Tjunr69PatWuVmZkpp9MZ6TkCAGbJsm3b\nDjQgJydH+/btU2pqqkpKSnTq1Ck5HI7J47ZtKzs7W7/4xS9UVFSkkZERv+OTF7IsSbcvZSnIZQEA\nutXOufQy4Dt3r9crSSosLFRqaqqKi4vV09PjN6a/v1/Z2dkqKiqSpGnDDgCIrYBx7+vrU0ZGxuTr\nrKwsdXd3+405ceKELMtSQUGBysvLdeLEiejMFAAQsoRwT/DFF1/oww8/VGdnp8bGxvTUU0/po48+\n0v333z/N6FcnP3O73ezPA8D/cLvdcrvdYZ8n4J671+uV0+nUwMCAJKm2tlalpaUqKyubHPPHP/5R\nbrdbP/3pTyVJ27Zt065du1RSUuJ/IfbcAWDWorLnnpiYKOnWHTPDw8Pq6OhQXl6e35j169erq6tL\nY2NjGh0d1cDAgDZs2DDriQAAIifotkxTU5NcLpd8Pp/q6urkcDjU0tIiSXK5XHrwwQdVXV2t3Nxc\nJScnq7GxUUuWLIn6xAEAMwt6K2TELsS2DADMWlS2ZQAAdyfiDgAGIu4AYCDiDgAGIu4AYCDiDgAG\nIu4AYCDiDgAGIu4AYCDiDgAGIu4AYCDiDgAGIu4AYCDiDgAGIu4AYCDiDgAGIu4AYCDiDgAGIu4A\nYCDiDgAGIu4AYCDiDgAGIu4AYCDiDgAGIu4AYCDiDgAGIu4AYCDiDgAGChp3j8ejzMxMpaenq7m5\necZxfX19SkhI0DvvvBPRCQIAZi9o3F944QW1tLSos7NTb775pkZGRqaMGR8f1w9/+EOVlpbKtu2o\nTBQAELqAcfd6vZKkwsJCpaamqri4WD09PVPGNTc3q6KiQsnJydGZJQBgVgLGva+vTxkZGZOvs7Ky\n1N3d7Tfm+vXrOnr0qHbv3i1JsiwrCtMEAMxGQrgnePHFF/Xaa6/JsizZth1kW+bVyc/cbrecTme4\nlwcAo7jdbrnd7rDPY9kBauz1euV0OjUwMCBJqq2tVWlpqcrKyibHpKWlTQZ9ZGREixcv1q9//Ws9\n88wz/heyLEm3L2WxNw8AIbj9xnm2Ar5zT0xMlHTrjplvfOMb6ujo0N69e/3GXLlyZfLz6upqlZeX\nTwk7ACC2gm7LNDU1yeVyy
efzqa6uTg6HQy0tLZIkl8sV9QkCAGYv4LZMRC/EtgwAzNpct2X4DVUA\nMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBx\nBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwADEXcAMBBxBwAD\nEXcAMBBxBwADEXcAMFDQuHs8HmVmZio9PV3Nzc1Tjre1tWn16tVavXq1tm/frsuXL0dlogCA0Fm2\nbduBBuTk5Gjfvn1KTU1VSUmJTp06JYfDMXn8zJkzysrKUmJiog4dOqTOzk4dPnx46oUsS9LtS1kK\nclkAgG61cy69DPjO3ev1SpIKCwuVmpqq4uJi9fT0+I3Jz89XYmKiJKmsrExdXV2zngQAILICxr2v\nr08ZGRmTr7OystTd3T3j+IMHD6q8vDxyswMAzElCpE7U2dmp1tZWnT59OsCoVyc/c7vdcjqdkbo8\nABjB7XbL7XaHfZ6Ae+5er1dOp1MDAwOSpNraWpWWlqqsrMxv3ODgoLZs2aLjx49rxYoV01+IPXcA\nmLWo7Lnf3kv3eDwaHh5WR0eH8vLy/MZcu3ZNW7duVVtb24xhBwDEVtBtmaamJrlcLvl8PtXV1cnh\ncKilpUWS5HK51NjYqNHRUdXU1EiSFi5cqN7e3ujOGgAQUNBbISN2IbZlAGDWorItAwC4OxF3ADAQ\ncQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcA\nAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3\nADBQ0Lh7PB5lZmYqPT1dzc3N04556aWXlJaWpscee0wXL16M+CRN43a74z2FeYO1uIO1uIO1CF/Q\nuL/wwgtqaWlRZ2en3nzzTY2MjPgd7+3t1cmTJ9Xf36/6+nrV19dHbbKm4Bv3DtbiDtbiDtYifAHj\n7vV6JUmFhYVKTU1VcXGxenp6/Mb09PSooqJCSUlJqqqq0oULF+Y8maVLk2RZlizL0tKlSXM+z3y2\ndGmSGhoajP47Aoi/gHHv6+tTRkbG5OusrCx1d3f7jent7VVWVtbk6+TkZH3yySdBLpswGXHLum/y\n85s3/ybJlmTr5s2b046ZKYih/GCYDz88bv0d9+rW3/FvcZnDfFiH2/OI9w+6+bIWmB9M+n5ICPcE\ntm3Ltm2/r1mWNcPo6b7um9WYmzf/FuD8kR0TPQ3//xForWIjvuswv+YxH+bQ0NAQ1+vPJ/Fei/nw\n/RCOgHFfu3at9uzZM/n6448/Vmlpqd+YvLw8nT9/XiUlJZKkzz//XGlpaVPO9b8/AAAA0RNwWyYx\nMVHSrTtmhoeH1dHRoby8PL8xeXl5am9v140bN3TkyBFlZmZGb7YAgJAE3ZZpamqSy+WSz+dTXV2d\nHA6HWlpaJEkul0vr1q3Txo0blZubq6SkJLW2tkZ90gCAIOwI6urqsjMyMuwVK1bYv/zlL6cd86Mf\n/ch+5JFH7DVr1tgXLlyI5OXnlWBr0draamdnZ9vZ2dl2VVWVfenSpTjMMjZC+b6wbdvu7e21v/KV\nr9jt7e0xnF1shbIWvb29dm5urp2RkWFv2rQpthOMoWBrMTY2Zu/YscN+9NFH7cLCQvsPf/hDHGYZ\nfdXV1fbXvvY1e+XKlTOOmUs3Ixr3Rx991O7q6rKHh4ftb33rW/bnn3/ud7ynp8fesGGDfePGDfvI\nkSN2WVlZJC8/rwRbi9OnT9t///vfbdu27d/97nf2888/H49pxkSwtbBt2/7yyy/tb3/723ZZWZn9\n9ttvx2GWsRFsLSYmJuyVK1faHR0dtm3b066VKYKtxYEDB+zdu3fbtm3bw8PDdlpamj0xMRG
PqUaV\nx+Oxz549O2Pc59rNiD1+INb3xM9noaxFfn7+5P/TKCsrU1dXV8znGQuhrIUkNTc3q6KiQsnJybGe\nYsyEshb9/f3Kzs5WUVGRJMnhcMR8nrEQylokJibq5s2b8vl8Gh0d1eLFi+/qu1dmUlBQoGXLls14\nfK7djFjco3dP/N0nlLX4bwcPHlR5eXksphZzoazF9evXdfToUe3evVtS/G8PjZZQ1uLEiROyLEsF\nBQUqLy/XiRMnYj3NmAhlLaqqqjQ+Pi6Hw6GNGzeqra0t1tOcF+bazbDvc58Ne1b3xN8bOjs71dra\nqtOnT8d7KnHz4osv6rXXXpNlWdN+j9xLvvjiC3344Yfq7OzU2NiYnnrqKX300Ue6//774z21mHvj\njTeUkJCgTz/9VOfOnVNZWZmuXr2qBQvurecdzrWbEVultWvX+j007OOPP9b69ev9xty+J/62me6J\nv9uFshaSNDg4qJqaGh07dkwPPPBALKcYM6GsxQcffKDKyko98sgjam9v1/e+9z0dO3Ys1lONulDW\nIj8/X08//bS+/vWvKy0tTbm5ufJ4PLGeatSFshYej0fPPfecFi9erLy8PD300EO6fPlyrKcad3Pt\nZsTizj3xd4SyFteuXdPWrVvV1tamFStWxGOaMRHKWly5ckVDQ0MaGhpSRUWFDhw4oGeeeSYe042q\nUNZi/fr16urq0tjYmEZHRzUwMKANGzbEY7pRFcpaPPnkk3r33Xc1MTGhK1euaHR01G8r514x125G\ndFuGe+LvCLYWjY2NGh0dVU1NjSRp4cKF6u3tjeeUoybYWtxLgq3Fgw8+qOrqauXm5io5OVmNjY1a\nsmRJnGcdHcHWorKyUufPn59ci3379sV5xtFRVVWlrq4ujYyMKCUlRQ0NDfL5bj1yJZxuWva9vMEJ\nAIa6t/7PBADcI4g7ABiIuANAALt27dLy5cu1atWqiJyvtLRUy5Ytm/K7Lc8995wyMjK0bt06vfzy\ny2Ffh7gDQADV1dU6fvx4xM73gx/8QIcPH57y9eeff14XL17UqVOn1N/fr/fffz+s6xB3AAhguscD\nXL9+XXv27FF+fr527typoaGhkM/3xBNPTHsH1NNPPy1Juu+++1RUVBT2I0mIOwDM0iuvvKLKykqd\nOXNG27Zt0+uvvx6xc//73//W73//e33nO98J6zwxffwAANztfD6f3nvvPZ09e3bKsXfeeUd79+6d\n8vWHH35Yf/rTn0I6/+7du1VUVKR169aFNU/iDgCzMDExoQULFqi7u1uLFi3yO7ZlyxZt2bIl6Dlm\nejZMQ0ODvF6vfvvb34Y9T7ZlAGAWFi1apM2bN+vAgQMaHx+XbdsaHByc1Tmm+93R3/zmN+ro6IjY\n0y+JOwAEUFVVpccff1yXLl1SSkqK3nrrLTU0NOizzz5Tbm6uVq5cOasH3RUUFOjZZ5/V+++/r5SU\nFHV0dEi6tR3zl7/8Rfn5+crJydGPf/zjsObN4wcAwEC8cwcAAxF3ADAQcQcAAxF3ADAQcQcAAxF3\nADDQ/wG9faT0I2S5cwAAAABJRU5ErkJggg==\n", "text": [ "" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "fp = open('sample-blast.txt')\n", "\n", "n = 0\n", "for record in blastparser.parse_fp(fp):\n", " n += 1\n", " if n > 5: break\n", " for hit in record.hits:\n", " for match in hit.matches:\n", " print record.query_name, hit.subject_name, match.expect\n", " break\n", " break\n" ], "language": 
"python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "occulta.ctb3.1 JGI_294397 1e-300\n", "occulta.ctb3.2 ci0100148600 1e-300\n", "occulta.ctb3.3 ENSCINT00000024997 1e-125\n", "occulta.ctb3.4 KH.S404.1.v4.A.ND1-1 4e-57\n", "occulta.ctb3.5 KH.S404.1.v4.A.ND1-1 8e-57\n" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "# Write at most one line per record: query name, subject name and e-value of the first\n", "# match of the first hit. print >> separates the fields with spaces, so despite the file\n", "# name this is not comma-separated output.\n", "with open('sample-blast.txt') as fp:\n", "    # 'with' flushes and closes the output file when the block ends.\n", "    with open('/tmp/blastout.csv', 'w') as outfp:\n", "        for record in blastparser.parse_fp(fp):\n", "            for hit in record.hits:\n", "                for match in hit.matches:\n", "                    print >>outfp, record.query_name, hit.subject_name, match.expect\n", "                    break  # first match only\n", "                break  # first hit only\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "import csv\n", "\n", "# Write the same three fields per record through csv.writer, which emits comma-separated rows.\n", "with open('sample-blast.txt') as fp:\n", "    # 'wb': the Python 2 csv module expects the output file opened in binary mode.\n", "    with open('/tmp/blastout.csv', 'wb') as outfp:\n", "        w = csv.writer(outfp)\n", "        for record in blastparser.parse_fp(fp):\n", "            for hit in record.hits:\n", "                for match in hit.matches:\n", "                    w.writerow([record.query_name, hit.subject_name, match.expect])\n", "                    break  # first match only\n", "                break  # first hit only" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }