{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Local alignment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We define a simple cost function `exampleCost` and a function `smithWaterman` that, given strings x and y and a cost function, fills in and returns the corresponding local alignment matrix." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy\n", "\n", "def exampleCost(xc, yc):\n", " ''' Cost function: 2 to match, -6 to gap, -4 to mismatch '''\n", " if xc == yc: return 2 # match\n", " if xc == '-' or yc == '-': return -6 # gap\n", " return -4\n", "\n", "def smithWaterman(x, y, s):\n", " ''' Calculate local alignment values of sequences x and y using\n", " dynamic programming. Return maximal local alignment value. '''\n", " V = numpy.zeros((len(x)+1, len(y)+1), dtype=int)\n", " for i in range(1, len(x)+1):\n", " for j in range(1, len(y)+1):\n", " V[i, j] = max(V[i-1, j-1] + s(x[i-1], y[j-1]), # diagonal\n", " V[i-1, j ] + s(x[i-1], '-'), # vertical\n", " V[i , j-1] + s('-', y[j-1]), # horizontal\n", " 0) # empty\n", " argmax = numpy.where(V == V.max())\n", " return V, int(V[argmax])" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", " [ 0 0 0 0 0 0 2 0 2 2 0 2 0 0 0]\n", " [ 0 0 0 0 0 0 2 0 2 4 0 2 0 0 0]\n", " [ 0 2 0 2 0 2 0 0 0 0 0 0 4 2 2]\n", " [ 0 0 4 0 4 0 0 0 0 0 0 0 0 0 0]\n", " [ 0 2 0 6 0 6 0 0 0 0 0 0 2 2 2]\n", " [ 0 0 0 0 2 0 8 2 2 2 0 2 0 0 0]\n", " [ 0 0 0 0 0 0 2 10 4 0 4 0 0 0 0]\n", " [ 0 2 0 2 0 2 0 4 6 0 0 0 2 2 2]\n", " [ 0 0 0 0 0 0 4 0 6 8 2 2 0 0 0]\n", " [ 0 0 0 0 0 0 2 0 2 8 4 4 0 0 0]\n", " [ 0 0 0 0 0 0 0 4 0 2 10 4 0 0 0]\n", " [ 0 0 0 0 0 0 2 0 6 2 4 12 6 0 0]\n", " [ 0 0 0 0 0 0 0 4 0 2 4 6 8 2 0]\n", " [ 0 2 0 2 0 2 0 0 0 0 0 0 8 10 4]\n", " [ 0 0 4 0 4 0 0 0 0 0 0 0 2 4 6]]\n", "Best score=12, in cell (12, 11)\n" ] } ], "source": [ "x, y = 'GGTATGCTGGCGCTA', 'TATATGCGGCGTTT'\n", "V, best = smithWaterman(x, y, exampleCost)\n", "print(V)\n", "print(\"Best score=%d, in cell %s\" % (best, numpy.unravel_index(numpy.argmax(V), V.shape)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next we define a `traceback` function that, given a local alignment matrix, strings x and y, and a scoring function, returns the optimal edit transcript and alignment for the optimal substrings of x and y." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def traceback(V, x, y, s):\n", " \"\"\" Trace back from given cell in local-alignment matrix V \"\"\"\n", " # get i, j for maximal cell\n", " i, j = numpy.unravel_index(numpy.argmax(V), V.shape)\n", " xscript, alx, aly, alm = [], [], [], []\n", " while (i > 0 or j > 0) and V[i, j] != 0:\n", " diag, vert, horz = 0, 0, 0\n", " if i > 0 and j > 0:\n", " diag = V[i-1, j-1] + s(x[i-1], y[j-1])\n", " if i > 0:\n", " vert = V[i-1, j] + s(x[i-1], '-')\n", " if j > 0:\n", " horz = V[i, j-1] + s('-', y[j-1])\n", " if diag >= vert and diag >= horz:\n", " match = x[i-1] == y[j-1]\n", " xscript.append('M' if match else 'R')\n", " alm.append('|' if match else ' ')\n", " alx.append(x[i-1]); aly.append(y[j-1])\n", " i -= 1; j -= 1\n", " elif vert >= horz:\n", " xscript.append('D')\n", " alx.append(x[i-1]); aly.append('-'); alm.append(' ')\n", " i -= 1\n", " else:\n", " xscript.append('I')\n", " aly.append(y[j-1]); alx.append('-'); alm.append(' ')\n", " j -= 1\n", " xscript = (''.join(xscript))[::-1]\n", " alignment = '\\n'.join(map(lambda x: ''.join(x), [alx[::-1], alm[::-1], aly[::-1]]))\n", " return xscript, alignment" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TATGCTGGCG\n", "||||| ||||\n", "TATGC-GGCG\n" ] } ], "source": [ "print(traceback(V, x, y, exampleCost)[1])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best score: 8\n", "_his_sour_\n", "||||| ||||\n", "_his_hour_\n" ] } ], "source": [ "def editDistanceLikeCost(xc, yc):\n", " return 1 if xc == yc else -1\n", "\n", "x = 'he_will_after_his_sour_fashion_tell_you'\n", "y = 'struts_and_frets_his_hour_upon_the_stage '\n", "V, best = smithWaterman(x, y, editDistanceLikeCost)\n", "print('Best score: %d' % best)\n", "print(traceback(V, x, y, exampleCost)[1])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }