{ "cells": [ { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def exampleCost(xc, yc, zc):\n", " \"\"\" Cost function assigning the score of a column of the alignment matrix is equal to 1 \n", " if all of the column's symbols are identical, and 0 if even one symbol disagrees \"\"\"\n", " if xc == yc == zc: return 1\n", " #if xc == '-' or yc == '-' or zc == '-': return 0\n", " else: return 0" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "def multipleSequenceAlignment(v, w, u, s):\n", " arr3d = np.zeros((len(v)+1, len(w)+1, len(u)+1), dtype = 'int')\n", " for i in xrange(1, len(v) + 1):\n", " for j in xrange(1, len(w) + 1):\n", " for k in xrange(1, len(u) + 1):\n", " arr3d[i,j,k] = max(arr3d[i-1, j, k] + s(v[i-1], '-', '-'),\n", " arr3d[i, j-1, k] + s('-', w[j-1], '-'),\n", " arr3d[i, j, k-1] + s('-', '-', u[k-1]),\n", " arr3d[i-1, j-1, k] + s(v[i-1], w[j-1], '-'),\n", " arr3d[i-1, j, k-1] + s(v[i-1], '-', u[k-1]),\n", " arr3d[i, j-1, k-1] + s('-', w[j-1], u[k-1]),\n", " arr3d[i-1, j-1, k-1] + s(v[i-1], w[j-1], u[k-1]))\n", " return arr3d" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[[0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]]\n", "\n", " [[0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 0 0 0 0 0 0 0 0]\n", " [0 1 1 1 1 1 1 1 1]]\n", "\n", " [[0 0 0 0 0 0 0 0 0]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 1 1 1 1 1 1 1 1]]\n", "\n", " [[0 0 0 0 0 0 0 0 0]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 1 1 1 1 2 2 2 2]]\n", "\n", " [[0 0 0 0 0 0 0 0 0]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 1 1 1 1 2 2 2 2]]\n", "\n", " [[0 0 0 0 0 0 0 0 0]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 1 1 1 1 2 2 2 2]]\n", "\n", " [[0 0 0 0 0 0 0 0 0]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 1 1 1 1 2 2 2 2]]\n", "\n", " [[0 0 0 0 0 0 0 0 0]\n", " [0 0 1 1 1 1 1 1 1]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 0 1 1 1 1 2 2 2]\n", " [0 0 1 2 2 2 2 2 3]\n", " [0 1 1 2 2 2 2 2 3]]]\n" ] } ], "source": [ "v, w, u = 'ATATCCG', 'TCCGA', 'ATGTACTG'\n", "arr3d = multipleSequenceAlignment(v, w, u, exampleCost)\n", "print arr3d" ] }, { "cell_type": "code", "execution_count": 130, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def traceback_MSA(arr3d, v, w, u, s):\n", " i,j,k = len(v), len(w), len(u)\n", " alv, alw, alu = [], [], []\n", " while (i,j,k) != (0,0,0):\n", " max_score = max(arr3d[i-1, j, k] + s(v[i-1], '-', '-'),\n", " arr3d[i, j-1, k] + s('-', w[j-1], '-'),\n", " arr3d[i, j, k-1] + s('-', '-', u[k-1]),\n", " arr3d[i-1, j-1, k] + s(v[i-1], w[j-1], '-'),\n", " arr3d[i-1, j, k-1] + s(v[i-1], '-', u[k-1]),\n", " arr3d[i, j-1, k-1] + s('-', w[j-1], u[k-1]),\n", " arr3d[i-1, j-1, k-1] + s(v[i-1], w[j-1], u[k-1]))\n", " print max_score\n", " if arr3d[i, j-1, k-1] + s('-', w[j-1], u[k-1]) == max_score:\n", " alv.append('-'); alw.append(w[j-1]); alu.append(u[k-1])\n", " j -= 1; k -=1 \n", " elif arr3d[i-1, j-1, k-1] + s(v[i-1], w[j-1], u[k-1]) == max_score:\n", " alv.append(v[i-1]); alw.append(w[j-1]); alu.append(u[k-1])\n", " i -= 1; j -= 1; k -= 1\n", " elif arr3d[i, j-1, k] + s('-', w[j-1], '-') == max_score:\n", " alv.append('-'); alw.append(w[j-1]); alu.append('-')\n", " j -= 1\n", " elif arr3d[i-1, j-1, k] + s(v[i-1], w[j-1], '-') == max_score:\n", " alv.append(v[i-1]); alw.append(w[j-1]); alu.append('-')\n", " i -= 1; j -= 1\n", " elif arr3d[i-1, j, k] + s(v[i-1], '-', '-') == max_score:\n", " alv.append(v[i-1]); alw.append('-'); alu.append('-')\n", " i -= 1\n", " elif arr3d[i, j, k-1] + s('-', '-', u[k-1]) == max_score:\n", " alv.append('-'); alw.append('-'); alu.append(u[k-1])\n", " k -= 1\n", " elif arr3d[i-1, j, k-1] + s(v[i-1], '-', u[k-1]) == max_score:\n", " alv.append(v[i-1]); alw.append('-'); alu.append(u[k-1])\n", " i-=1; k-=1 \n", " print alv, alw, alu\n", " alignment = map(lambda x: ''.join(x), [alv[::-1], alw[::-1], alu[::-1]])\n", " return alignment" ] }, { "cell_type": "code", "execution_count": 131, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3\n", "['-'] ['A'] ['-']\n", "3\n", "['-', 'G'] ['A', 'G'] ['-', 'G']\n", "2\n", "['-', 'G', '-'] ['A', 'G', 'C'] ['-', 'G', 'T']\n", "2\n", "['-', 'G', '-', 'C'] ['A', 'G', 'C', 'C'] ['-', 'G', 'T', 'C']\n", "1\n", "['-', 'G', '-', 'C', 'C'] ['A', 'G', 'C', 'C', '-'] ['-', 'G', 'T', 'C', '-']\n", "1\n", "['-', 'G', '-', 'C', 'C', 'T'] ['A', 'G', 'C', 'C', '-', '-'] ['-', 'G', 'T', 'C', '-', '-']\n", "1\n", "['-', 'G', '-', 'C', 'C', 'T', 'A'] ['A', 'G', 'C', 'C', '-', '-', '-'] ['-', 'G', 'T', 'C', '-', '-', '-']\n", "1\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A']\n", "1\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T']\n", "1\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G']\n", "1\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T']\n", "0\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A']\n", "0\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G', 'C'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A', 'G']\n", "0\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G', 'C', 'C'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A', 'G', 'T']\n", "0\n", "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G', 'C', 'C', 'T'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A', 'G', 'T', 'C']\n" ] }, { "ename": "IndexError", "evalue": "string index out of range", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'ATATCCG'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'TCCGA'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'ATGTACTG'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0marr3d\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmultipleSequenceAlignment\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexampleCost\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0malgn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraceback_MSA\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr3d\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexampleCost\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0malgn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mtraceback_MSA\u001b[0;34m(arr3d, v, w, u, s)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m max_score = max(arr3d[i-1, j, k] + s(v[i-1], '-', '-'),\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0marr3d\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'-'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0marr3d\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0marr3d\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'-'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: string index out of range" ] } ], "source": [ "v, w, u = 'ATATCCG', 'TCCGA', 'ATGTACTG'\n", "arr3d = multipleSequenceAlignment(v, w, u, exampleCost)\n", "algn = traceback_MSA(arr3d, v, w, u, exampleCost)\n", "print algn" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "8" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(u)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'a' == 'a' == 'a'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.5" } }, "nbformat": 4, "nbformat_minor": 0 }