{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def exampleCost(xc, yc, zc):\n",
    "    \"\"\" Cost function assigning the score of a column of the alignment matrix is equal to 1 \n",
    "    if all of the column's symbols are identical, and 0 if even one symbol disagrees \"\"\"\n",
    "    if xc == yc == zc: return 1\n",
    "    #if xc == '-' or yc == '-' or zc == '-': return 0\n",
    "    else: return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "def multipleSequenceAlignment(v, w, u, s):\n",
    "    arr3d = np.zeros((len(v)+1, len(w)+1, len(u)+1), dtype = 'int')\n",
    "    for i in xrange(1, len(v) + 1):\n",
    "        for j in xrange(1, len(w) + 1):\n",
    "            for k in xrange(1, len(u) + 1):\n",
    "                arr3d[i,j,k] = max(arr3d[i-1, j, k] + s(v[i-1], '-', '-'),\n",
    "                                  arr3d[i, j-1, k] + s('-', w[j-1], '-'),\n",
    "                                  arr3d[i, j, k-1] + s('-', '-', u[k-1]),\n",
    "                                  arr3d[i-1, j-1, k] + s(v[i-1], w[j-1], '-'),\n",
    "                                  arr3d[i-1, j, k-1] + s(v[i-1], '-', u[k-1]),\n",
    "                                  arr3d[i, j-1, k-1] + s('-', w[j-1], u[k-1]),\n",
    "                                  arr3d[i-1, j-1, k-1] + s(v[i-1], w[j-1], u[k-1]))\n",
    "    return arr3d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]]\n",
      "\n",
      " [[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 0 0 0 0 0 0 0]\n",
      "  [0 1 1 1 1 1 1 1 1]]\n",
      "\n",
      " [[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 1 1 1 1 1 1 1 1]]\n",
      "\n",
      " [[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 1 1 1 1 2 2 2 2]]\n",
      "\n",
      " [[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 1 1 1 1 2 2 2 2]]\n",
      "\n",
      " [[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 1 1 1 1 2 2 2 2]]\n",
      "\n",
      " [[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 1 1 1 1 2 2 2 2]]\n",
      "\n",
      " [[0 0 0 0 0 0 0 0 0]\n",
      "  [0 0 1 1 1 1 1 1 1]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 0 1 1 1 1 2 2 2]\n",
      "  [0 0 1 2 2 2 2 2 3]\n",
      "  [0 1 1 2 2 2 2 2 3]]]\n"
     ]
    }
   ],
   "source": [
    "v, w, u = 'ATATCCG', 'TCCGA', 'ATGTACTG'\n",
    "arr3d = multipleSequenceAlignment(v, w, u, exampleCost)\n",
    "print arr3d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def traceback_MSA(arr3d, v, w, u, s):\n",
    "    i,j,k = len(v), len(w), len(u)\n",
    "    alv, alw, alu = [], [], []\n",
    "    while (i,j,k) != (0,0,0):\n",
    "        max_score = max(arr3d[i-1, j, k] + s(v[i-1], '-', '-'),\n",
    "                        arr3d[i, j-1, k] + s('-', w[j-1], '-'),\n",
    "                        arr3d[i, j, k-1] + s('-', '-', u[k-1]),\n",
    "                        arr3d[i-1, j-1, k] + s(v[i-1], w[j-1], '-'),\n",
    "                        arr3d[i-1, j, k-1] + s(v[i-1], '-', u[k-1]),\n",
    "                        arr3d[i, j-1, k-1] + s('-', w[j-1], u[k-1]),\n",
    "                        arr3d[i-1, j-1, k-1] + s(v[i-1], w[j-1], u[k-1]))\n",
    "        print max_score\n",
    "        if arr3d[i, j-1, k-1] + s('-', w[j-1], u[k-1]) == max_score:\n",
    "            alv.append('-'); alw.append(w[j-1]); alu.append(u[k-1])\n",
    "            j -= 1; k -=1 \n",
    "        elif arr3d[i-1, j-1, k-1] + s(v[i-1], w[j-1], u[k-1]) == max_score:\n",
    "            alv.append(v[i-1]); alw.append(w[j-1]); alu.append(u[k-1])\n",
    "            i -= 1; j -= 1; k -= 1\n",
    "        elif arr3d[i, j-1, k] + s('-', w[j-1], '-') == max_score:\n",
    "            alv.append('-'); alw.append(w[j-1]); alu.append('-')\n",
    "            j -= 1\n",
    "        elif arr3d[i-1, j-1, k] + s(v[i-1], w[j-1], '-') == max_score:\n",
    "            alv.append(v[i-1]); alw.append(w[j-1]); alu.append('-')\n",
    "            i -= 1; j -= 1\n",
    "        elif arr3d[i-1, j, k] + s(v[i-1], '-', '-') == max_score:\n",
    "            alv.append(v[i-1]); alw.append('-'); alu.append('-')\n",
    "            i -= 1\n",
    "        elif arr3d[i, j, k-1] + s('-', '-', u[k-1]) == max_score:\n",
    "            alv.append('-'); alw.append('-'); alu.append(u[k-1])\n",
    "            k -= 1\n",
    "        elif arr3d[i-1, j, k-1] + s(v[i-1], '-', u[k-1]) == max_score:\n",
    "            alv.append(v[i-1]); alw.append('-'); alu.append(u[k-1])\n",
    "            i-=1; k-=1 \n",
    "        print alv, alw, alu\n",
    "    alignment = map(lambda x: ''.join(x), [alv[::-1], alw[::-1], alu[::-1]])\n",
    "    return alignment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3\n",
      "['-'] ['A'] ['-']\n",
      "3\n",
      "['-', 'G'] ['A', 'G'] ['-', 'G']\n",
      "2\n",
      "['-', 'G', '-'] ['A', 'G', 'C'] ['-', 'G', 'T']\n",
      "2\n",
      "['-', 'G', '-', 'C'] ['A', 'G', 'C', 'C'] ['-', 'G', 'T', 'C']\n",
      "1\n",
      "['-', 'G', '-', 'C', 'C'] ['A', 'G', 'C', 'C', '-'] ['-', 'G', 'T', 'C', '-']\n",
      "1\n",
      "['-', 'G', '-', 'C', 'C', 'T'] ['A', 'G', 'C', 'C', '-', '-'] ['-', 'G', 'T', 'C', '-', '-']\n",
      "1\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A'] ['A', 'G', 'C', 'C', '-', '-', '-'] ['-', 'G', 'T', 'C', '-', '-', '-']\n",
      "1\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A']\n",
      "1\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T']\n",
      "1\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G']\n",
      "1\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T']\n",
      "0\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A']\n",
      "0\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G', 'C'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A', 'G']\n",
      "0\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G', 'C', 'C'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A', 'G', 'T']\n",
      "0\n",
      "['-', 'G', '-', 'C', 'C', 'T', 'A', '-', 'T', '-', '-', '-', '-', '-', '-'] ['A', 'G', 'C', 'C', '-', '-', '-', '-', 'T', 'A', '-', 'G', 'C', 'C', 'T'] ['-', 'G', 'T', 'C', '-', '-', '-', 'A', 'T', 'G', 'T', 'A', 'G', 'T', 'C']\n"
     ]
    },
    {
     "ename": "IndexError",
     "evalue": "string index out of range",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-131-d26f35c10e3d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'ATATCCG'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'TCCGA'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'ATGTACTG'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0marr3d\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmultipleSequenceAlignment\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexampleCost\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0malgn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraceback_MSA\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marr3d\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexampleCost\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0malgn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-130-e00898d95f52>\u001b[0m in \u001b[0;36mtraceback_MSA\u001b[0;34m(arr3d, v, w, u, s)\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[0;32mwhile\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         max_score = max(arr3d[i-1, j, k] + s(v[i-1], '-', '-'),\n\u001b[0;32m----> 6\u001b[0;31m                         \u001b[0marr3d\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'-'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m                         \u001b[0marr3d\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'-'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m                         \u001b[0marr3d\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'-'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mIndexError\u001b[0m: string index out of range"
     ]
    }
   ],
   "source": [
    "v, w, u = 'ATATCCG', 'TCCGA', 'ATGTACTG'\n",
    "arr3d = multipleSequenceAlignment(v, w, u, exampleCost)\n",
    "algn = traceback_MSA(arr3d, v, w, u, exampleCost)\n",
    "print algn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(u)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'a' == 'a' == 'a'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}