{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1, -2, -2, -2, -2],\n", " [-2, 1, -2, -2, -2],\n", " [-2, -2, 1, -2, -2],\n", " [-2, -2, -2, 1, -2],\n", " [-2, -2, -2, -2, -2]])" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#simple scoring method in which matches count +1 and both the mismatch and indel penalties are 1\n", "import numpy as np\n", "alphabet = ['A', 'C', 'G', 'T', '-']\n", "score = np.array([[1, -2, -2, -2, -2],[-2, 1, -2, -2, -2],[-2, -2, 1, -2, -2],[-2, -2, -2, 1, -2],[-2, -2, -2, -2, -2]], \n", " dtype = 'int')\n", "score" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def overlapAlignment(x, y, score):\n", " \"\"\"Calculate the score of an optimal overlap alignment of suffix of x and prefix of y. \n", " The maximum score for overlap alignment\"\"\"\n", " \n", " D = np.zeros((len(x)+1, len(y)+1), dtype=int)\n", " for j in range(1, len(y) + 1):\n", " D[0,j] = D[0, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]\n", " for i in xrange(1, len(x)+1):\n", " for j in xrange(1, len(y)+1):\n", " horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]\n", " vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]\n", " diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]\n", " D[i,j] = max(horz, vert, diag)\n", " max_score = D[-1,].max()\n", " return D, max_score" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0 -2 -4 -6 -8 -10 -12]\n", " [ 0 1 -1 -3 -5 -7 -9]\n", " [ 0 -1 2 0 -2 -4 -6]\n", " [ 0 1 0 0 -2 -4 -6]\n", " [ 0 -1 -1 -2 1 -1 -3]\n", " [ 0 -2 0 -2 -1 -1 0]\n", " [ 0 -2 -2 1 -1 -3 -2]]\n", "1\n" ] } ], "source": [ "D, max_score = overlapAlignment('AGACGT', 'AGTCCG', score)\n", "print (D)\n", "print (max_score)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def traceback(D, x, y, score):\n", " ''' Find and return the alignment of a substring of t to p by \n", " trace back from given cell in global-alignment matrix D . \n", " A highest-scoring fitting alignment between v and w. \n", " If multiple alignments tie for best, we report the leftmost. '''\n", " # get i, j for maximal cell\n", " max_score = D[-1,].max()\n", " for index in reversed(range(len(y))):\n", " if D[-1,index] == max_score:\n", " max_index = index\n", " i, j = len(x), max_index\n", " alp, alt = [], []\n", " while j > 0:\n", " diag, horz, vert = -float('inf'), -float('inf'), -float('inf')\n", " if i > 0 and j > 0:\n", " diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]\n", " if i > 0:\n", " vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]\n", " if j > 0:\n", " horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]\n", " if diag >= vert and diag >= horz:\n", " alp.append(x[i-1]); alt.append(y[j-1])\n", " i -= 1; j -= 1 \n", " elif vert >= horz:\n", " alp.append(x[i-1]); alt.append('-')\n", " i -= 1\n", " else:\n", " alp.append('-'); alt.append(y[j-1])\n", " j -= 1\n", " alignment = map(lambda x: ''.join(x), [alp[::-1], alt[::-1]])\n", " return alignment" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0 -2 -4 -6 -8 -10 -12]\n", " [ 0 1 -1 -3 -5 -7 -9]\n", " [ 0 -1 2 0 -2 -4 -6]\n", " [ 0 1 0 0 -2 -4 -6]\n", " [ 0 -1 -1 -2 1 -1 -3]\n", " [ 0 -2 0 -2 -1 -1 0]\n", " [ 0 -2 -2 1 -1 -3 -2]]\n", "1\n", "ACGT\n", "A-GT\n" ] } ], "source": [ "D, max_score = overlapAlignment('AGACGT', 'AGTCCG', score)\n", "print (D)\n", "print (max_score)\n", "algn = traceback(D, 'AGACGT', 'AGTCCG', score)\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "58\n", "AAGTAGGCAAATTCA-G-TTCTTGTCACGAGCT-GATTG-CCGGAAG-AAAATGAACTAGATCGGGCGAAGTC-CTGTCCGAGGGATACGGAG-AAT-GCTGGAACAAATCAG---AGCAGCTGGAG-TGGGACAGCCGCCGC-CTC-CTTTCCAAGCCAATGC-CGGCAGGCGTACGCCGGCT-TTCGCCGACCCAGAGGTCGGTTCGCTG-AGCAGAGCAG-TTTGTTACG-ATTTGACAGGA-TCCACGAAGATAC-GTACGTGACAACT-GTGTGCGATCGGCCC\n", "AAGTTGGCAAATTTATGCTTCTTGTCACGAGCTCTATTGACC-GAAGTACAATGAATTAGACCGGGTTAAGTCTCCGTCCGA-GGTTAAGGAGAAATACCTGGAATAAATCAGCCTA-CTGCTGGCGTTCATAC-G-CG-CGCTC-CGCTTT-TAA-CCAATGCTCCGTAGG-ATACGCCAG-TATTCGCCG-TCC-G-GGCCGG-TCGCAGAAGCAGAGCGGATTTGATA-GAATTTGACAGGACTGCA-AAAGTTACGGTAC-TGACAACTAATGT-ATAT-TGCCC\n" ] } ], "source": [ "x, y = [i.strip() for i in open('input/dataset_248_7.txt', 'r')]\n", "D, max_score = overlapAlignment(x, y, score)\n", "print (max_score)\n", "algn = traceback(D, x, y, score)\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1, 0, 0, 0, -2],\n", " [ 0, 1, 0, 0, -2],\n", " [ 0, 0, 1, 0, -2],\n", " [ 0, 0, 0, 1, -2],\n", " [-2, -2, -2, -2, -2]])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alphabet = ['A', 'C', 'G', 'T', '-']\n", "score = np.array([[1, 0, 0, 0, -2],[0, 1, 0, 0, -2],[0, 0, 1, 0, -2],[0, 0, 0, 1, -2],[-2, -2, -2, -2, -2]], \n", " dtype = 'int')\n", "score" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10\n", "AGTACATCAGAGGA-GTT-ACATA-CTAACG\n", "AGTTCA-CAGGCTA-CGT-ACAGATATTACG\n" ] } ], "source": [ "x, y = ['AGTACATCAGAGGAGTT-ACATACTAACG', 'AGTTCACAGGCTA-CGTACAGATATTACGACAGGCAGA']\n", "D, max_score = overlapAlignment(x, y, score)\n", "print (max_score)\n", "algn = traceback(D, x, y, score)\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.5" } }, "nbformat": 4, "nbformat_minor": 0 }