{ "cells": [ { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1, -1, -1, -1, -1],\n", " [-1, 1, -1, -1, -1],\n", " [-1, -1, 1, -1, -1],\n", " [-1, -1, -1, 1, -1],\n", " [-1, -1, -1, -1, -1]])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#simple scoring method in which matches count +1 and both the mismatch and indel penalties are 1\n", "import numpy as np\n", "alphabet = ['A', 'C', 'G', 'T', '-']\n", "score = np.array([[1, -1, -1, -1, -1],[-1, 1, -1, -1, -1],[-1, -1, 1, -1, -1],[-1, -1, -1, 1, -1],[-1, -1, -1, -1, -1]], \n", " dtype = 'int')\n", "score" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "-1" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "score[alphabet.index('A'), alphabet.index('C')]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "-1" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "score[alphabet.index('T'), alphabet.index('-')]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Using an edit-distance-like dynamic programming formulation, we can\n", "# look for approximate occurrences of p in t.\n", "def fittingAlignment(p, t, score):\n", " \"\"\" Calculate global alignment value of sequences x and y using\n", " dynamic programming. Return global alignment value. \"\"\"\n", " \n", " D = np.zeros((len(p)+1, len(t)+1), dtype=int)\n", " # Note: First row gets zeros. First column initialized as usual.\n", " for i in range(1, len(p) + 1):\n", " D[i,0] = D[i-1,0] + score[alphabet.index(p[i-1]), alphabet.index('-')]\n", " for i in xrange(1, len(p)+1):\n", " for j in xrange(1, len(t)+1):\n", " horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(t[j-1])]\n", " vert = D[i-1, j] + score[alphabet.index(p[i-1]), alphabet.index('-')]\n", " diag = D[i-1, j-1] + score[alphabet.index(p[i-1]), alphabet.index(t[j-1])]\n", " D[i,j] = max(horz, vert, diag)\n", " max_score = D[-1,].max()\n", " return D, max_score" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", " [-1 -1 1 0 -1 -1 -1 1 1 0 -1 -1 -1 1 1 0]\n", " [-2 -2 0 2 1 0 -1 0 0 2 1 0 -1 0 0 2]\n", " [-3 -1 -1 1 3 2 1 0 -1 1 1 2 1 0 -1 1]\n", " [-4 -2 -2 0 2 2 1 0 -1 0 2 1 1 0 -1 0]\n", " [-5 -3 -1 -1 1 1 1 2 1 0 1 1 0 2 1 0]\n", " [-6 -4 -2 0 0 0 0 1 1 2 1 0 0 1 1 2]]\n", "2\n" ] } ], "source": [ "D, max_score = fittingAlignment('TAGATA','GTAGGCTTAAGGTTA', score)\n", "print (D)\n", "print (max_score)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", " [-1 -1 -1 -1 -1 1 1 0 -1 -1 -1 -1 -1 1 1 0 -1]\n", " [-2 -2 -2 -2 -2 0 0 2 1 0 -1 -2 0 0 0 0 -1]\n", " [-3 -3 -3 -3 -3 -1 1 1 1 0 -1 -2 -1 1 1 0 -1]\n", " [-4 -2 -3 -2 -2 -2 0 0 2 1 1 0 -1 0 0 2 1]\n", " [-5 -3 -1 -2 -3 -3 -1 -1 1 3 2 2 1 0 -1 1 1]\n", " [-6 -4 -2 0 -1 -2 -2 -2 0 2 4 3 2 1 0 0 2]\n", " [-7 -5 -3 -1 -1 0 -1 -2 -1 1 3 3 2 3 2 1 1]\n", " [-8 -6 -4 -2 -2 -1 -1 0 -1 0 2 2 4 3 2 1 0]]\n", "4\n" ] } ], "source": [ "D, max_score = fittingAlignment('GCGTATGC','TATTGGCTATACGGTT', score)\n", "print (D)\n", "print (max_score)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def traceback(D, p, t, score):\n", " ''' Find and return the alignment of a substring of t to p by \n", " trace back from given cell in global-alignment matrix D . \n", " A highest-scoring fitting alignment between v and w. \n", " If multiple alignments tie for best, we report the leftmost. '''\n", " # get i, j for maximal cell\n", " for index in range(len(t)):\n", " if D[-1,index] == D[-1,].max():\n", " max_index = index\n", " i, j = len(p), max_index\n", " alp, alt = [], []\n", " while i > 0:\n", " diag, horz, vert = -float('inf'), -float('inf'), -float('inf')\n", " if i > 0 and j > 0:\n", " diag = D[i-1, j-1] + score[alphabet.index(p[i-1]), alphabet.index(t[j-1])]\n", " if i > 0:\n", " vert = D[i-1, j] + score[alphabet.index(p[i-1]), alphabet.index('-')]\n", " if j > 0:\n", " horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(t[j-1])]\n", " if diag >= vert and diag >= horz:\n", " alp.append(p[i-1]); alt.append(t[j-1])\n", " i -= 1; j -= 1 \n", " elif vert >= horz:\n", " alp.append(p[i-1]); alt.append('-')\n", " i -= 1\n", " else:\n", " alp.append('-'); alt.append(t[j-1])\n", " j -= 1\n", " alignment = map(lambda x: ''.join(x), [alt[::-1], alp[::-1]])\n", " return alignment" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n", "GC-TATAC\n", "GCGTATGC\n" ] } ], "source": [ "D, max_score = fittingAlignment('GCGTATGC','TATTGGCTATACGGTT', score)\n", "print (max_score)\n", "algn = traceback(D,'GCGTATGC','TATTGGCTATACGGTT', score)\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "128\n", "GATGTC-T-C-TACAATATTAT-G-C-CCTA-C-AC-CTCCCAATCTAGGGTAGAGCCGT--TCGCACAGCGGCCAG-ACGTACAACAAAGTG-GTA-GTCGTAAT-CC---G-ACGGA-GCTAC-CACCGGGAATCTGA-ATCG-T-CATTCT-T--G-TCCGTACCCAAC-TCTATACC---CCA--AGC-GAAAAGATTATAATACGGTCA-AGCCTGAATTTTATATGTCGTGGGACCGCCGAAGTGC-CCTCTT-GTTCAGCAGCCTGCG---AAAGTACGCGGGCTCGTTCACCAGGATAATCAT-GGGA-TTTCGACCT-GGTGGGT-TGTTTCGCTAGCGA-ACTGGCGC-GGCACCATGTTCGGGAGCATCCG-GT-TATAGAACCTA-TCCGTAAGTG-CAGGGTGGAATACAGTGTCGGCGCTACCCGTACCCGCATCAATGCAGCG-AAGCTAAAGCCATGTA--AT-C-CCGCAAGTTAGTCACAAAATGTCTGGTAGTCTACACTTAATTATGGCT-TCTGTAAGGACTAACCACACT-ATCTTCGCCGGCA-ACCG-AGATAGGCGTACTAT-AAAGG-ATG-GT-TTGAG-CGATGCGGACGACTTCA-CCTCTGCCATCCAACACTCCATCTCA-AACGGGTCCAGCGAGAGAATCGAGATCTTGAGTAGCACAGCTCCGGATCCGTACCCACGACAGGATATGCGAGACCGAGCGCTACGCCACATGAGTCTAACCACGTCTAGCGCGAGATTACCAAGTCTTGGGCTGCGACGGTATAGGCAGGCGAATCAAAGTCAGGTACAGCCGTGCCATACCCACTCGCCC-G-GC-GTG-GCAACCAACCGCGC-AGCATCACGCACCTGTATC-TCCCT--ACATGGTCAT-C-ACTAGT-TGGG-TAAGCG-AATTCAGTTGCGTGGTTAGCGATTCGGTACTGCACAGGAGTACTTAATCA--AATGCAGAT-G-GACTTCCT-CGA-T\n", "GA-GTCGTGCTTTGAATATTATCGACGAATACCAACACT-TCAGT-TA---TAGA-CCTTAGTGGC-C-GC-GCTAGAACGTGC-AGAGA-TGCG-ACAAC-TAATCCCTAGGTATGGAGGC--CTCACC---AATCGGACA-CGCTCCA-TCTGTAGGCTGCGT--CCAGCTTCTA-GCCGGTTGAGTAGCGGTCGGGA-TA-AAT-C-CTCACAGCC----CCCGA-ACGT-GT-GGA--ACTG-ATTTCTCCT-TTAGTT-TGCA--CTGCGCTTGAA-TTTG-GAG-T-ATACA-CTGG-T-AT--TGGGGAGAGTAGA--TAGGTCCATCCATTT-GC--G-GATCCT--CGCTGG----ATGTTC-CG-GCCTTCGAGTCGCTATAA-C-AGTCC--AAATGTAAGGGCGG-A-A-AGT-ACGGAGC-ATGGGT--CTG-TTCGAGGCA-CGTAAGCT--CGCCGTG-ACCATGCACC-C-A--T-G-AACAAAATG---GCTAG-C--CTCTTAGTTTTGGCTAAC---AA-G--T--TCTCTGTGGTCTTCG--TAAATACAGTAGCGGGGCG-CCCATGTAAGGCA-GAATATT-AGAAGAT--TGACG-C--GAGCC-C-GCC--CC-ACGCTCGGTCTTATAA---AT-CAGC-----AA-CG-GA-C---AAAAGGATACCTAC---TCC-TACCGATGA-A--A-A---GAG--TG-G-GC-A--CC-CGT-A-TCT-CCCAC-TC--G-GC-----TAACAA-TCTAGTGCT-C-A-GG----CGCAGCCG---C-ATGCCA--T--AGCCGTGCC---CCCGGTC-CCCAGTTCACTGTTTAACCAACGGC-CTA-C-TTA-GTA--TGTGGCGTGACTGAACAAAGT--TGCTA-TGGTCTGGGATAACCGCAATTTAGTTGCAAAG--AACCTTTATGTA--G-AAAGGA-T-TTTAAACATGAAT-TAGATCGTGCCCACCTGCCAGT\n" ] } ], "source": [ "t,p = [i.strip() for i in open('input/rosalind_ba5h.txt', 'r')]\n", "D, max_score = fittingAlignment(p, t, score)\n", "print (max_score)\n", "algn = traceback(D, p, t, score)\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1, 0, 0, 0, -2],\n", " [ 0, 1, 0, 0, -2],\n", " [ 0, 0, 1, 0, -2],\n", " [ 0, 0, 0, 1, -2],\n", " [-2, -2, -2, -2, -2]])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "ename": "NameError", "evalue": "name 'fittingAlignment' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mD\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_score\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfittingAlignment\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'GATACACT'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'ACGACCACAGATACCGCTATTCACTATATCGTT'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscore\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmax_score\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0malgn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'GATACACT'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'ACGACCACAGATACCGCTATTCACTATATCGTT'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscore\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m'\\n'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malgn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'fittingAlignment' is not defined" ] } ], "source": [ "D, max_score = fittingAlignment('GATACACT','ACGACCACAGATACCGCTATTCACTATATCGTT', score)\n", "print (max_score)\n", "algn = traceback(D,'GATACACT','ACGACCACAGATACCGCTATTCACTATATCGTT', score)\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.5" } }, "nbformat": 4, "nbformat_minor": 0 }