{ "cells": [ { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-']\n", "[[ 2 -2 0 0 -3 1 -1 -1 -1 -2 -1 0 1 0 -2 1 1 0 -6 -3 -5]\n", " [-2 12 -5 -5 -4 -3 -3 -2 -5 -6 -5 -4 -3 -5 -4 0 -2 -2 -8 0 -5]\n", " [ 0 -5 4 3 -6 1 1 -2 0 -4 -3 2 -1 2 -1 0 0 -2 -7 -4 -5]\n", " [ 0 -5 3 4 -5 0 1 -2 0 -3 -2 1 -1 2 -1 0 0 -2 -7 -4 -5]\n", " [-3 -4 -6 -5 9 -5 -2 1 -5 2 0 -3 -5 -5 -4 -3 -3 -1 0 7 -5]\n", " [ 1 -3 1 0 -5 5 -2 -3 -2 -4 -3 0 0 -1 -3 1 0 -1 -7 -5 -5]\n", " [-1 -3 1 1 -2 -2 6 -2 0 -2 -2 2 0 3 2 -1 -1 -2 -3 0 -5]\n", " [-1 -2 -2 -2 1 -3 -2 5 -2 2 2 -2 -2 -2 -2 -1 0 4 -5 -1 -5]\n", " [-1 -5 0 0 -5 -2 0 -2 5 -3 0 1 -1 1 3 0 0 -2 -3 -4 -5]\n", " [-2 -6 -4 -3 2 -4 -2 2 -3 6 4 -3 -3 -2 -3 -3 -2 2 -2 -1 -5]\n", " [-1 -5 -3 -2 0 -3 -2 2 0 4 6 -2 -2 -1 0 -2 -1 2 -4 -2 -5]\n", " [ 0 -4 2 1 -3 0 2 -2 1 -3 -2 2 0 1 0 1 0 -2 -4 -2 -5]\n", " [ 1 -3 -1 -1 -5 0 0 -2 -1 -3 -2 0 6 0 0 1 0 -1 -6 -5 -5]\n", " [ 0 -5 2 2 -5 -1 3 -2 1 -2 -1 1 0 4 1 -1 -1 -2 -5 -4 -5]\n", " [-2 -4 -1 -1 -4 -3 2 -2 3 -3 0 0 0 1 6 0 -1 -2 2 -4 -5]\n", " [ 1 0 0 0 -3 1 -1 -1 0 -3 -2 1 1 -1 0 2 1 -1 -2 -3 -5]\n", " [ 1 -2 0 0 -3 0 -1 0 0 -2 -1 0 0 -1 -1 1 3 0 -5 -3 -5]\n", " [ 0 -2 -2 -2 -1 -1 -2 4 -2 2 2 -2 -1 -2 -2 -1 0 4 -6 -2 -5]\n", " [-6 -8 -7 -7 0 -7 -3 -5 -3 -2 -4 -4 -6 -5 2 -2 -5 -6 17 0 -5]\n", " [-3 0 -4 -4 7 -5 0 -1 -4 -1 -2 -2 -5 -4 -4 -3 -3 -2 0 10 -5]\n", " [-5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5]]\n" ] } ], "source": [ "#Reading the scoring function from txt file\n", "import numpy as np\n", "f = open('input/PAM250_1.txt', 'r')\n", "score = [line.strip().split() for line in f]\n", "alphabet = score[0]\n", "PAM250 = np.array([i[1:] for i in score[1:]], dtype = 'int')\n", "print alphabet\n", "print PAM250" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def smithWaterman(x, y, score):\n", " \"\"\" Calculate local alignment value of sequences x and y using\n", " dynamic programming. Return local alignment value. \"\"\"\n", " \n", " D = np.zeros((len(x)+1, len(y)+1), dtype = 'int') \n", " for i in range(1, len(x)+1):\n", " for j in range(1, len(y)+1):\n", " horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]\n", " vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]\n", " diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]\n", " D[i,j] = max(horz, vert, diag, 0)\n", " argmax = np.where(D == D.max())\n", " return D, int(D[argmax])" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 0 0 0 0 0 0 0 0]\n", " [ 0 0 0 0 0 4 0 0]\n", " [ 0 0 4 1 0 0 4 0]\n", " [ 0 1 0 4 3 0 1 1]\n", " [ 0 0 2 2 4 0 0 0]\n", " [ 0 0 0 0 0 10 5 0]\n", " [ 0 0 0 0 0 5 7 15]]\n", "Best score=15, in cell (6, 7)\n" ] } ], "source": [ "D, best = smithWaterman('MEANLY','PENALTY', PAM250)\n", "print(D)\n", "print(\"Best score=%d, in cell %s\" % (best, np.unravel_index(np.argmax(D), D.shape)))" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "55" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.argmax(D)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def traceback(D, x, y, score):\n", " \"\"\" Trace back from given cell in local-alignment matrix D \"\"\"\n", " # get i, j for maximal cell\n", " i, j = np.unravel_index(np.argmax(D), D.shape)\n", " alx, aly = [], []\n", " while (i > 0 or j > 0) and D[i,j] != 0:\n", " diag, horz, vert = 0, 0, 0\n", " if i > 0 and j > 0:\n", " diag = D[i-1, j-1] + score[alphabet.index(x[i-1]), alphabet.index(y[j-1])]\n", " if i > 0:\n", " vert = D[i-1, j] + score[alphabet.index(x[i-1]), alphabet.index('-')]\n", " if j > 0:\n", " horz = D[i, j-1] + score[alphabet.index('-'), alphabet.index(y[j-1])]\n", " if diag >= vert and diag >= horz:\n", " alx.append(x[i-1]); aly.append(y[j-1])\n", " i -= 1; j -= 1 \n", " elif vert >= horz:\n", " alx.append(x[i-1]); aly.append('-')\n", " i -= 1\n", " else:\n", " alx.append('-'); aly.append(y[j-1])\n", " j -= 1\n", " alignment = map(lambda x: ''.join(x), [alx[::-1], aly[::-1]])\n", " return alignment" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "EANL-Y\n", "ENALTY\n" ] } ], "source": [ "x = 'MEANLY'\n", "y = 'PENALTY'\n", "algn = traceback(D, x, y, PAM250)\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3682\n", "NP-CPPA--GEC-LAVNMDPEYGAEDTPSKMMWHEVIFLFYGFHSGNSSISLRSFRMEVQYHSADKKKLWFTT-G--WIFQTMLFSQWHDRVD-RIQSAF-QICYMSALEKYPAMMDASRWNPQGTLWMATYMEWVDGMWSTAERHYPWN-LDHNEDFGFRRNDMYVLWCFVDHFFKDDSTLSGIFCLITFP-PGWFW-FQSIPDMKKFGLMHFFGGTWPAGSQHFANSKFTINL-MRPL-D-DWK-THHQFTMMAFQTGLKRFKGIQAAYM-HERRADKNSM-RRYWNRNNVPFYIFYDVHV-KWRLPQFVCACMEPPMVVTMS-ELGDMQ-SEFFHLNNQADCKNEFCARS-L-K--GYLFYLRVHCGCRSHPKNENWADFAWQKAHGVSIGKAQHAITEGFYSHGNNEPMF-CN--LHPWEPKPHMVIIQDF--ICKLFDEWQTMQWNLSNHLE-L-IN-IPC-GMM-YPG-AVEGGAWSAD-S-KQTGQLAVQLEFIVYKNMEYGQRDVC-DAIHNGKCLEYCYHG-WDYDRGLCFLIIPVENLYLC-MTTW-DNCTPLFLNPTIPARKPFPDIWQAMGSLVSLGTELTHEQVMGGPRRFTNF--Q-YHVGKSPETRQSGNIEKSIKHFHKLQGQF-RNWCMVQYLYQGIDDPDKSMGRPQCHCFNHGDEPPQSDMW-FCNGPVYETDMSMLRWQFRCKWPDMGYDFIHAHY-FVDFMTWG-ALEWMITSF---I-F-F-IGIEKGNSMV-SILY-VVGIMTYMFNMLTFMGEEVQCYSDIDNTATLDQLQ-VLRPSM-VLNNVPRQTRGHFRSDQTHARTCAHMHYLSIIG-GFFLYMQDGDEFIFCPQGAKLRMTTTYPPMIEKSATFDQNDNVELVLCLMHTVEHHSFATKDEHILQVPPFDGYHGPIEEKWWVQKIFKMTSMVQVWHVEGEKAFKKNYPVS-MIDDKGLCSHDHGSFQ-Q-LSYWMIKQHYHAIC--MAHKDCR---G--WGHASAIWTKPYQCDMEFSHMHLGKHCLYTD-K-NFHDSGAVHSKCFDNGNDSCWNIQNNHQFQEMKEGGYDDVIESCLAIQVGQQFKFEENSSMCPCRHCCHICQMMNWSPKLHE-IWSF-GCVLDN--QSS-MGQF-W-EETKWREKSWGN-E--HISEASSTNIHNDTYKTPDLHAYHERDDQPGEWWMQNLRCVNVFRYHDKKQGMGGPSYPVKTVTYLACYQTDIQFWMHQAWSMRINWPQKRIPETFHPFQYCYFQHRGPTKHGLKYWKMSFAKLQISLKWPHWYYKDWR-RDCQGGCVTMDRISNHIYIHPCSVRGDREYACCFKLTGTDHYKTRNTENKEPLWLEVPINVMEQTWHSLHVVFCFAC-LGFHHGKRFRL---NTS----VE-DLAWM-E--QKRTATVEFELVIPRAR------R-TRPILHCQCWCIHVFVIAERDPGYV--GTV--FNKETCSTRSWEGLKILSSNGHLMATMDKIEFPLCWSEHTDNFGLAHCEDYFDSWHWPLWYCV-Q--QEWVMK------FGE--Q--KRNEDWNHIDHEWNEIEPFASHAIGIVHIPAHCDYIV--VDYNVSGGYPPC---KGSTNVNMISCNWNVGKRAPCAPHTRKDDCKLSVT-A---ETLLKKDNFITVFVMLGWYDKQNVKIQPEYGLTCTPKWTATNEYGTTDDLMPIPCGEQNQYSFVWKHNMRDLMYTTKALIQVPRCYI--P-DQ----P--GARY--VEL--PWMNAGAHYQKVPNF---S----HWQVRMRYKRMMMSTDMHCIQRYVPYRGPQFWINGFPLNGGEYVGFSMNSCNNFNEIEHIREENLLGNFLMNNCFQMEQY-VFKARNYFIECDEY---S-SI--S-RPYGMPFY--AQKALMIVIAFVDTIPGDFVSLKQ-SYFR-NMLAF-GIML-ILRQAV-QWYKAHHHHYAVDWFTRSYIR-RIFYHNHFDMVVCPDWHICPDGPDYREVIWSFPDHLSQKGMAKNVYQVIYIEYPEPQKIFPCHKFHSEDISFKWMYIK--RDY---YTWVNDFLRDFGEQDITCMRNKFAHRHEQMNINPACDGKIPAVINCAYGFRQKQIKQIIHWPKENDFRCAKYALLIWQKNCREL-ENDVYHHKESVEQVWAGFTEKQRISWTHICNETLMCSIRCFADPENCPAETWTRHEWIQQEGDFDLDEHATSPTLMPERCSMQIGNDAHMNDAAHGDMHWLL-W-NYLNYKLGFVTKPVTMNCEFAVTWML-VK-WFQGIIDSF--Q-IAMMMIIGIMDF-SEN-RIHSVED-NRLYFHPKLWQQSAFMNYRGTINHRW--PQRANYSRREFVLATRPMVRTFKFVFMMVCMLNTYDTQLKAEWF--DPL-IK-GMNTWYMVAMW-AHSDQQHMDPC--LD--GMQHDAR-HYWCHDH-WMRGHF--Y-PMMP----E-H--W--SRQRRDAIV-NR-G-FMGNR-EGVGPE-FISMHPIVDDFHTLTKPSNETIAWRQSKENPARDKMH-PFYGLKDRGQIMGQVM-LD-QIR-VHWIYKEL-S-PVPRR-LPCNRSTSDQAWTLHTLHIRPWHV-EQEEVVRCR-LKN--APFN-D-PNFDFMI-LFTVVYNYMMRKF--EFVIKN-GDKW-RFTWTCMDMIQLVWRYVLPFGKKQVTQTFCKKDYVVRKNELSKFMSS-QSMFPVRR-WMQRINEGK-HQHGYS-NKQAGRASQTPQQDKAMRRHEKEPLRYNKDSLEHNNESSGVGFYGRIKKIVVCLLAVICITAHNGTTGQYGAFNAAR-NFCIMYRDI-KAFQGDH-HR-KCESKDHDEQLDNSRMPAEGQRAYDMLRK-MWFD-FDNNFGIMSH----R-REY-LHRCAPGC-WKWQCFLDLY--SWIFDTQIYGVPRCKFHK-PTQSNVAQC--C-IIMEPT\n", "SPFCAPGRCGNCVFAI-MAHHW----SRSKLVSYG-YFGSYGCEVPQNA-PVRPF-M---FKPTNWFCARFEACSKQWAFPVSRHRTRHKDTEKKLGMSYCSPCTASQCGPYHA-DEAGK--PMGYNY-GHYGNCPKAV-ATNVGG-PAVCMMRDIDLAY-CAHL-AM-G-ISH-LGIVHPMWPNY-TNRYKCPTW-WKYATLARIQPWNSENANCKLWSRLCIQ-PMCVL-VNLWISDVYQWLWMFVWPRWKMLE-Q-G-DEVSGWQHAYGCCGEANIIDCKEEDLWEPEAL-F-E--GVHLYERR--QYK-H-I-GWIVVARCFKVGDIKYSAELEVTVEGDC-ETYENWGDLTKIVDYMTFFAMQKSCYYQP-----C--VWA-ARAIQV-HTCHQL--GFVELVKSMRVTECKARLGNFDGHG-LNEGPSFLHIC--F-QW-T-EWCLTQRDDGMPMRCVSCFAQMENQEYAIVPG-WRISYRWRVTGGI-IKAIFPT-K-IYY-LSHMCVDFMWQMK--DW-YDGPRP-NKSE-YMQYAQNEGY-CGFAHMIDACSPWCME-SILCFSCIHIRWKC-EN-I-MES-FTHETSLGCP-AFVTYNPHLYTTHSSPPAWMTG-IG-EYM-YCEYE-KYPQPW--IH-G-DSIGASGVIMCLGQ-A-YVWGHE--Q-DLWGFIHDCVAICCPPKEHYEL-CNWRDWIFAF-RSSFSFMGRMPMQHAYVWSLPNFQQQVHFKFWIGSAQ-HYMRWANQYQHKGALSF-YQWLHPPGQRI--WPDIHAGIVMVPWQKAVSETLKAMGHDECTMWIGIDNNSYTGKICA---YAEVRGIKHYIRRHAHHPGIIC--FSQL----TFNPVCQDKFYYVH-GTV-YIKG-NHS---WCYA-KMEPI--IPPFDGFHGPIEESWWVEKI-K-TSMVQVWHV------KKIYPVSTKIDD--LFSHDHGSFQYQCLSYWMIKQHYHAKCTETAHKDCQISVSVYWGHASAIWTKIYQCVMEFSHMHLGKHCFYNNFSVAWCDSGPVHSKCFKIMKCVC-MIQNNHQ-EQYGEYGYDDVIESFLIYPI--QVKFEEVSSW-PPIE-VHICQMMVWSPKLHEIIW-FRGCVLDRMVPEKFVKQYRRCPETKWREKSWGNADCWTIYRESSTNIHNDLYKTPDL----------GN--I--M-VMKFFRYHDKKQGMGGPG-DV--LT-MK-I-TKKHFWM--SR--KSDR--KRIPETFHPVQYCYFQHR---------W---FAKLQISLKWPHWYYKDWRIRDCM--Y-TQDRISNHI-IH------DK-YHIC-SVRGWREYAICNKLATEN-W-E-PLNDMENTWHSLLVVFCFACNL-FHHGKRFRQEGGNTSNHWNNNIDLAWMAESFQKRTATVEFELVIPRIRMAGLANHCTRPILHCQCWCIHVFVISERDPGYVFTEPMGGFNKETCSTRSYSA----AW-S--MATMDKIQLEIP-GPAFLLFMLAHCEDRFDVWHWPLWYCVLKGLQEWVMQWVTFCDFGEVRDKIKRNEDTNHIDHELNEIEPFASDAIGIVHEMLH--FMACPAHCAIMNR-MRCEVFNSSTNVNMIM-NWNVGKRAPCAPHTRKDDCKHSVTGPLYCETLLKKDNFITV-VMLGWYDKQNVKFQPSYG---TPKWTAWNEYGRSGDLMPIP---QHN-P-MW-----DL----KALIQVPRCYIGNPSQQAEMKPCEGARFGCFSLRSSWMNA-AHYQKVPNFLRKSMGWLHWQVRMRYKWMMMSTKMHCIQRYVPYRGPQ---NG-G-E-G--VGWNMN--EAF--C-N--AQ-LFANFLMNNCFEV-VFWRMEVPEYW-QCSVFRDAPHAICYNQRPYGMPFYSSTQKALMI-V-FC-TV-GPYTVDKHYRHIPGDTVSLKSSKADILRQAMFRWYNS---TYAVDWFTRSYIRTHDFANQNMDMVVHPDWH---DGPDYREVIISF---WNP-ANAKNVYQVIYIEYRRPQKIFPCHKFHSEDISMKWMYIRVPRFYTRDYTWVNDFLRDF---DITCMRNKFAARHEVMNINPACDGKIPAVINCAYGFRQKQIKQIQHWPKENDFRCAKYAVHE-HRICSHIWQKDVYHHKECVE-SRPS-GCKPR-WW-RA-KKYWVCHFWHY-KAE-YVMEQ-IMLA-F-DEMQYEMWRHYHAWDFI-G-CRRWM--DILM--AWQIPRCWIFSREKY-GFHDEFMRDP---QCMLHY-WLVSFSAW--CITTTFCWRCVNKDMFQQHAKFYADNTDICKEHGFDQNY-GPIL-IGSMF-S-MCTFEPWWFKGIKGMYA-FQFLMEHKATDWCFTAAVIPFKIVNV-GAAQATNWFQSESITINCGPGG-CQSLFRCDNSPTQFYSKCTLLKFWG-DKTPRMGPWCATHDW-KGPWTTYESKLAFHFGDKKGFWFDEPASINSVTKSWFSCYTGVRWMGIGPECRIG-NAIDEPWWEMEEDNLDYTQSKLSTNMHGWCYRHVRFFP--D-SQVQPQAMYLQWHRKSAQESFHHLISRRDERKEFA-KRET-HITW--HTPDVIPWDISDQMELFKFWGWKDCMIKFQTECVKTNTSTRFFKGI-HW-ARAFIRPF-SKPFANPWYKLS-KCVERC-MHWDWAQMIS-K-GWEDF--IT-LLRWGSMF-FTECVQAYFMLSPEYAQHYTTPDVFEWVFDFHNEMVQVSGNPDHHQVTCSPTNN--QC-TGRM-LPF-SPG-GFHPR-SK----LMQKR-I-PW-PRTEQ-KTFQRLRQNFRSLEPGAPRGEEDDHIYMAHYLGETHNYMIQNCTMNM-G-YIYYIIPACV-FVMYTIQFQLMSEGLFGSFEQFEIHIC---CHWN-DCVAKQWEKRWAFVMMLPGIRKQTWKDFTMHNEFISCNWCQEMLNAT\n" ] } ], "source": [ "x, y = [i.strip() for i in open('input/rosalind_ba5f.txt', 'r')]\n", "D, best = smithWaterman(x, y, PAM250)\n", "algn = traceback(D, x, y, PAM250)\n", "print best\n", "print '\\n'.join(algn)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.5" } }, "nbformat": 4, "nbformat_minor": 0 }