{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Using an edit-distance-like dynamic programming formulation, we can\n", "# look for approximate occurrences of p in t.\n", "\n", "import sys\n", "import numpy\n", "\n", "# Assume x is the string labeling rows of the matrix and y is the\n", "# string labeling the columns\n", "\n", "def trace(D, x, y):\n", " ''' Backtrace edit-distance matrix D for strings x and y '''\n", " i, j = len(x), len(y)\n", " xscript = []\n", " while i > 0:\n", " diag, vert, horz = sys.maxsize, sys.maxsize, sys.maxsize\n", " delt = None\n", " if i > 0 and j > 0:\n", " delt = 0 if x[i-1] == y[j-1] else 1\n", " diag = D[i-1, j-1] + delt\n", " if i > 0:\n", " vert = D[i-1, j] + 1\n", " if j > 0:\n", " horz = D[i, j-1] + 1\n", " if diag <= vert and diag <= horz:\n", " # diagonal was best\n", " xscript.append('R' if delt == 1 else 'M')\n", " i -= 1; j -= 1\n", " elif vert <= horz:\n", " # vertical was best; this is an insertion in x w/r/t y\n", " xscript.append('I')\n", " i -= 1\n", " else:\n", " # horizontal was best\n", " xscript.append('D')\n", " j -= 1\n", " # j = offset of the first (leftmost) character of t involved in the\n", " # alignment\n", " return j, (''.join(xscript))[::-1] # reverse and string-ize\n", "\n", "def kEditDp(p, t):\n", " ''' Find and return the alignment of p to a substring of t with the\n", " fewest edits. We return the edit distance, the offset of the\n", " substring aligned to, and the edit transcript. If multiple\n", " alignments tie for best, we report the leftmost. '''\n", " D = numpy.zeros((len(p)+1, len(t)+1), dtype=int)\n", " # Note: First row gets zeros. First column initialized as usual.\n", " D[1:, 0] = range(1, len(p)+1)\n", " for i in range(1, len(p)+1):\n", " for j in range(1, len(t)+1):\n", " delt = 1 if p[i-1] != t[j-1] else 0\n", " D[i, j] = min(D[i-1, j-1] + delt, D[i-1, j] + 1, D[i, j-1] + 1)\n", " # Find minimum edit distance in last row\n", " mnJ, mn = None, len(p) + len(t)\n", " for j in range(len(t)+1):\n", " if D[len(p), j] < mn:\n", " mnJ, mn = j, D[len(p), j]\n", " # Backtrace; note: stops as soon as it gets to first row\n", " off, xcript = trace(D, p, t[:mnJ])\n", " # Return edit distance, offset into T, edit transcript\n", " return mn, off, xcript, D" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2, 5, 'MMRMMMMDMM')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p, t = 'TACGTCAGC', 'AACCCTATGTCATGCCTTGGA'\n", "mn, off, xscript, D = kEditDp(p, t)\n", "mn, off, xscript" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1],\n", " [2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1],\n", " [3, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2],\n", " [4, 3, 3, 2, 2, 3, 3, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 2, 3],\n", " [5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 1, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3],\n", " [6, 5, 5, 4, 3, 3, 4, 4, 3, 3, 2, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4],\n", " [7, 6, 5, 5, 4, 4, 4, 4, 4, 4, 3, 2, 1, 2, 3, 4, 4, 4, 4, 4, 5, 4],\n", " [8, 7, 6, 6, 5, 5, 5, 5, 5, 4, 4, 3, 2, 2, 2, 3, 4, 5, 5, 4, 4, 5],\n", " [9, 8, 7, 6, 6, 5, 6, 6, 6, 5, 5, 4, 3, 3, 3, 2, 3, 4, 5, 5, 5, 5]])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "D" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 1 }