{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# For space reasons,I'm not showing the implementations of these\n", "# imported functions\n", "from index_edit import kEditDp, Index, partition" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "def queryIndexEdit(p, t, k, index):\n", " ''' Look for occurrences of p in t with up to k edits using an\n", " index combined with dynamic-programming alignment. '''\n", " l = index.ln\n", " occurrences = []\n", " seen = set() # for avoiding reporting same hit twice\n", " for part, poff in partition(p, k+1):\n", " for hit in index.occurrences(part): # query index w/ partition\n", " # left edge of T to include in DP matrix\n", " lf = max(0, hit - poff - k)\n", " # right edge of T to include in DP matrix\n", " rt = min(len(t), hit - poff + len(p) + k)\n", " mn, off, xcript = kEditDp(p, t[lf:rt])\n", " off += lf\n", " if mn <= k and (mn, off) not in seen:\n", " occurrences.append((mn, off, xcript))\n", " seen.add((mn, off))\n", " return occurrences" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "t = 'I had two banana splits'\n", "ind = Index(t, ln=4)\n", "queryIndexEdit('bananasplit', t, 1, ind)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 31, "text": [ "[(1, 10, 'MMMMMMDMMMMM')]" ] } ], "prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [ "t = 'haystack needle haystack noodle haystack nedle haystack'\n", "# needle noodle ne-dle\n", "# |||||| | ||| || |||\n", "# needle needle needle\n", "p = 'needle'\n", "# Find the two occurrences that are within edit distance 1\n", "# The substring length for the index has to be <= 3 (why?)\n", "queryIndexEdit(p, t, 1, Index(t, ln=3))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 38, "text": [ "[(0, 9, 'MMMMMM'), (1, 41, 'MIMMMM')]" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "# Find the three occurrences that are within edit distance 2\n", "# The substring length for the index has to be <= 2 (why?)\n", "queryIndexEdit('needle', t, 2, Index(t, ln=2))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 39, "text": [ "[(0, 9, 'MMMMMM'), (1, 41, 'MIMMMM'), (2, 25, 'MRRMMM')]" ] } ], "prompt_number": 39 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }