{ "metadata": { "name": "CG_BoyerMooreApprox" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# First we make a function that splits a string p up into a set of\n", "# non-overlapping, non-empty substrings.\n", "\n", "def partition(p, pieces=2):\n", " assert len(p) >= pieces\n", " base, mod = len(p) / pieces, len(p) % pieces\n", " idx = 0\n", " ps = []\n", " modAdjust = 1\n", " for i in xrange(0, pieces):\n", " if i >= mod: modAdjust = 0\n", " newIdx = idx + base + modAdjust\n", " ps.append(p[idx:newIdx])\n", " idx = newIdx\n", " return ps" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "def bmApproximate(p, t, k, alph=\"ACGT\"):\n", " \"\"\" Use the pigeonhole principle together with Boyer-Moore to find\n", " approximate matches with up to a specified number of mismatches. \"\"\"\n", " assert len(p) >= k+1\n", " ps = partition(p, k+1) # split p into list of k+1 non-empty, non-overlapping substrings\n", " off = 0 # offset into p of current partition\n", " occurrences = set() # note we might see the same occurrence >1 time\n", " for pi in ps: # for each partition\n", " # NOTE: I haven't given the implementation for the BMPreprocessing object.\n", " # It implements the Boyer-Moore skipping rules as we discussed in class.\n", " bm_prep = BMPreprocessing(pi, alph=alph) # BM preprocess the partition\n", " for hit in bm_prep.match(t)[0]:\n", " if hit - off < 0: continue # pattern falls off left end of T?\n", " if hit + len(p) - off > len(t): continue # falls off right end?\n", " # Count mismatches to left and right of the matching partition\n", " nmm = 0\n", " for i in range(0, off) + range(off+len(pi), len(p)):\n", " if t[hit-off+i] != p[i]:\n", " nmm += 1\n", " if nmm > k: break # exceeded maximum # mismatches\n", " if nmm <= k:\n", " occurrences.add(hit-off) # approximate match\n", " off += len(pi) # Update offset of current partition\n", " return sorted(list(occurrences))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "bmApproximate('needle', 'needle noodle nargle', 2, alph='abcdefghijklmnopqrstuvwxyz ')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 4, "text": [ "[0, 7]" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 } ], "metadata": {} } ] }