{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing helper functions" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import string\n", "\n", "\n", "def z_array(s):\n", " \"\"\" Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s \"\"\"\n", " assert len(s) > 1\n", " z = [len(s)] + [0] * (len(s)-1)\n", "\n", " # Initial comparison of s[1:] with prefix\n", " for i in range(1, len(s)):\n", " if s[i] == s[i-1]:\n", " z[1] += 1\n", " else:\n", " break\n", " \n", " r, l = 0, 0\n", " if z[1] > 0:\n", " r, l = z[1], 1\n", "\n", " for k in range(2, len(s)):\n", " assert z[k] == 0\n", " if k > r:\n", " # Case 1\n", " for i in range(k, len(s)):\n", " if s[i] == s[i-k]:\n", " z[k] += 1\n", " else:\n", " break\n", " r, l = k + z[k] - 1, k\n", " else:\n", " # Case 2\n", " # Calculate length of beta\n", " nbeta = r - k + 1\n", " zkp = z[k - l]\n", " if nbeta > zkp:\n", " # Case 2a: zkp wins\n", " z[k] = zkp\n", " else:\n", " # Case 2b: Compare characters just past r\n", " nmatch = 0\n", " for i in range(r+1, len(s)):\n", " if s[i] == s[i - k]:\n", " nmatch += 1\n", " else:\n", " break\n", " l, r = k, r + nmatch\n", " z[k] = r - k + 1\n", " return z\n", "\n", "\n", "def n_array(s):\n", " \"\"\" Compile the N array (Gusfield theorem 2.2.2) from the Z array \"\"\"\n", " return z_array(s[::-1])[::-1]\n", "\n", "\n", "def big_l_prime_array(p, n):\n", " \"\"\" Compile L' array (Gusfield theorem 2.2.2) using p and N array.\n", " L'[i] = largest index j less than n such that N[j] = |P[i:]| \"\"\"\n", " lp = [0] * len(p)\n", " for j in range(len(p)-1):\n", " i = len(p) - n[j]\n", " if i < len(p):\n", " lp[i] = j + 1\n", " return lp\n", "\n", "\n", "def big_l_array(p, lp):\n", " \"\"\" Compile L array (Gusfield theorem 2.2.2) using p and L' array.\n", " L[i] = largest index j less than n such that N[j] >= |P[i:]| \"\"\"\n", " l = [0] * len(p)\n", " l[1] = lp[1]\n", " for i in range(2, len(p)):\n", " l[i] = max(l[i-1], lp[i])\n", " return l\n", "\n", "\n", "def small_l_prime_array(n):\n", " \"\"\" Compile lp' array (Gusfield theorem 2.2.4) using N array. \"\"\"\n", " small_lp = [0] * len(n)\n", " for i in range(len(n)):\n", " if n[i] == i+1: # prefix matching a suffix\n", " small_lp[len(n)-i-1] = i+1\n", " for i in range(len(n)-2, -1, -1): # \"smear\" them out to the left\n", " if small_lp[i] == 0:\n", " small_lp[i] = small_lp[i+1]\n", " return small_lp\n", "\n", "\n", "def good_suffix_table(p):\n", " \"\"\" Return tables needed to apply good suffix rule. \"\"\"\n", " n = n_array(p)\n", " lp = big_l_prime_array(p, n)\n", " return lp, big_l_array(p, lp), small_l_prime_array(n)\n", "\n", "\n", "def good_suffix_mismatch(i, big_l_prime, small_l_prime):\n", " \"\"\" Given a mismatch at offset i, and given L/L' and l' arrays,\n", " return amount to shift as determined by good suffix rule. \"\"\"\n", " length = len(big_l_prime)\n", " assert i < length\n", " if i == length - 1:\n", " return 0\n", " i += 1 # i points to leftmost matching position of P\n", " if big_l_prime[i] > 0:\n", " return length - big_l_prime[i]\n", " return length - small_l_prime[i]\n", "\n", "\n", "def good_suffix_match(small_l_prime):\n", " \"\"\" Given a full match of P to T, return amount to shift as\n", " determined by good suffix rule. \"\"\"\n", " return len(small_l_prime) - small_l_prime[1]\n", "\n", "\n", "def dense_bad_char_tab(p, amap):\n", " \"\"\" Given pattern string and list with ordered alphabet characters, create\n", " and return a dense bad character table. Table is indexed by offset\n", " then by character. \"\"\"\n", " tab = []\n", " nxt = [0] * len(amap)\n", " for i in range(0, len(p)):\n", " c = p[i]\n", " assert c in amap\n", " tab.append(nxt[:])\n", " nxt[amap[c]] = i+1\n", " return tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Class for preprocessed pattern" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class BoyerMoore(object):\n", " \"\"\" Encapsulates pattern and associated Boyer-Moore preprocessing. \"\"\"\n", "\n", " def __init__(self, p, alphabet='ACGT'):\n", " # Create map from alphabet characters to integers\n", " self.amap = {alphabet[i]: i for i in range(len(alphabet))}\n", " # Make bad character rule table\n", " self.bad_char = dense_bad_char_tab(p, self.amap)\n", " # Create good suffix rule table\n", " _, self.big_l, self.small_l_prime = good_suffix_table(p)\n", "\n", " def bad_character_rule(self, i, c):\n", " \"\"\" Return # skips given by bad character rule at offset i \"\"\"\n", " assert c in self.amap\n", " assert i < len(self.bad_char)\n", " ci = self.amap[c]\n", " return i - (self.bad_char[i][ci]-1)\n", "\n", " def good_suffix_rule(self, i):\n", " \"\"\" Given a mismatch at offset i, return amount to shift\n", " as determined by (weak) good suffix rule. \"\"\"\n", " length = len(self.big_l)\n", " assert i < length\n", " if i == length - 1:\n", " return 0\n", " i += 1 # i points to leftmost matching position of P\n", " if self.big_l[i] > 0:\n", " return length - self.big_l[i]\n", " return length - self.small_l_prime[i]\n", "\n", " def match_skip(self):\n", " \"\"\" Return amount to shift in case where P matches T \"\"\"\n", " return len(self.small_l_prime) - self.small_l_prime[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Matching function" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def boyer_moore(p, p_bm, t):\n", " \"\"\" Do Boyer-Moore matching \"\"\"\n", " i = 0\n", " occurrences = []\n", " while i < len(t) - len(p) + 1:\n", " shift = 1\n", " mismatched = False\n", " for j in range(len(p)-1, -1, -1):\n", " if p[j] != t[i+j]:\n", " skip_bc = p_bm.bad_character_rule(j, t[i+j])\n", " skip_gs = p_bm.good_suffix_rule(j)\n", " shift = max(shift, skip_bc, skip_gs)\n", " mismatched = True\n", " break\n", " if not mismatched:\n", " occurrences.append(i)\n", " skip_gs = p_bm.match_skip()\n", " shift = max(shift, skip_gs)\n", " i += shift\n", " return occurrences" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "t = 'haystack needle haystack' # \"text\" - thing we search in\n", "p = 'needle' # \"pattern\" - thing we search for" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "p_bm = BoyerMoore(p, alphabet='abcdefghijklmnopqrstuvwxyz')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[9]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "boyer_moore(p, p_bm, t)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'needle'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t[9:9+len(p)] # confirm it really occurs" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0, 6, 12]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "boyer_moore(p, p_bm, 'needleneedleneedle')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'haystack needle haystack'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 1 }