{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "\n",
    "\n",
    "def z_array(s):\n",
    "    \"\"\" Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s \"\"\"\n",
    "    assert len(s) > 1\n",
    "    z = [len(s)] + [0] * (len(s)-1)\n",
    "\n",
    "    # Initial comparison of s[1:] with prefix\n",
    "    for i in range(1, len(s)):\n",
    "        if s[i] == s[i-1]:\n",
    "            z[1] += 1\n",
    "        else:\n",
    "            break\n",
    "    \n",
    "    r, l = 0, 0\n",
    "    if z[1] > 0:\n",
    "        r, l = z[1], 1\n",
    "\n",
    "    for k in range(2, len(s)):\n",
    "        assert z[k] == 0\n",
    "        if k > r:\n",
    "            # Case 1\n",
    "            for i in range(k, len(s)):\n",
    "                if s[i] == s[i-k]:\n",
    "                    z[k] += 1\n",
    "                else:\n",
    "                    break\n",
    "            r, l = k + z[k] - 1, k\n",
    "        else:\n",
    "            # Case 2\n",
    "            # Calculate length of beta\n",
    "            nbeta = r - k + 1\n",
    "            zkp = z[k - l]\n",
    "            if nbeta > zkp:\n",
    "                # Case 2a: zkp wins\n",
    "                z[k] = zkp\n",
    "            else:\n",
    "                # Case 2b: Compare characters just past r\n",
    "                nmatch = 0\n",
    "                for i in range(r+1, len(s)):\n",
    "                    if s[i] == s[i - k]:\n",
    "                        nmatch += 1\n",
    "                    else:\n",
    "                        break\n",
    "                l, r = k, r + nmatch\n",
    "                z[k] = r - k + 1\n",
    "    return z\n",
    "\n",
    "\n",
    "def n_array(s):\n",
    "    \"\"\" Compile the N array (Gusfield theorem 2.2.2) from the Z array \"\"\"\n",
    "    return z_array(s[::-1])[::-1]\n",
    "\n",
    "\n",
    "def big_l_prime_array(p, n):\n",
    "    \"\"\" Compile L' array (Gusfield theorem 2.2.2) using p and N array.\n",
    "        L'[i] = largest index j less than n such that N[j] = |P[i:]| \"\"\"\n",
    "    lp = [0] * len(p)\n",
    "    for j in range(len(p)-1):\n",
    "        i = len(p) - n[j]\n",
    "        if i < len(p):\n",
    "            lp[i] = j + 1\n",
    "    return lp\n",
    "\n",
    "\n",
    "def big_l_array(p, lp):\n",
    "    \"\"\" Compile L array (Gusfield theorem 2.2.2) using p and L' array.\n",
    "        L[i] = largest index j less than n such that N[j] >= |P[i:]| \"\"\"\n",
    "    l = [0] * len(p)\n",
    "    l[1] = lp[1]\n",
    "    for i in range(2, len(p)):\n",
    "        l[i] = max(l[i-1], lp[i])\n",
    "    return l\n",
    "\n",
    "\n",
    "def small_l_prime_array(n):\n",
    "    \"\"\" Compile lp' array (Gusfield theorem 2.2.4) using N array. \"\"\"\n",
    "    small_lp = [0] * len(n)\n",
    "    for i in range(len(n)):\n",
    "        if n[i] == i+1:  # prefix matching a suffix\n",
    "            small_lp[len(n)-i-1] = i+1\n",
    "    for i in range(len(n)-2, -1, -1):  # \"smear\" them out to the left\n",
    "        if small_lp[i] == 0:\n",
    "            small_lp[i] = small_lp[i+1]\n",
    "    return small_lp\n",
    "\n",
    "\n",
    "def good_suffix_table(p):\n",
    "    \"\"\" Return tables needed to apply good suffix rule. \"\"\"\n",
    "    n = n_array(p)\n",
    "    lp = big_l_prime_array(p, n)\n",
    "    return lp, big_l_array(p, lp), small_l_prime_array(n)\n",
    "\n",
    "\n",
    "def good_suffix_mismatch(i, big_l_prime, small_l_prime):\n",
    "    \"\"\" Given a mismatch at offset i, and given L/L' and l' arrays,\n",
    "        return amount to shift as determined by good suffix rule. \"\"\"\n",
    "    length = len(big_l_prime)\n",
    "    assert i < length\n",
    "    if i == length - 1:\n",
    "        return 0\n",
    "    i += 1  # i points to leftmost matching position of P\n",
    "    if big_l_prime[i] > 0:\n",
    "        return length - big_l_prime[i]\n",
    "    return length - small_l_prime[i]\n",
    "\n",
    "\n",
    "def good_suffix_match(small_l_prime):\n",
    "    \"\"\" Given a full match of P to T, return amount to shift as\n",
    "        determined by good suffix rule. \"\"\"\n",
    "    return len(small_l_prime) - small_l_prime[1]\n",
    "\n",
    "\n",
    "def dense_bad_char_tab(p, amap):\n",
    "    \"\"\" Given pattern string and list with ordered alphabet characters, create\n",
    "        and return a dense bad character table.  Table is indexed by offset\n",
    "        then by character. \"\"\"\n",
    "    tab = []\n",
    "    nxt = [0] * len(amap)\n",
    "    for i in range(0, len(p)):\n",
    "        c = p[i]\n",
    "        assert c in amap\n",
    "        tab.append(nxt[:])\n",
    "        nxt[amap[c]] = i+1\n",
    "    return tab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Class for preprocessed pattern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BoyerMoore(object):\n",
    "    \"\"\" Encapsulates pattern and associated Boyer-Moore preprocessing. \"\"\"\n",
    "\n",
    "    def __init__(self, p, alphabet='ACGT'):\n",
    "        # Create map from alphabet characters to integers\n",
    "        self.amap = {alphabet[i]: i for i in range(len(alphabet))}\n",
    "        # Make bad character rule table\n",
    "        self.bad_char = dense_bad_char_tab(p, self.amap)\n",
    "        # Create good suffix rule table\n",
    "        _, self.big_l, self.small_l_prime = good_suffix_table(p)\n",
    "\n",
    "    def bad_character_rule(self, i, c):\n",
    "        \"\"\" Return # skips given by bad character rule at offset i \"\"\"\n",
    "        assert c in self.amap\n",
    "        assert i < len(self.bad_char)\n",
    "        ci = self.amap[c]\n",
    "        return i - (self.bad_char[i][ci]-1)\n",
    "\n",
    "    def good_suffix_rule(self, i):\n",
    "        \"\"\" Given a mismatch at offset i, return amount to shift\n",
    "            as determined by (weak) good suffix rule. \"\"\"\n",
    "        length = len(self.big_l)\n",
    "        assert i < length\n",
    "        if i == length - 1:\n",
    "            return 0\n",
    "        i += 1  # i points to leftmost matching position of P\n",
    "        if self.big_l[i] > 0:\n",
    "            return length - self.big_l[i]\n",
    "        return length - self.small_l_prime[i]\n",
    "\n",
    "    def match_skip(self):\n",
    "        \"\"\" Return amount to shift in case where P matches T \"\"\"\n",
    "        return len(self.small_l_prime) - self.small_l_prime[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Matching function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def boyer_moore(p, p_bm, t):\n",
    "    \"\"\" Do Boyer-Moore matching \"\"\"\n",
    "    i = 0\n",
    "    occurrences = []\n",
    "    while i < len(t) - len(p) + 1:\n",
    "        shift = 1\n",
    "        mismatched = False\n",
    "        for j in range(len(p)-1, -1, -1):\n",
    "            if p[j] != t[i+j]:\n",
    "                skip_bc = p_bm.bad_character_rule(j, t[i+j])\n",
    "                skip_gs = p_bm.good_suffix_rule(j)\n",
    "                shift = max(shift, skip_bc, skip_gs)\n",
    "                mismatched = True\n",
    "                break\n",
    "        if not mismatched:\n",
    "            occurrences.append(i)\n",
    "            skip_gs = p_bm.match_skip()\n",
    "            shift = max(shift, skip_gs)\n",
    "        i += shift\n",
    "    return occurrences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = 'haystack needle haystack' # \"text\" - thing we search in\n",
    "p = 'needle' # \"pattern\" - thing we search for"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "p_bm = BoyerMoore(p, alphabet='abcdefghijklmnopqrstuvwxyz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[9]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boyer_moore(p, p_bm, t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'needle'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t[9:9+len(p)]  # confirm it really occurs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 6, 12]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "boyer_moore(p, p_bm, 'needleneedleneedle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'haystack needle haystack'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}