{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Not a great way to build a suffix array, but we'll use it\n",
    "# for the small examples here\n",
    "def naiveBuildSA(t):\n",
    "    \"\"\"Return the suffix array of t as a list of suffix start offsets.\n",
    "\n",
    "    Every suffix is materialized and the whole list is sorted, so this\n",
    "    takes quadratic space in len(t); use it on toy inputs only.\n",
    "    \"\"\"\n",
    "    satups = sorted([(t[i:], i) for i in range(len(t))])\n",
    "    return [offset for _, offset in satups]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[6, 5, 2, 3, 0, 4, 1]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "naiveBuildSA('abaaba$') # works on a simple example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def binarySearchSA(t, sa, p):\n",
    "    \"\"\"Find where pattern p falls in the suffix array sa of text t.\n",
    "\n",
    "    t must already end with the terminator '$' and sa must be its\n",
    "    suffix array. p is expected to be non-empty and to sort after the\n",
    "    suffix at sa[0], which the search never inspects.\n",
    "\n",
    "    Returns the index into sa of the first suffix that is >= p, or\n",
    "    len(sa) when every suffix is smaller than p.\n",
    "    \"\"\"\n",
    "    assert t[-1] == '$'  # t already has terminator\n",
    "    assert len(t) == len(sa)  # sa is the suffix array for t\n",
    "    if len(t) == 1:\n",
    "        return 1  # only one suffix exists; p is assumed to sort after it\n",
    "    # invariant: T[sa[l]:] < p <= T[sa[r]:], where r == len(sa) stands\n",
    "    # for a sentinel that is greater than everything\n",
    "    l, r = 0, len(sa)\n",
    "    while True:\n",
    "        c = (l + r) // 2\n",
    "        # determine whether p <= T[sa[c]:] by doing comparisons\n",
    "        # starting from left-hand sides of p and T[sa[c]:]\n",
    "        plt = True  # assume p <= T[sa[c]:] until proven otherwise\n",
    "        i = 0\n",
    "        while i < len(p) and sa[c]+i < len(t):\n",
    "            if p[i] < t[sa[c]+i]:\n",
    "                break  # p < T[sa[c]:]\n",
    "            elif p[i] > t[sa[c]+i]:\n",
    "                plt = False\n",
    "                break  # p > T[sa[c]:]\n",
    "            i += 1  # tied so far\n",
    "        if i < len(p) and sa[c]+i >= len(t):\n",
    "            # the suffix ran out first, so it is a proper prefix of p\n",
    "            # and therefore p > T[sa[c]:]\n",
    "            plt = False\n",
    "        if plt:\n",
    "            if c == l + 1:\n",
    "                return c\n",
    "            r = c\n",
    "        else:\n",
    "            if c == r - 1:\n",
    "                return r\n",
    "            l = c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t = 'abaaba$'\n",
    "sa = naiveBuildSA(t)\n",
    "binarySearchSA(t, sa, 'aba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binarySearchSA(t, sa, 'bb') # p is greater than all suffixes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binarySearchSA(t, sa, 'aa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def suffixLcp(t, toff, p):\n",
    "    \"\"\"Return the length of the longest common prefix of t[toff:] and p.\"\"\"\n",
    "    i = 0\n",
    "    while i < len(p) and i + toff < len(t) and p[i] == t[i + toff]:\n",
    "        i += 1\n",
    "    return i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "suffixLcp('abaaba$', 0, 'aba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "suffixLcp('abaaba$', 0, 'abab')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "suffixLcp('abaaba$', 0, 'abaabaaba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def binarySearchSA_lcp1(t, sa, p):\n",
    "    \"\"\"Binary-search sa for p, skipping characters already known to match.\n",
    "\n",
    "    t must already end with the terminator '$' and sa must be its\n",
    "    suffix array. p is expected to be non-empty and to sort after the\n",
    "    suffix at sa[0], which the search never inspects.\n",
    "\n",
    "    lcp_lp and lcp_rp track how many leading characters p shares with\n",
    "    the suffixes at the left and right ends of the current range; each\n",
    "    probe starts comparing at the smaller of the two.\n",
    "\n",
    "    Returns the index into sa of the first suffix that is >= p, or\n",
    "    len(sa) when every suffix is smaller than p.\n",
    "    \"\"\"\n",
    "    assert t[-1] == '$'  # t already has terminator\n",
    "    assert len(t) == len(sa)  # sa is the suffix array for t\n",
    "    if len(t) == 1:\n",
    "        return 1  # only one suffix exists; p is assumed to sort after it\n",
    "    # invariant: T[sa[l]:] < p <= T[sa[r]:], where r == len(sa) stands\n",
    "    # for a sentinel that is greater than everything\n",
    "    l, r = 0, len(sa)\n",
    "    lcp_lp, lcp_rp = 0, 0\n",
    "    while True:\n",
    "        c = (l + r) // 2\n",
    "        # determine whether p <= T[sa[c]:] by doing comparisons\n",
    "        # starting from left-hand sides of p and T[sa[c]:]\n",
    "        plt = True  # assume p <= T[sa[c]:] until proven otherwise\n",
    "        i = min(lcp_lp, lcp_rp)  # this many characters must already match\n",
    "        while i < len(p) and sa[c]+i < len(t):\n",
    "            if p[i] < t[sa[c]+i]:\n",
    "                break  # p < T[sa[c]:]\n",
    "            elif p[i] > t[sa[c]+i]:\n",
    "                plt = False\n",
    "                break  # p > T[sa[c]:]\n",
    "            i += 1  # tied so far\n",
    "        if i < len(p) and sa[c]+i >= len(t):\n",
    "            # the suffix ran out first, so it is a proper prefix of p\n",
    "            # and therefore p > T[sa[c]:]\n",
    "            plt = False\n",
    "        # at this point i is the number of characters p shares with T[sa[c]:]\n",
    "        if plt:\n",
    "            if c == l + 1:\n",
    "                return c\n",
    "            r = c\n",
    "            lcp_rp = i\n",
    "        else:\n",
    "            if c == r - 1:\n",
    "                return r\n",
    "            l = c\n",
    "            lcp_lp = i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binarySearchSA_lcp1(t, sa, 'aba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binarySearchSA_lcp1(t, sa, 'bb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "binarySearchSA_lcp1(t, sa, 'aa')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}