{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Suffix tree" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that this suffix-tree implementation stores string labels for the edges. In reality, to achieve the O(m) space bound, we store (offset, length) pairs instead, where a pair refers to the substring of T labeling the edge. We use substrings here to keep the code simpler." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "class SuffixTree(object):\n", " \n", " class Node(object):\n", " def __init__(self, lab):\n", " self.lab = lab # label on path leading to this node\n", " self.out = {} # outgoing edges; maps characters to nodes\n", " \n", " def __init__(self, s):\n", " \"\"\" Make suffix tree, without suffix links, from s in quadratic time\n", " and linear space \"\"\"\n", " s += '$'\n", " self.root = self.Node(None)\n", " self.root.out[s[0]] = self.Node(s) # trie for just longest suf\n", " # add the rest of the suffixes, from longest to shortest\n", " for i in range(1, len(s)):\n", " # start at root; we’ll walk down as far as we can go\n", " cur = self.root\n", " j = i\n", " while j < len(s):\n", " if s[j] in cur.out:\n", " child = cur.out[s[j]]\n", " lab = child.lab\n", " # Walk along edge until we exhaust edge label or\n", " # until we mismatch\n", " k = j+1 \n", " while k-j < len(lab) and s[k] == lab[k-j]:\n", " k += 1\n", " if k-j == len(lab):\n", " cur = child # we exhausted the edge\n", " j = k\n", " else:\n", " # we fell off in middle of edge\n", " cExist, cNew = lab[k-j], s[k]\n", " # create “mid”: new node bisecting edge\n", " mid = self.Node(lab[:k-j])\n", " mid.out[cNew] = self.Node(s[k:])\n", " # original child becomes mid’s child\n", " mid.out[cExist] = child\n", " # original child’s label is curtailed\n", " child.lab = lab[k-j:]\n", " # mid becomes new child of original parent\n", " cur.out[s[j]] = mid\n", " else:\n", " # Fell off tree at a node: make new edge hanging off it\n", " cur.out[s[j]] = self.Node(s[j:])\n", " \n", " def followPath(self, s):\n", " \"\"\" Follow path given by s. If we fall off tree, return None. If we\n", " finish mid-edge, return (node, offset) where 'node' is child and\n", " 'offset' is label offset. If we finish on a node, return (node,\n", " None). \"\"\"\n", " cur = self.root\n", " i = 0\n", " while i < len(s):\n", " c = s[i]\n", " if c not in cur.out:\n", " return (None, None) # fell off at a node\n", " child = cur.out[s[i]]\n", " lab = child.lab\n", " j = i+1\n", " while j-i < len(lab) and j < len(s) and s[j] == lab[j-i]:\n", " j += 1\n", " if j-i == len(lab):\n", " cur = child # exhausted edge\n", " i = j\n", " elif j == len(s):\n", " return (child, j-i) # exhausted query string in middle of edge\n", " else:\n", " return (None, None) # fell off in the middle of the edge\n", " return (cur, None) # exhausted query string at internal node\n", " \n", " def hasSubstring(self, s):\n", " \"\"\" Return true iff s appears as a substring \"\"\"\n", " node, off = self.followPath(s)\n", " return node is not None\n", " \n", " def hasSuffix(self, s):\n", " \"\"\" Return true iff s is a suffix \"\"\"\n", " node, off = self.followPath(s)\n", " if node is None:\n", " return False # fell off the tree\n", " if off is None:\n", " # finished on top of a node\n", " return '$' in node.out\n", " else:\n", " # finished at offset 'off' within an edge leading to 'node'\n", " return node.lab[off] == '$'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "stree = SuffixTree('there would have been a time for such a word')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stree.hasSubstring('nope')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stree.hasSubstring('would have been')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stree.hasSubstring('such a word')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stree.hasSuffix('would have been')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stree.hasSuffix('such a word')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.15" } }, "nbformat": 4, "nbformat_minor": 1 }