{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "def suffixArray(s):\n", " ''' Given T return suffix array SA(T). Uses \"sorted\"\n", " function for simplicity, which is probably very slow. '''\n", " satups = sorted([(s[i:], i) for i in range(len(s))])\n", " return list(map(lambda x: x[1], satups)) # extract, return just offsets\n", "\n", "def bwtFromSa(t, sa=None):\n", " ''' Given T, returns BWT(T) by way of the suffix array. '''\n", " bw = []\n", " dollarRow = None\n", " if sa is None:\n", " sa = suffixArray(t)\n", " for si in sa:\n", " if si == 0:\n", " dollarRow = len(bw)\n", " bw.append('$')\n", " else:\n", " bw.append(t[si-1])\n", " return ''.join(bw), dollarRow # return string-ized version of list bw" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class FmCheckpoints(object):\n", " ''' Manages rank checkpoints and handles rank queries, which are\n", " O(1) time, with the checkpoints taking O(m) space, where m is\n", " length of text. '''\n", " \n", " def __init__(self, bw, cpIval=4):\n", " ''' Scan BWT, creating periodic checkpoints as we go '''\n", " self.cps = {} # checkpoints\n", " self.cpIval = cpIval # spacing between checkpoints\n", " tally = {} # tally so far\n", " # Create an entry in tally dictionary and checkpoint map for\n", " # each distinct character in text\n", " for c in bw:\n", " if c not in tally:\n", " tally[c] = 0\n", " self.cps[c] = []\n", " # Now build the checkpoints\n", " for i, c in enumerate(bw):\n", " tally[c] += 1 # up to *and including*\n", " if i % cpIval == 0:\n", " for c in tally.keys():\n", " self.cps[c].append(tally[c])\n", " \n", " def rank(self, bw, c, row):\n", " ''' Return # c's there are in bw up to and including row '''\n", " if row < 0 or c not in self.cps:\n", " return 0\n", " i, nocc = row, 0\n", " # Always walk to left (up) when calculating rank\n", " while (i % self.cpIval) != 0:\n", " if bw[i] == c:\n", " nocc += 1\n", " i -= 1\n", " return self.cps[c][i // self.cpIval] + nocc" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "st = 'teststring'\n", "# 0123456789\n", "cps = FmCheckpoints(st)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[1, 1, 1, 2, 2, 3, 3, 3, 3, 3]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# should get back list of integers, where elt i gives\n", "# # times 't' appears up to and including offset i\n", "[ cps.rank(st, 't', i) for i in range(10) ]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# likewise for 'g'\n", "[ cps.rank(st, 'g', i) for i in range(10) ]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "class FmIndex():\n", " ''' O(m) size FM Index, where checkpoints and suffix array samples are\n", " spaced O(1) elements apart. Queries like count() and range() are\n", " O(n) where n is the length of the query. Finding all k\n", " occurrences of a length-n query string takes O(n + k) time.\n", " \n", " Note: The spacings in the suffix array sample and checkpoints can\n", " be chosen differently to achieve different bounds. '''\n", " \n", " @staticmethod\n", " def downsampleSuffixArray(sa, n=4):\n", " ''' Take only the suffix-array entries for every nth suffix. Keep\n", " suffixes at offsets 0, n, 2n, etc with respect to the text.\n", " Return map from the rows to their suffix-array values. '''\n", " ssa = {}\n", " for i, suf in enumerate(sa):\n", " # We could use i % n instead of sa[i] % n, but we lose the\n", " # constant-time guarantee for resolutions\n", " if suf % n == 0:\n", " ssa[i] = suf\n", " return ssa\n", " \n", " def __init__(self, t, cpIval=4, ssaIval=4):\n", " if t[-1] != '$':\n", " t += '$' # add dollar if not there already\n", " # Get BWT string and offset of $ within it\n", " sa = suffixArray(t)\n", " self.bwt, self.dollarRow = bwtFromSa(t, sa)\n", " # Get downsampled suffix array, taking every 1 out of 'ssaIval'\n", " # elements w/r/t T\n", " self.ssa = self.downsampleSuffixArray(sa, ssaIval)\n", " self.slen = len(self.bwt)\n", " # Make rank checkpoints\n", " self.cps = FmCheckpoints(self.bwt, cpIval)\n", " # Calculate # occurrences of each character\n", " tots = dict()\n", " for c in self.bwt:\n", " tots[c] = tots.get(c, 0) + 1\n", " # Calculate concise representation of first column\n", " self.first = {}\n", " totc = 0\n", " for c, count in sorted(tots.items()):\n", " self.first[c] = totc\n", " totc += count\n", " \n", " def count(self, c):\n", " ''' Return number of occurrences of characters < c '''\n", " if c not in self.first:\n", " # (Unusual) case where c does not occur in text\n", " for cc in sorted(self.first.iterkeys()):\n", " if c < cc: return self.first[cc]\n", " return self.first[cc]\n", " else:\n", " return self.first[c]\n", " \n", " def range(self, p):\n", " ''' Return range of BWM rows having p as a prefix '''\n", " l, r = 0, self.slen - 1 # closed (inclusive) interval\n", " for i in range(len(p)-1, -1, -1): # from right to left\n", " l = self.cps.rank(self.bwt, p[i], l-1) + self.count(p[i])\n", " r = self.cps.rank(self.bwt, p[i], r) + self.count(p[i]) - 1\n", " if r < l:\n", " break\n", " return l, r+1\n", " \n", " def resolve(self, row):\n", " ''' Given BWM row, return its offset w/r/t T '''\n", " def stepLeft(row):\n", " ''' Step left according to character in given BWT row '''\n", " c = self.bwt[row]\n", " return self.cps.rank(self.bwt, c, row-1) + self.count(c)\n", " nsteps = 0\n", " while row not in self.ssa:\n", " row = stepLeft(row)\n", " nsteps += 1\n", " return self.ssa[row] + nsteps\n", " \n", " def hasSubstring(self, p):\n", " ''' Return true if and only if p is substring of indexed text '''\n", " l, r = self.range(p)\n", " return r > l\n", " \n", " def hasSuffix(self, p):\n", " ''' Return true if and only if p is suffix of indexed text '''\n", " l, r = self.range(p)\n", " off = self.resolve(l)\n", " return r > l and off + len(p) == self.slen-1\n", " \n", " def occurrences(self, p):\n", " ''' Return offsets for all occurrences of p, in no particular order '''\n", " l, r = self.range(p)\n", " return [ self.resolve(x) for x in range(l, r) ]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "fm = FmIndex('abaaba')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fm.hasSubstring('aaba')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fm.hasSubstring('aabb')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 5)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fm.range('a')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6, 7)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fm.range('baaba')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p, t = \"CAT\", \"TTGTGTGCATGTTGTTTCATCATTTAGAGATACATTGCGCTGCATCATGGTCA\"\n", "# 01234567890123456789012345678901234567890123456789012\n", "# Occurrences: * * * * * *\n", "fm = FmIndex(t)\n", "matches = sorted(fm.occurrences(p))\n", "matches == [7, 17, 20, 32, 42, 45]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 1 }