{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Let's make a class that implements an inverted (substring) index where the map data\n", "# structure is a sorted list of (substring, offset) pairs. Query w/ binary search.\n", "\n", "import sys\n", "import bisect # for binary search\n", "\n", "class IndexSorted(object):\n", " \n", " def __init__(self, t, ln, ival):\n", " \"\"\" Create index, extracting substrings of length 'ln' \"\"\"\n", " self.t = t\n", " self.ln = ln\n", " self.ival = ival\n", " self.index = []\n", " for i in range(len(t)-ln+1):\n", " self.index.append((t[i:i+ln], i)) # add pair\n", " self.index.sort() # sort pairs\n", " \n", " def query(self, p):\n", " \"\"\" Return candidate alignments for p \"\"\"\n", " st = bisect.bisect_left(self.index, (p[:self.ln], -1)) # binary search\n", " en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxsize)) # binary search\n", " hits = self.index[st:en] # this range of elements corresponds to the hits\n", " return [ h[1] for h in hits ] # return just the offsets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "index = IndexSorted('CGTGCCTACTTACTTACAT', 5, 4)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[8, 12]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index.query('CTTACTTA') # index: give us hints on where to look for this pattern" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index.query('TTTTTTTT') # index: give us hints on where to look for this pattern" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Now let's make a similar class but using a Python dictionary instead of a sorted\n", "# list. A Python dictionary is basically a hash table.\n", "\n", "class IndexHash(object):\n", " \n", " def __init__(self, t, ln, ival):\n", " \"\"\" Create index, extracting substrings of length 'ln' \"\"\"\n", " self.t = t\n", " self.ln = ln\n", " self.ival = ival\n", " self.index = {}\n", " for i in range(len(t)-ln+1):\n", " substr = t[i:i+ln]\n", " if substr in self.index:\n", " self.index[substr].append(i) # substring already in dictionary\n", " else:\n", " self.index[substr] = [i] # add to dictionary\n", " \n", " def query(self, p):\n", " \"\"\" Return candidate alignments for p \"\"\"\n", " return self.index.get(p[:self.ln], [])[:]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "index2 = IndexHash('CGTGCCTACTTACTTACAT', 5, 4)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[8, 12]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index2.query('CTTACTTA') # index: give us hints on where to look for this pattern" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index2.query('TTTTTTTT') # index: give us hints on where to look for this pattern " ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 1 }