{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import bisect\n", "import sys" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class Index(object):\n", " def __init__(self, t, k):\n", " ''' Create index from all substrings of size 'length' '''\n", " self.k = k # k-mer length (k)\n", " self.index = []\n", " for i in range(len(t) - k + 1): # for each k-mer\n", " self.index.append((t[i:i+k], i)) # add (k-mer, offset) pair\n", " self.index.sort() # alphabetize by k-mer\n", " \n", " def query(self, p):\n", " ''' Return index hits for first k-mer of P '''\n", " kmer = p[:self.k] # query with first k-mer\n", " i = bisect.bisect_left(self.index, (kmer, -1)) # binary search\n", " hits = []\n", " while i < len(self.index): # collect matching index entries\n", " if self.index[i][0] != kmer:\n", " break\n", " hits.append(self.index[i][1])\n", " i += 1\n", " return hits[:]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def queryIndex(p, t, index):\n", " k = index.k\n", " offsets = []\n", " for i in index.query(p):\n", " if p[k:] == t[i+k:i+len(p)]: # verify that rest of P matches\n", " offsets.append(i)\n", " return offsets" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'\n", "p = 'GGTATTCGGGA'" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[21, 68]\n" ] } ], "source": [ "index = Index(t, 4)\n", "print(queryIndex(p, t, index))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[]\n" ] } ], "source": [ "index = Index(t, 4)\n", "print(queryIndex('TTTT', t, index))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[8, 24, 31, 41, 50, 54, 60, 62, 71]\n" ] } ], "source": [ "index = Index(t, 2)\n", "print(queryIndex('AT', t, index))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "t = 'There would have been a time for such a word'\n", "p = 'word'" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[40]\n" ] } ], "source": [ "index = Index(t, 4)\n", "print(queryIndex('word', t, index))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 1 }