{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import bisect\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Index(object):\n",
    "    def __init__(self, t, k):\n",
    "        ''' Create index from all substrings of size 'length' '''\n",
    "        self.k = k  # k-mer length (k)\n",
    "        self.index = []\n",
    "        for i in range(len(t) - k + 1):  # for each k-mer\n",
    "            self.index.append((t[i:i+k], i))  # add (k-mer, offset) pair\n",
    "        self.index.sort()  # alphabetize by k-mer\n",
    "    \n",
    "    def query(self, p):\n",
    "        ''' Return index hits for first k-mer of P '''\n",
    "        kmer = p[:self.k]  # query with first k-mer\n",
    "        i = bisect.bisect_left(self.index, (kmer, -1))  # binary search\n",
    "        hits = []\n",
    "        while i < len(self.index):  # collect matching index entries\n",
    "            if self.index[i][0] != kmer:\n",
    "                break\n",
    "            hits.append(self.index[i][1])\n",
    "            i += 1\n",
    "        return hits[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def queryIndex(p, t, index):\n",
    "    k = index.k\n",
    "    offsets = []\n",
    "    for i in index.query(p):\n",
    "        if p[k:] == t[i+k:i+len(p)]:  # verify that rest of P matches\n",
    "            offsets.append(i)\n",
    "    return offsets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'\n",
    "p = 'GGTATTCGGGA'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[21, 68]\n"
     ]
    }
   ],
   "source": [
    "index = Index(t, 4)\n",
    "print(queryIndex(p, t, index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    }
   ],
   "source": [
    "index = Index(t, 4)\n",
    "print(queryIndex('TTTT', t, index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[8, 24, 31, 41, 50, 54, 60, 62, 71]\n"
     ]
    }
   ],
   "source": [
    "index = Index(t, 2)\n",
    "print(queryIndex('AT', t, index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = 'There would have been a time for such a word'\n",
    "p = 'word'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[40]\n"
     ]
    }
   ],
   "source": [
    "index = Index(t, 4)\n",
    "print(queryIndex('word', t, index))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}