{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A simple example of a substring index; mirrors example from lecture notes\n",
    "\n",
    "# we're going to extract 4 substrings like this:\n",
    "# t:           CGTGCCTACTTACTTACAT\n",
    "# substring 1: CGTGC\n",
    "# substring 2:     CCTAC\n",
    "# substring 3:         CTTAC\n",
    "# substring 4:             CTTAC\n",
    "t = 'CGTGCCTACTTACTTACAT'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From t, make list of pairs, where first pair item is substring, second is its offset\n",
    "def substringize(t, ln, iv):\n",
    "    # ln = length of substrings to extract\n",
    "    # iv = distance between substings to extract; e.g. 1 means take *every* substring\n",
    "    pairs = []\n",
    "    # Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)\n",
    "    for i in range(0, len(t) - ln + 1, iv):\n",
    "        pairs.append((t[i:i+ln], i))\n",
    "    return pairs\n",
    "    # Could also have used list comprehension:\n",
    "    # return [ (t[i:i+ln], i) for i in range(0, len(t) - ln + 1, iv) ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "substringize('CGTGCCTACTTACTTACAT', 5, 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Like substringize, but uses a map data structure\n",
    "def mapize(t, ln, iv):\n",
    "    index = {}\n",
    "    for i in range(0, len(t) - ln + 1, iv):\n",
    "        sub = t[i:i+ln]\n",
    "        if sub in index:\n",
    "            index[sub].append(i) # already have an entry; append to it\n",
    "        else:\n",
    "            index[sub] = [i] # don't yet have entry, make new one\n",
    "    return index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index = mapize('CGTGCCTACTTACTTACAT', 5, 4)\n",
    "index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = 'CTTACTTA'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[8, 12]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# index: give me a hint where I should look for occurrences of p in t\n",
    "index[p[:5]]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}