{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A simple example of a substring index; mirrors example from lecture notes\n",
    "\n",
    "# we're going to extract 4 substrings like this:\n",
    "# t:           CGTGCCTACTTACTTACAT\n",
    "# substring 1: CGTGC\n",
    "# substring 2:     CCTAC\n",
    "# substring 3:         CTTAC\n",
    "# substring 4:             CTTAC\n",
    "t = 'CGTGCCTACTTACTTACAT'\n",
    "\n",
    "# Parameters of the index, named once so that building the index and\n",
    "# querying it cannot drift apart\n",
    "SUBSTRING_LENGTH = 5    # length of each extracted substring\n",
    "SUBSTRING_INTERVAL = 4  # distance between starts of consecutive substrings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From t, make list of pairs, where first pair item is substring, second is its offset\n",
    "def substringize(t, ln, iv):\n",
    "    \"\"\"Return a list of (substring, offset) pairs extracted from t.\n",
    "\n",
    "    t  = text to extract substrings from\n",
    "    ln = length of substrings to extract\n",
    "    iv = distance between substrings to extract; e.g. 1 means take *every* substring\n",
    "\n",
    "    Only full-length substrings are returned, so a text shorter than ln\n",
    "    gives an empty list. Raises ValueError if ln or iv is not positive.\n",
    "    \"\"\"\n",
    "    # Non-positive values would silently give empty or meaningless slices\n",
    "    if ln <= 0 or iv <= 0:\n",
    "        raise ValueError('ln and iv must be positive, got ln={!r}, iv={!r}'.format(ln, iv))\n",
    "    # Same offsets as a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)\n",
    "    return [(t[i:i+ln], i) for i in range(0, len(t) - ln + 1, iv)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "substringize(t, SUBSTRING_LENGTH, SUBSTRING_INTERVAL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Like substringize, but uses a map data structure\n",
    "def mapize(t, ln, iv):\n",
    "    \"\"\"Return a dict mapping each extracted substring to its list of offsets.\n",
    "\n",
    "    t  = text to extract substrings from\n",
    "    ln = length of substrings to extract\n",
    "    iv = distance between substrings to extract; e.g. 1 means take *every* substring\n",
    "\n",
    "    Offsets for a given substring are listed in increasing order.\n",
    "    Raises ValueError if ln or iv is not positive.\n",
    "    \"\"\"\n",
    "    # Non-positive values would silently give empty or meaningless slices\n",
    "    if ln <= 0 or iv <= 0:\n",
    "        raise ValueError('ln and iv must be positive, got ln={!r}, iv={!r}'.format(ln, iv))\n",
    "    index = {}\n",
    "    for i in range(0, len(t) - ln + 1, iv):\n",
    "        # setdefault makes a new empty list the first time a substring is seen,\n",
    "        # and otherwise hands back the list that is already there\n",
    "        index.setdefault(t[i:i+ln], []).append(i)\n",
    "    return index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index = mapize(t, SUBSTRING_LENGTH, SUBSTRING_INTERVAL)\n",
    "index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = 'CTTACTTA'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[8, 12]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# index: give me a hint where I should look for occurrences of p in t\n",
    "# The lookup key is the prefix of p with the same length the index was built with.\n",
    "# .get(..., []) yields no hints, rather than a KeyError, for a prefix that isn't indexed.\n",
    "# These are only hints: each offset must still be checked against t\n",
    "# (here p really occurs at offset 8, but not at offset 12).\n",
    "index.get(p[:SUBSTRING_LENGTH], [])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}