{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import (\n", " \"fmt\"\n", " \"sort\"\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sorted list as index" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "// Let's make a class that implements an inverted (substring)\n", "// index where the map data structure is a sorted list of\n", "// (substring, offset) pairs. Query w/ binary search.\n", "\n", "type IndexItem struct {\n", " mer string\n", " offset int\n", "}\n", "\n", "type IndexSorted struct {\n", " length, interval int\n", " index []IndexItem\n", "}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "// Constructor for Boyer-Moore preprocessing object\n", "func NewIndexSorted(t string, ln, ival int) *IndexSorted {\n", " idx := new(IndexSorted)\n", " idx.length = ln\n", " idx.interval = ival\n", " // add all k-mers to the index\n", " for i := 0; i < len(t) - ln + 1; i++ {\n", " idx.index = append(idx.index, IndexItem{mer: t[i:i+ln], offset:i})\n", " }\n", " // sort the index, first by k-mer (alphabetically) then by offset\n", " sort.Slice(idx.index, func(i, j int) bool {\n", " if idx.index[i].mer != idx.index[j].mer {\n", " return idx.index[i].mer < idx.index[j].mer\n", " } else {\n", " return idx.index[i].offset < idx.index[j].offset\n", " }\n", " })\n", " return idx\n", "}" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "func (idx *IndexSorted) query(p string) []int {\n", " qu := p[:idx.length]\n", " // sort.Search does binary search, returning first\n", " // element for which function returns true\n", " st := sort.Search(len(idx.index), func(i int) bool { return qu <= idx.index[i].mer })\n", " en := sort.Search(len(idx.index), func(i int) bool { return qu < idx.index[i].mer })\n", " ret := make([]int, 0)\n", " for i := st; i < en; i++ {\n", " ret = append(ret, idx.index[i].offset)\n", " }\n", " return ret\n", "}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[8 12]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index := NewIndexSorted(\"CGTGCCTACTTACTTACAT\", 5, 4)\n", "index.query(\"CTTACTTA\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index.query(\"TTTTTTTT\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Map (hash table) as index" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "// Now let's make a similar class but using a map instead of a sorted\n", "// list. A Go map is basically a hash table.\n", "\n", "type IndexHash struct {\n", " length, interval int\n", " index map[string][]int\n", "}" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "func NewIndexHash(t string, ln, ival int) *IndexHash {\n", " idx := new(IndexHash)\n", " idx.length = ln\n", " idx.interval = ival\n", " idx.index = make(map[string][]int)\n", " // add all k-mers to the index (map)\n", " for i := 0; i < len(t) - ln + 1; i++ {\n", " mer := t[i:i+ln]\n", " idx.index[mer] = append(idx.index[mer], i)\n", " }\n", " return idx\n", "}" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "func (idx *IndexHash) query(p string) []int {\n", " qu := p[:idx.length]\n", " // No need for binary search here; just a lookup\n", " return idx.index[qu]\n", "}" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[8 12]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index := NewIndexHash(\"CGTGCCTACTTACTTACAT\", 5, 4)\n", "index.query(\"CTTACTTA\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index.query(\"TTTTTTTT\")" ] } ], "metadata": { "kernelspec": { "display_name": "Go", "language": "go", "name": "gophernotes" }, "language_info": { "codemirror_mode": "", "file_extension": ".go", "mimetype": "", "name": "go", "nbconvert_exporter": "", "pygments_lexer": "", "version": "go1.11" } }, "nbformat": 4, "nbformat_minor": 2 }