{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Non-greedy SCS, for comparison:\n", "\n", "import itertools\n", "\n", "def overlap(a, b, min_length=3):\n", " \"\"\" Return length of longest suffix of 'a' matching\n", " a prefix of 'b' that is at least 'min_length'\n", " characters long. If no such overlap exists,\n", " return 0. \"\"\"\n", " start = 0 # start all the way at the left\n", " while True:\n", " start = a.find(b[:min_length], start) # look for b's suffx in a\n", " if start == -1: # no more occurrences to right\n", " return 0\n", " # found occurrence; check for full suffix/prefix match\n", " if b.startswith(a[start:]):\n", " return len(a)-start\n", " start += 1 # move just past previous match\n", "\n", "def scs(ss):\n", " \"\"\" Returns shortest common superstring of given\n", " strings, which must be the same length \"\"\"\n", " shortest_sup = None\n", " for ssperm in itertools.permutations(ss):\n", " sup = ssperm[0] # superstring starts as first string\n", " for i in range(len(ss)-1):\n", " # overlap adjacent strings A and B in the permutation\n", " olen = overlap(ssperm[i], ssperm[i+1], min_length=1)\n", " # add non-overlapping portion of B to superstring\n", " sup += ssperm[i+1][olen:]\n", " if shortest_sup is None or len(sup) < len(shortest_sup):\n", " shortest_sup = sup # found shorter superstring\n", " return shortest_sup # return shortest" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def pick_maximal_overlap(reads, k):\n", " \"\"\" Return a pair of reads from the list with a\n", " maximal suffix/prefix overlap >= k. Returns\n", " overlap length 0 if there are no such overlaps.\"\"\"\n", " reada, readb = None, None\n", " best_olen = 0\n", " for a, b in itertools.permutations(reads, 2):\n", " olen = overlap(a, b, min_length=k)\n", " if olen > best_olen:\n", " reada, readb = a, b\n", " best_olen = olen\n", " return reada, readb, best_olen\n", "\n", "def greedy_scs(reads, k):\n", " \"\"\" Greedy shortest-common-superstring merge.\n", " Repeat until no edges (overlaps of length >= k)\n", " remain. \"\"\"\n", " read_a, read_b, olen = pick_maximal_overlap(reads, k)\n", " while olen > 0:\n", " reads.remove(read_a)\n", " reads.remove(read_b)\n", " reads.append(read_a + read_b[-(len(read_b) - olen):])\n", " read_a, read_b, olen = pick_maximal_overlap(reads, k)\n", " return ''.join(reads)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'ABCDBCDA'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Small example where brute-force gives shorter superstring than greedy\n", "scs(['ABCD', 'CDBC', 'BCDA'])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'CDBCABCDA'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Example from lecture notes\n", "greedy_sup = greedy_scs(['BAA', 'AAB', 'BBA', 'ABA', 'ABB', 'BBB', 'AAA', 'BAB'], 1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('AAABBBABAAB', 11)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "greedy_sup, len(greedy_sup)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "scs_sup = scs(['BAA', 'AAB', 'BBA', 'ABA', 'ABB', 'BBB', 'AAA', 'BAB'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('BAAABABBBA', 10)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Greedy SCS superstring might be longer than optimal\n", "scs_sup, len(scs_sup)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }