{ "cells": [ { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "### CODE CHALLENGE: Solve the String Composition Problem.\n", " \n", "def kmer_composition(text,k):\n", " \"\"\"Input: An integer k and a string Text.\n", " Output: Compositionk(Text) (the k-mers can be provided in any order).\"\"\"\n", " composition = []\n", " for i in range(len(text)-k+1):\n", " composition.append(text[i:i+k])\n", " return composition " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['CAATC', 'AATCC', 'ATCCA', 'TCCAA', 'CCAAC']\n" ] }, { "data": { "text/plain": [ "'CAATC AATCC ATCCA TCCAA CCAAC'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = 'CAATCCAAC'\n", "x = kmer_composition(text,5)\n", "print x\n", "' '.join(str(i) for i in x)" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": false }, "outputs": [], "source": [ "###String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.\n", "def string_from_genome_path(kmers):\n", " return kmers[0] + ''.join(map(lambda x: x[-1], kmers[1:])) " ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'ACCGAAGCT'" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kmers = ['ACCGA', 'CCGAA', 'CGAAG', 'GAAGC', 'AAGCT']\n", "string_from_genome_path(kmers)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'AGACTGAAAGTCAATGCAGGTTGTCAAGAGCTGGTCGGAGGAGCCATTATTGTTAACGGGTCCCGGTAATCCGTCATGTGTATCCTACATGTCAATAGAACGACACAAAGGCTAAAATGATCTGTCGATGATACACATGTCCTGCAAGACAACTGACGCTAAGGTTTGAACTGCAGAGGGTTTGTCCAACACACATAATCTCGGCACATCTTATTAAGCGCCCCGAATCGCTTTGTTGGATTAACACGCCTCGACGCCTGAATAGGAACCACATCCCCGTAACTGAGCTCGTGCACAGAGCTTCATCGCATAGGTCACTCAAGACTTGTGGCTTGTTCATCGTAGGCCGCATTACAAGGCAATAGCATGTCGTGGTGTGTACAGCAGAATCCCCGTAGACCAAAAGAGAGAAGTGCCCCGGCGTCCTAACTCTAGCGCTTCATTACTAAGGGACGCAAACATTGAACGTCCTGTAATTTAAAGCATGGAGCTTTATGGTAGAATATAAACCCTGTGGCTTTCAGCTTAAAAATTGTCCCAATAGCCGGTTTACTTATCGGCCGCGTGTCGCCAACTGTGGCTTGCGTAGGTCGCGTTGGTAAGGGGCACAGCGTTTTGGTCCTCAATCCGTGCGGTCTGTCTTAGTACCAAGCAAAGCAGTGCCTTGCAGATCTCCGTGCACACAAAGCGAGTGTAACGCAACTGCCAATCCCCGAGTCCATAGGAGGTTAGACTTCCCAAAAAAACTACCAATGTGGATATGTCTCCGTTTGACCCTGATCAATTCGCACTTGCTTGTCATTCCCACGACCTCCATTTTTAGCCATAGACAGCGTACTGTTCCCGATGATAAGCGTTGCCACGATGGCTAGACTAATCCACCGCTAAAGTGTGAAAAAGCGAGCTGAAGGTGATATGATAGTCCAGTACAGTGTGCTACTACGGGGTAGAGGATCCAATAGGTTTCAGCCGAATGAAGAATAATCATCGCAGCTGGCTTGATCCGGGCGCGTAACTGCTGCGAGAGAAGTCAACTGGTGGTGACCTCGCGGACTTTCCCTCAGATAGACGCATACGTTCACTAGACAGCAGCTGAACACGTATTGTTGATGGGGATTTAGGCGGTTTATCGTACGCCGCATGGAATGCAGCAACCGACAATGAAAGACGAGCACGAGCCGATAGCTCGCTCATAATTTCCGTGGTGTTGATAGAATGTGCTTAAGCTACCAGGCGTCTTTTACATCCTATTCCGTTCACTTTTATTACACTTACTCCGGCGTATCCCAGGGAGCATACCACAGCTTGGATCATAGAGATCTGTAGCCGGCTTGCATGGGCAGAAGCTATGAGGGAAAGAAAGGTGACGAAACATACGTACAGTAAAGGCACAAGCTCTAGCCTTCTAGCATGTTGCACGACAGTCTGCGTGTACTTCCACACCACACTCGTTTGGTTTCATCTCACGTAAAGCGTACAACATCCGTTTCGGCACGGGCGTTTACGTTAAAACGCGCCGCGTAACCTTAATACAGTCGGTTCAGGTGCATTGCGACCGACACATTAACTCAAGGTGATGGCTAGAAGGGGCTCAGTTCGGGCCTAACGCCTACTGAACCTACTGCGGTCGACTCAGAGGGTATATTGATTATGCCATCGTGGACATGTCACAATTTCGGACGAGGCGACCCCAAGGGCTGCGGTCCAATGCCTTGACTTGAGCACACCTGGAGCATTCGCTGTCCTCTACCCTAGAGTTCAAGTCGTGGGACGGTACTCGACCTTCCGCCTCCGCTTGGTGGCAGACTCTGATCGCCCTCACATCCTACATGGTACCTTAGGATTAAGCTTCCGAAAGGGGATATACTCACCCGTATCCAACCTCTATTTATAGGTGCAACTCGCGGCATGAATCGCCATCAAGAGTCTGCGACACTATACCCTACGCCCTCGAGGAAGCAACTCAGAAGCATACTGTGCTAGTGAGATATTAATGGAGTAACCATCCTACGGGTCATTCCAATTACTCTGTACGAATATGTGCTCTAAGCTTCTAGCGGGTCCACAAGAATACCGTCGGCCTCGTCCTGGTCCTGACGCCCCAGCTCGCGGATAATTGCCCATTAGCGGACGAGGTTCAGTCCCACCGACCCCACAGATTTTCCAGACCATCGGGGCCCGATCGCTCGCTGAGCACCTAGTCTCGGGGGATCTGTACCGATGGAAAACCCGACACAGGCAGAGGTCCCGGCGACAGCCGCACTTAATGTACCTCTCTCCAAGTTCATGAATCTGACGGGACGATGCAGACCTAATAACCAAGCCGAGCTTCTATTAGGAAAGAGGATCCGCCTTGGGCACCTGGATGACTCACAGAAAATTATGGGTTCTATGGACGCGTCTTGTATACAGATAGTCGGAAGACACCATGGTGGACAGCGGCGAACGAGGTACGATTTTTCGACATCCACGCGTTCCTCAGGGGGGCTTGGTAGGCCATGATTGAAGGACACATCAGGGGAGTCAACCAATGGTGCAGGTGGGACAAGCGTTGGGCAGGCTTGGACAGCAGGGAATCGCCACCAGGTAATACGTAGCGTAACAGCTTAAGAGGTTGGCCGTAGAACCGGCATTCTTAACTATGAGCATATACCAACCCCCCCTCCATCCGAGAGGAGCCTATCCGATCCGTCGTATCACATATGGGCGCATGCATGGACTACGGGACGGCTAGATATCTCCAATGGTAGGGGCGTGATGGTAAGTCTCTGATAGAGCTTTTCTTTTTATCGAGTACACGTCTAAGTAGCAGTGGCAGACCCATGGAGTCGCCGGTCATAAAGCCATTACCCGTCGTGCATGTGTGAGCATTTTGGAGCCTCTCGGATGATTATTAGTATCCGCCAGCATGATACAAAAGGCTAGTAGAGTGGCAGATACTAAGCGGTATGGAGCAAACTATACCTACTGAACCATAGGATTGATTTCTTCGTGTCAAGGCCGGTATGGCTTCAGAATTCTTTTGTTATAGTATGACTCGATGTATGATGGTCCGCGCCCTACCCCCCGGAGTGTACCTAAAAGAAATCTGCAGCGCGCCGAGTTATGACCTAGGTGAACCTACAATGCTGGCCGCGTGACTCGGAAGCAACTACCGTTAAATAATCATTCTCTAATGGAGCTAAAAGGCCATGCTCTAGGACTCGGTCGCGGCAAAGTCCCGGCTAAACTTATCCCCCATAGTGCGCCCCAACTAGACCTGCCAATCGGACGTTTTGAGGAGCACCGTATCATTGTCACGACTGCCGTGGGCGCTAAACCTAACTAGTCACACATGGATGTACACTTGAGGTGGCGTTACTCTGCACGGCCTATTGAAAGCGCGCGCCTGCTAAGTGCCCTTAGCGCTCTTAATTTTGGCGCGAGCATGTGTAGGTATCAATCCAGCGCCAACGCTTATTCGGCTATTGCGGAAAGTTAAGCTAACTACACTCGCTATCACCCAGACCATTGATTAGGATGATGTTACTAGCAACCGATCGGCAACCCTGCCGTCATGGTTTGTGTGGGTTTGGCTAGTTCCAAAACCTATTGTAACTTGAACTGGTGAGTGACTGGCCTGCATGAGGCTGTGCTTCTGGCAGAAAGGAGCGATCCAGGATATGAAAATAAACGAATCGCCACGTGAGCTCACTAAGTCTTCACCCATGTAGCACGAGTCTGGCATTTAATACCTACACGTTAGAGTCCACGATCCTATAACATGGGTAGCTGCCGTGAGAAGTCAATCATGGCACCACCATGCTTTAGCAAGCAGGGTCCAAATGAGAAGACTCCACTTACAGATATCAAAACCGAGCCTTAAGCACCTGAACGCGCCTTTCTTGCGTATTCCACATCCCACTTGGCTGCTTGAGAGAGTCGATATCGAAGTTAGGTCCGATTGTAATTCGCACGGGGTAGAGTGCCCAAGGCCAAAACTGATTAGTTTCCATAGAATCGGTGCTAGTAGTAGACAAGGGTGCGGCGGAGTCATCGTTTATGACTCTCTCCCGCAACATTGAAGCCGGCAAAGTAGATAAGGCGTTTATAAGACTGAGACTACGTGGTCCCTTACTATTTTTGCTGGAGGAGGACGCCGCACGCAGCATGCTTATAAGCACGTTGATGGATAGTGCCCACCCTTCAGTGGAAGAGTGTACGATTCGCCAGAGCGGAGGAACAGATTTCTCCGTGTCTGTATACTTTAGCGCCGTCACCCCACTGACTCATCGGTTAGTACAGCACGAATGGGGCCTTGGGACGCCGTAGTACCAGACGCATTTTTGAGAGTCTCGCTATATCGGCCCTCTGATACTCAGAGTGTCGTTAATCGTAGCTACTTATGAAAAGCTATTACGATCCTCAAATATTCGGACTTGAAGATCAGTTAAGCCTCAGTCGCCGGTCGTTAAAACTAACTCCTGGCTCTTAGGACGGAGAGTAGCAATATACGTGACATTGGCTTCACACTCTGTATTAGCTACACGTGCAGCTGTACAATTTTAGCCTTACAGAGTTTATGCGAAGTTTCACCCTTGTTGCCGTACAAGTAGGTAGTAAGTTTGCCACGTACTCAGAAGGCCTAAGCCTCAACACTGTTCGCTGCCTCCATGGGGGGGAGGCTTTGTTCCAGGCTCTGCTTTGTTTCCAACTAATTCTGAATGCTGAGCTAGGCATATTCGTGTATCCGGGGCAAACTCTTACGGTCTAAGTATATAGAACAGTGTCGTGCCAACCGGCCAGGTGTTCCCTTTGTCCGGTGGTGTATATTTAGCTCCAACAGACATCTGCAGCGGAATTTAAAGTCGTTATTTATTTCTTCGCGTAAGCCCATAATATTGGCATACCGGCCCAGTAATCGCGGCAAGTGATAGTGGCAACGCACAGACAGGGTTTAGCTCAAACTGACCGCCCACACGTAGATGGCTGTTATAGAAGGTTTTACGATTCAGACCTTC'" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kmers = [line.strip() for line in open('dataset_198_3.txt', 'r')]\n", "string_from_genome_path(kmers)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def overlap(a, b, min_length=3):\n", " \"\"\" Return length of longest suffix of 'a' matching\n", " a prefix of 'b' that is at least 'min_length'\n", " characters long. If no such overlap exists,\n", " return 0. \"\"\"\n", " start = 0 # start all the way at the left\n", " while True:\n", " start = a.find(b[:min_length], start) # look for b's prefix in a\n", " if start == -1: # no more occurrences to right\n", " return 0\n", " # found occurrence; check for full suffix/prefix match\n", " if b.startswith(a[start:]):\n", " return len(a)-start\n", " start += 1 # move just past previous match" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from itertools import permutations\n", "list(permutations([1,2,3],3))" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('ACGGATC', 'GATCAAGT'),\n", " ('ACGGATC', 'TTCACGGA'),\n", " ('GATCAAGT', 'ACGGATC'),\n", " ('GATCAAGT', 'TTCACGGA'),\n", " ('TTCACGGA', 'ACGGATC'),\n", " ('TTCACGGA', 'GATCAAGT')]" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reads = ['ACGGATC', 'GATCAAGT', 'TTCACGGA']\n", "list(permutations(reads, 2))" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Solve the Overlap Graph Problem (restated below).\n", "from itertools import permutations\n", "from collections import defaultdict\n", "def overlap_graph(reads):\n", " '''Input: A collection Patterns of k-mers.\n", " Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. (You may return the edges in any order.)'''\n", " olaps = defaultdict(list)\n", " for a, b in permutations(reads, 2):\n", " if a[1:] == b[:-1]:\n", " olaps[a].append(b)\n", " return olaps" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AGGCA -> GGCAT\n", "GGCAT -> GCATG\n", "GCATG -> CATGC\n", "CATGC -> ATGCG\n" ] } ], "source": [ "reads = ['ATGCG','GCATG','CATGC','AGGCA','GGCAT']\n", "graph = (overlap_graph(reads))\n", "for x,y in graph.iteritems():\n", " print x,'->', ' '.join(y)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a -> b\n", "b -> c\n" ] } ], "source": [ "d = {}\n", "d['a'] = 'b'\n", "d['b'] = 'c'\n", "d\n", "for x,y in d.iteritems():\n", " print x,'->',y" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Solve the De Bruijn Graph from a String Problem.\n", "from collections import defaultdict\n", "def debruijnize(st, k):\n", " \"\"\" Return a adjacency list, for each k-mer, its left\n", " k-1-mer and its right k-1-mer in a pair \"\"\"\n", " edges = defaultdict(list)\n", " for i in range(len(st) - k + 1):\n", " kmer = st[i:i+k]\n", " edges[kmer[:-1]].append(kmer[1:])\n", " return edges" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GC -> CG\n", "AC -> CG\n", "GT -> TC\n", "CG -> GC,GT\n", "TC -> CG\n" ] } ], "source": [ "edges = debruijnize(\"ACGCGTCG\", 3)\n", "for x,y in edges.iteritems():\n", " print x,'->', ','.join(y)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def de_bruijn_ize(st, k):\n", " \"\"\" Return a list holding, for each k-mer, its left\n", " k-1-mer and its right k-1-mer in a pair \"\"\"\n", " edges = []\n", " nodes = set()\n", " for i in range(len(st) -k + 1):\n", " edges.append((st[i:i+k-1], st[i+1:i+k]))\n", " nodes.add(st[i:i+k-1])\n", " nodes.add(st[i+1:i+k])\n", " return nodes, edges" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def visualize_de_bruijn(st, k):\n", " \"\"\" Visualize a directed multigraph using graphviz \"\"\"\n", " nodes, edges = de_bruijn_ize(st, k)\n", " dot_str = 'digraph \"DeBruijn graph\" {\\n'\n", " for node in nodes:\n", " dot_str += ' %s [label=\"%s\"] ;\\n' % (node, node)\n", " for src, dst in edges:\n", " dot_str += ' %s -> %s ;\\n' % (src, dst)\n", " return dot_str + '}\\n'" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The gvmagic extension is already loaded. To reload it, use:\n", " %reload_ext gvmagic\n" ] } ], "source": [ "%load_ext gvmagic" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "DeBruijn graph\n", "\n", "\n", "A\n", "\n", "A\n", "\n", "\n", "A->A\n", "\n", "\n", "\n", "\n", "T\n", "\n", "T\n", "\n", "\n", "A->T\n", "\n", "\n", "\n", "\n", "A->T\n", "\n", "\n", "\n", "\n", "A->T\n", "\n", "\n", "\n", "\n", "C\n", "\n", "C\n", "\n", "\n", "C->A\n", "\n", "\n", "\n", "\n", "C->C\n", "\n", "\n", "\n", "\n", "T->A\n", "\n", "\n", "\n", "\n", "T->T\n", "\n", "\n", "\n", "\n", "G\n", "\n", "G\n", "\n", "\n", "T->G\n", "\n", "\n", "\n", "\n", "T->G\n", "\n", "\n", "\n", "\n", "T->G\n", "\n", "\n", "\n", "\n", "G->A\n", "\n", "\n", "\n", "\n", "G->C\n", "\n", "\n", "\n", "\n", "G->T\n", "\n", "\n", "\n", "\n", "G->G\n", "\n", "\n", "\n", "\n", "G->G\n", "\n", "\n", "\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%dotstr visualize_de_bruijn(\"TAATGCCATGGGATGTT\", 2)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "DeBruijn graph\n", "\n", "\n", "AA\n", "\n", "AA\n", "\n", "\n", "AT\n", "\n", "AT\n", "\n", "\n", "AA->AT\n", "\n", "\n", "\n", "\n", "GT\n", "\n", "GT\n", "\n", "\n", "TT\n", "\n", "TT\n", "\n", "\n", "GT->TT\n", "\n", "\n", "\n", "\n", "CC\n", "\n", "CC\n", "\n", "\n", "CA\n", "\n", "CA\n", "\n", "\n", "CC->CA\n", "\n", "\n", "\n", "\n", "CA->AT\n", "\n", "\n", "\n", "\n", "GG\n", "\n", "GG\n", "\n", "\n", "GG->GG\n", "\n", "\n", "\n", "\n", "GA\n", "\n", "GA\n", "\n", "\n", "GG->GA\n", "\n", "\n", "\n", "\n", "GC\n", "\n", "GC\n", "\n", "\n", "GC->CC\n", "\n", "\n", "\n", "\n", "TG\n", "\n", "TG\n", "\n", "\n", "AT->TG\n", "\n", "\n", "\n", "\n", "AT->TG\n", "\n", "\n", "\n", "\n", "AT->TG\n", "\n", "\n", "\n", "\n", "GA->AT\n", "\n", "\n", "\n", "\n", "TG->GT\n", "\n", "\n", "\n", "\n", "TG->GG\n", "\n", "\n", "\n", "\n", "TG->GC\n", "\n", "\n", "\n", "\n", "TA\n", "\n", "TA\n", "\n", "\n", "TA->AA\n", "\n", "\n", "\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%dotstr visualize_de_bruijn(\"TAATGCCATGGGATGTT\", 3)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "DeBruijn graph\n", "\n", "\n", "ATG\n", "\n", "ATG\n", "\n", "\n", "TGT\n", "\n", "TGT\n", "\n", "\n", "ATG->TGT\n", "\n", "\n", "\n", "\n", "TGC\n", "\n", "TGC\n", "\n", "\n", "ATG->TGC\n", "\n", "\n", "\n", "\n", "TGG\n", "\n", "TGG\n", "\n", "\n", "ATG->TGG\n", "\n", "\n", "\n", "\n", "GCC\n", "\n", "GCC\n", "\n", "\n", "CCA\n", "\n", "CCA\n", "\n", "\n", "GCC->CCA\n", "\n", "\n", "\n", "\n", "GTT\n", "\n", "GTT\n", "\n", "\n", "TGT->GTT\n", "\n", "\n", "\n", "\n", "AAT\n", "\n", "AAT\n", "\n", "\n", "AAT->ATG\n", "\n", "\n", "\n", "\n", "GAT\n", "\n", "GAT\n", "\n", "\n", "GAT->ATG\n", "\n", "\n", "\n", "\n", "CAT\n", "\n", "CAT\n", "\n", "\n", "CAT->ATG\n", "\n", "\n", "\n", "\n", "TGC->GCC\n", "\n", "\n", "\n", "\n", "GGG\n", "\n", "GGG\n", "\n", "\n", "GGA\n", "\n", "GGA\n", "\n", "\n", "GGG->GGA\n", "\n", "\n", "\n", "\n", "GGA->GAT\n", "\n", "\n", "\n", "\n", "TAA\n", "\n", "TAA\n", "\n", "\n", "TAA->AAT\n", "\n", "\n", "\n", "\n", "CCA->CAT\n", "\n", "\n", "\n", "\n", "TGG->GGG\n", "\n", "\n", "\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%dotstr visualize_de_bruijn(\"TAATGCCATGGGATGTT\", 4)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "DeBruijn graph\n", "\n", "\n", "AA\n", "\n", "AA\n", "\n", "\n", "AT\n", "\n", "AT\n", "\n", "\n", "AA->AT\n", "\n", "\n", "\n", "\n", "GT\n", "\n", "GT\n", "\n", "\n", "TT\n", "\n", "TT\n", "\n", "\n", "GT->TT\n", "\n", "\n", "\n", "\n", "CC\n", "\n", "CC\n", "\n", "\n", "CA\n", "\n", "CA\n", "\n", "\n", "CC->CA\n", "\n", "\n", "\n", "\n", "CA->AT\n", "\n", "\n", "\n", "\n", "GG\n", "\n", "GG\n", "\n", "\n", "GG->GG\n", "\n", "\n", "\n", "\n", "GA\n", "\n", "GA\n", "\n", "\n", "GG->GA\n", "\n", "\n", "\n", "\n", "GC\n", "\n", "GC\n", "\n", "\n", "GC->CC\n", "\n", "\n", "\n", "\n", "TG\n", "\n", "TG\n", "\n", "\n", "AT->TG\n", "\n", "\n", "\n", "\n", "AT->TG\n", "\n", "\n", "\n", "\n", "AT->TG\n", "\n", "\n", "\n", "\n", "GA->AT\n", "\n", "\n", "\n", "\n", "TG->GT\n", "\n", "\n", "\n", "\n", "TG->GG\n", "\n", "\n", "\n", "\n", "TG->GC\n", "\n", "\n", "\n", "\n", "TA\n", "\n", "TA\n", "\n", "\n", "TA->AA\n", "\n", "\n", "\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%dotstr visualize_de_bruijn(\"TAATGGGATGCCATGTT\", 3)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import defaultdict\n", "def debruijn_from_kmer(kmers):\n", " '''Input: A collection of k-mers Patterns.\n", " Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''\n", " edges = defaultdict(list)\n", " for kmer in kmers:\n", " edges[kmer[:-1]].append(kmer[1:])\n", " return edges" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AGG -> GGG\n", "CAG -> AGG,AGG\n", "GAG -> AGG\n", "GGA -> GAG\n", "GGG -> GGG,GGA\n" ] } ], "source": [ "kmers = ['GAGG','CAGG','GGGG','GGGA','CAGG','AGGG','GGAG']\n", "for x,y in sorted(debruijn_from_kmer(kmers).iteritems()):\n", " print x,'->', ','.join(y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Eulerian cycle and eulerian walk" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def generate_edges(graph):\n", " edges = []\n", " for node in graph:\n", " for neighbour in graph[node]:\n", " edges.append((node, neighbour))\n", "\n", " return edges" ] }, { "cell_type": "code", "execution_count": 143, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0, 3), (1, 0), (2, 1), (2, 6), (3, 2), (4, 2), (5, 4), (6, 5), (6, 8), (7, 9), (8, 7), (9, 6)]\n" ] } ], "source": [ "graph = {0:[3],1:[0],2:[1,6],3:[2], 4:[2], 5:[4], 6:[5,8], 7:[9], 8:[7], 9:[6]}\n", "edges = generate_edges(graph)\n", "print edges" ] }, { "cell_type": "code", "execution_count": 157, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def nodes_degrees(edges):\n", " ''' This returns a dictionary of nodes and their degrees. The degree of a vertex is the number of edges connecting\n", " it, i.e. the number of adjacent vertices. Loops are counted double, i.e. every occurence of vertex in the list \n", " of adjacent vertices. '''\n", "\n", " degrees = dict() \n", " for a, b in edges:\n", " degrees[a] = degrees.get(a, 0) + 1\n", " degrees[b] = degrees.get(b, 0) + 1\n", " return degrees" ] }, { "cell_type": "code", "execution_count": 158, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0: 2, 1: 2, 2: 4, 3: 2, 4: 2, 5: 2, 6: 4, 7: 2, 8: 2, 9: 2}" ] }, "execution_count": 158, "metadata": {}, "output_type": "execute_result" } ], "source": [ "edges = generate_edges(graph)\n", "nodes_degrees(edges)" ] }, { "cell_type": "code", "execution_count": 159, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def highest_degree_node(edges):\n", " degrees = nodes_degrees(edges)\n", " return max(degrees, key=degrees.get)" ] }, { "cell_type": "code", "execution_count": 160, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ "highest_degree_node(edges)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [], "source": [ "g = {0:[3],1:[0],2:[1,6],3:[2], 4:[2], 5:[4], 6:[5,8], 7:[9], 8:[7], 9:[6]}\n", "def eulerianCycle(g):\n", " \"\"\" Input: The adjacency list of an Eulerian directed graph.\n", " Output: An Eulerian cycle in this graph.\"\"\"\n", " tour = []\n", " src = g.iterkeys().next() # pick arbitrary starting node\n", " def visit(n):\n", " while len(g[n]) > 0:\n", " dst = g[n].pop()\n", " visit(dst)\n", " tour.append(n)\n", " visit(src)\n", " tour = tour[::-1] # reverse and then take all but last node\n", " return tour" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'0->3->2->6->8->7->9->6->5->4->2->1->0'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tour = eulerianCycle(g)\n", "'->'.join(str(i) for i in tour)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def read_input(filename):\n", " f = open(filename, 'r')\n", " g = dict()\n", " for line in f:\n", " line = line.strip()\n", " line = line.split(' -> ')\n", " g[int(line[0])] = [int(x) for x in line[1].split(',')]\n", " return g" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'0->2->6->541->1215->1214->1213->541->542->543->6->31->32->771->1355->2441->2442->2440->1355->1356->1354->771->770->769->32->33->156->911->910->1203->1201->1202->1674->2189->2190->2188->1674->1672->1673->1202->910->912->1232->2903->2902->2904->1232->1873->1874->1875->1232->1231->1233->912->156->154->553->555->554->932->931->933->554->2488->2489->2490->554->2201->2200->2202->554->2088->2086->2087->554->1184->1183->1185->2103->2102->2101->2396->2395->2397->2101->1185->554->154->155->33->6->4->8->83->231->1723->1725->1800->1799->1798->1725->1724->231->229->230->83->82->84->8->39->48->62->688->1088->1586->1585->1587->1088->1089->1087->688->690->967->1250->1249->1251->967->968->969->690->1486->1487->1488->690->689->62->63->61->48->1365->1363->2030->2031->2029->1363->1364->48->47->514->516->627->625->626->516->515->876->875->874->515->2323->2325->2324->515->47->483->482->481->530->1438->1440->1439->530->529->680->1860->1858->1859->2041->2042->2956->2957->2958->2042->2043->1859->680->679->681->529->531->926->927->1111->1113->1112->927->925->531->481->47->46->39->37->215->2057->2056->2058->215->216->1661->1660->1662->216->1015->1971->1970->1969->1015->1017->1016->216->214->2558->2559->2652->2651->2650->2559->2571->2570->2569->2559->2557->214->37->38->8->1621->1622->2452->2454->2453->1622->1623->2433->2432->2431->1623->8->126->124->478->480->962->1128->1167->1165->1166->1171->1930->1931->1932->1171->1173->1172->1166->1128->1127->1126->962->961->1037->1038->1036->1581->1579->1580->1639->1640->1641->1580->1036->961->963->480->1257->1256->2513->2512->2514->1256->1255->480->479->1749->1748->1747->479->124->125->238->240->1278->1276->1277->240->239->2018->2938->2940->2939->2018->2536->2537->2538->2018->2017->2019->239->125->8->7->9->24->462->949->950->2684->2683->2685->950->951->462->1655->1656->1654->462->460->1446->1444->1445->2337->2335->2336->1445->460->461->24->1839->1837->1838->24->22->96->1146->1145->1144->96->95->114->261->323->322->324->465->511->2920->2921->2922->511->512->513->465->1216->1217->1497->1495->1496->1217->1218->465->464->520->865->867->866->986->985->987->1681->1683->1682->987->866->520->1510->1511->1512->520->522->521->464->463->324->261->260->259->1319->1318->1320->259->114->113->268->878->879->877->268->1180->1389->2241->2240->2239->2386->2387->2388->2239->1389->1387->1388->1180->1182->1181->268->270->1626->1625->1624->270->269->363->361->362->269->113->218->894->893->892->218->217->2290->2326->2328->2882->2881->2883->2328->2327->2290->2291->2292->217->1631->1632->1630->217->219->113->112->330->329->728->727->1018->1019->1020->727->729->1062->1061->1060->2230->2232->2889->2888->2887->2232->2231->1060->729->329->328->525->2690->2691->2689->525->1407->1405->1406->1771->1977->1975->1976->1771->1773->1772->1775->1774->1776->1772->1406->525->523->524->1191->1705->1854->1853->1852->1705->1706->1707->1191->1190->1189->1649->1648->1650->2141->2140->2142->1650->1189->524->328->2553->2551->2552->328->112->213->2149->2151->2150->213->211->351->349->2343->2835->2833->2834->2343->2342->2341->349->350->2678->2677->2679->350->211->212->341->564->897->896->895->1043->1118->1117->1119->1043->1044->1179->1994->1995->1993->1179->1177->1178->1044->1042->895->564->562->1334->2839->2841->2840->1334->1909->1911->1910->1334->1335->1333->1419->1417->1418->2524->2525->2526->1418->1333->562->1285->1287->1286->562->1109->1108->1110->1761->1759->1760->1110->562->563->341->366->364->365->341->1281->1280->1279->341->340->452->451->798->797->2457->2455->2456->797->796->2159->2768->2769->2767->2159->2158->2160->796->451->545->602->755->754->756->602->603->601->545->544->546->585->2620->2622->2898->2896->2897->2622->2621->585->584->1695->1693->1905->2752->2754->2753->1905->1904->1903->1693->1694->584->583->978->977->976->583->546->451->453->616->676->1964->2894->2893->2895->1964->1965->2705->2706->2704->2947->2948->2949->2704->1965->1963->676->678->677->616->618->617->453->1528->2067->2065->2066->1528->1529->1530->453->340->342->376->377->2658->2656->2657->377->378->342->1030->1032->1135->1137->1136->1032->1031->1767->1765->2530->2531->2532->1765->1766->1031->342->212->112->95->94->22->78->264->263->262->78->1539->1537->1538->78->137->971->972->2519->2520->2518->972->970->137->606->604->685->686->687->1242->1240->1241->687->604->605->137->136->586->587->588->136->138->650->651->649->668->669->667->649->1021->1022->2345->2344->2346->1022->1023->2257->2259->2258->1023->649->138->1507->1508->1509->138->1393->1395->1394->138->78->77->76->367->662->663->661->367->369->2219->2379->2378->2377->2219->2218->2220->369->368->2306->2305->2307->368->76->22->432->2535->2533->2534->432->1844->1845->1843->432->1543->1545->1544->432->430->431->22->23->311->1050->2419->2421->2581->2582->2583->2421->2420->1050->1049->1048->311->310->312->1519->1520->2877->2876->2875->1520->1829->1828->2715->2714->2713->1828->1830->1520->1521->312->1459->1461->1460->312->23->101->256->842->841->843->2125->2127->2126->843->256->1195->1196->1197->256->258->298->299->300->2450->2449->2451->300->1569->1567->1568->300->1494->1492->1493->300->258->257->845->2015->2014->2016->845->844->1134->1966->1967->1968->1134->1132->1133->844->846->257->101->100->936->934->935->100->102->23->9->4->2900->2901->2899->4->27->129->285->296->568->570->1592->1688->1689->1687->1592->1593->2546->2545->2547->1593->1591->2161->2163->2162->1591->570->569->713->1534->1536->2199->2198->2322->2659->2660->2661->2988->2986->2987->2661->2322->2321->2320->2332->2333->2334->2352->2351->2350->2334->2320->2198->2197->1536->1535->713->712->2562->2560->2561->712->714->569->1096->2301->2299->2300->1096->1098->1097->569->296->537->536->1399->1401->1400->2365->2367->2366->1400->536->535->1746->1744->1745->535->296->1736->1735->1737->2252->2253->2251->1737->296->295->297->285->284->283->1652->1653->1651->283->129->127->1299->1298->1297->127->128->2522->2521->2523->128->2007->2006->2005->128->27->26->49->50->53->201->1156->1158->1157->1936->1938->1937->1157->201->200->2004->2003->2002->200->199->53->52->406->407->2264->2265->2263->407->408->52->337->338->339->52->2310->2308->2309->52->54->209->208->558->2280->2422->2448->2447->2446->2422->2424->2423->2280->2279->2278->558->556->557->208->232->233->748->750->749->233->254->1710->1708->1709->254->253->255->916->918->917->1404->1832->1833->1831->1404->1402->2668->2670->2669->1402->1877->1876->1878->1402->1403->917->255->267->266->265->487->890->1244->1823->2575->2577->2576->1823->1824->1822->2382->2380->2381->1822->1244->1243->1505->1504->1551->1550->1549->1504->1506->1243->1245->890->889->1541->1722->1721->1720->2206->2207->2208->1720->1541->1542->1540->889->891->487->2630->2629->2631->487->2080->2082->2081->487->1861->1863->1862->487->488->620->621->2625->2624->2623->621->619->488->489->265->255->1872->1870->2463->2462->2461->1870->1871->255->233->234->208->210->699->2154->2153->2152->699->697->698->210->657->655->656->210->54->1160->1161->1159->1435->1437->1436->1159->54->50->1347->1345->1346->1703->1702->1704->1346->50->51->1357->1358->1359->2808->2807->2806->1359->51->26->224->223->225->26->25->35->309->307->1001->1002->2097->2095->2096->1002->1000->1381->1382->1383->1000->307->308->35->34->220->947->948->946->1373->1374->1680->2786->2787->2785->1680->1679->1678->1374->1372->946->220->2664->2662->2663->220->221->726->725->724->221->571->572->1139->1138->1140->572->573->221->2675->2674->2676->221->222->34->153->1142->1143->1141->153->151->186->185->538->2460->2458->2459->538->540->539->185->184->151->152->34->36->25->244->371->825->823->824->1812->1810->1811->824->371->370->372->2711->2712->2710->372->2177->2579->2580->2578->2177->2178->2176->372->2137->2139->2271->2270->2269->2139->2138->372->244->1124->1306->1307->1308->2037->2036->2035->2916->2915->2914->2035->1308->1124->1125->1235->1325->1326->1324->2077->2079->2078->1324->1728->1727->1726->1324->1235->1312->1313->1314->1235->1236->1247->1248->2789->2788->2790->1248->1457->2166->2164->2165->1457->1456->1458->1664->1663->3000->2999->2998->1663->1665->1815->1813->1814->1665->1458->1248->1246->1236->1234->1941->1940->1939->2617->2626->2628->2627->2617->2618->2619->1939->1234->1627->1628->2871->2869->2870->1628->1803->1802->1801->1628->1629->1234->1125->1123->1634->1635->1633->1123->244->246->245->25->4->18->16->67->559->561->560->1091->1452->1450->1451->1091->1092->1090->560->67->317->318->404->785->784->786->404->405->486->485->703->705->704->1294->1296->2497->2498->2963->2962->2990->2991->2989->2962->2964->2498->2499->2501->2500->2502->2499->1296->1295->704->485->484->1066->1068->1067->484->405->403->318->316->684->2246->2245->2247->684->683->1427->1426->2931->2929->2930->1426->1428->683->682->1398->2133->2131->2132->1398->1397->2740->2742->2741->1397->1396->682->1148->1789->1791->1790->1148->1147->1149->682->316->67->179->781->1645->2146->2147->2148->1645->1646->1647->781->782->783->179->644->741->739->2196->2195->2194->2634->2859->2857->2858->2634->2632->2633->2194->739->740->1523->1524->1522->740->644->643->645->737->736->738->645->179->417->415->416->518->2071->2073->2072->518->519->1410->1409->1408->1503->2971->2972->2973->1503->1576->1577->2038->2040->2039->2854->2855->2856->2960->2961->2959->2856->2039->1577->1578->1503->1501->1502->1408->519->517->416->1471->2968->2970->2969->1471->1472->1473->1950->1948->2107->2108->2109->1948->1949->1473->416->179->178->180->67->119->332->2475->2474->2473->332->333->331->959->960->958->331->810->808->809->331->119->118->792->790->791->118->120->67->69->157->158->2401->2403->2942->2943->2941->2403->2402->158->159->804->803->802->159->69->68->574->576->575->68->16->17->646->647->648->17->4->165->164->1366->2010->2008->2009->1366->1368->1367->1697->1698->1696->1367->164->163->386->387->385->2979->2978->2977->385->2316->2315->2318->2317->2319->2315->2314->385->163->4->5->2745->2743->2744->5->2->3->98->97->501->500->499->97->1658->1659->1657->97->1155->1154->1925->1924->1926->2340->2339->2338->1926->1154->1153->97->111->109->2047->2912->2911->2913->2047->2049->2048->109->110->829->831->1265->1264->2797->2798->2799->1264->1266->831->830->2391->2390->2389->830->110->450->1227->2732->2731->2733->1227->1226->1225->1732->1733->1734->1225->450->449->448->110->97->99->1376->1377->1935->1933->1934->1377->1375->99->3->11->248->249->634->635->636->249->247->11->2168->2167->2700->2699->2698->2167->2169->2601->2599->2600->2169->11->10->965->966->964->1685->1684->1686->964->1353->1351->1352->964->10->66->65->64->271->272->468->533->534->592->594->593->734->733->735->902->2076->2398->2399->2400->2076->2074->2075->902->903->2011->2013->2012->2906->2905->2907->2012->903->901->735->593->534->532->468->466->2113->2115->2114->466->1115->1114->1116->466->467->2709->2708->2707->467->272->313->746->837->835->1315->1466->1465->1467->1315->1316->1317->835->1078->1464->1463->1462->1078->1080->1079->2593->2594->2595->1079->835->836->746->747->745->313->2639->2640->2638->313->314->436->447->598->707->708->706->598->2329->2331->2330->598->600->1856->1855->1857->600->599->731->1448->1447->1449->731->732->730->599->447->446->445->1590->1589->1588->2354->2355->2353->1588->445->436->437->869->868->870->1176->1175->1174->2268->2267->2266->1174->870->437->776->777->775->437->438->314->315->1490->1489->1491->315->272->1546->1548->1547->272->273->1821->1820->1819->2647->2648->2649->1819->273->64->10->60->59->58->72->133->2680->2681->2682->133->134->188->457->1293->1292->1291->457->1083->1350->1348->1349->2815->2816->2817->1349->1083->1082->1081->457->458->1527->1525->1526->458->459->759->1258->2735->2734->2736->1258->2567->2568->2566->1258->1599->1598->1597->1258->1259->2665->2666->2667->1259->1882->1884->2411->2412->2410->1884->1883->1259->1260->759->757->758->459->188->374->702->701->819->923->924->922->2507->2506->2508->922->1670->1669->1671->922->819->1453->1455->1454->819->817->850->852->851->817->818->1985->1984->1986->818->701->720->719->718->701->700->1616->1615->1617->700->374->373->409->410->580->581->582->410->411->473->2757->2756->2755->473->474->508->510->863->864->862->1895->1894->1896->862->510->1336->1338->1337->2950->2952->2951->1337->510->1102->1103->1104->510->509->787->789->788->509->474->472->872->1230->1951->1953->2437->2439->2438->1953->1952->1230->1229->1228->872->871->1607->1608->1606->871->1073->1074->1979->1980->1978->1074->1072->871->873->472->411->373->375->780->1596->2027->2028->2026->2814->2812->2813->2026->1596->1594->1595->780->778->779->1868->1867->1869->779->375->591->2759->2758->2760->591->589->590->375->1211->1302->1301->1300->2179->2180->2784->2783->2782->2180->2181->1300->1211->1210->1212->1788->1787->1786->2486->2554->2555->2556->2486->2485->2487->1786->1212->375->1003->1004->1005->375->188->189->884->883->1267->1268->1269->883->885->189->187->134->135->659->658->660->135->72->70->71->81->475->477->1392->1391->1390->477->476->847->848->849->476->1150->2694->2692->2909->2910->2908->2692->2693->1150->1152->1151->476->1077->1076->1075->476->81->1432->1780->1991->1992->1990->1780->1782->1781->1432->1434->1433->81->79->282->711->812->811->1558->1560->1559->811->1554->1553->1552->811->813->711->709->710->2821->2822->2823->710->1846->1847->1848->710->1095->1093->1094->710->282->281->280->2436->2434->2435->2739->2737->2738->2435->280->79->1414->1415->1416->79->80->71->58->10->14->88->335->768->767->766->335->336->334->2848->2850->2849->334->1223->1224->1222->334->88->90->89->527->528->1441->1442->1443->2598->2596->2597->1443->528->526->89->14->15->13->654->1069->2244->2242->2243->1069->1071->1070->654->652->905->988->989->2313->2311->2312->989->990->905->904->906->652->1923->1921->1922->652->653->1208->1209->1956->1955->1954->1209->1207->1794->1792->2295->2294->2293->1792->1912->1913->1914->1792->1793->1207->653->13->30->86->87->2796->2795->2794->87->104->252->980->2481->2479->2480->980->981->979->2288->2287->2289->979->252->505->506->2803->2805->2804->506->507->252->279->1054->1055->1056->2867->2868->2866->1056->279->278->624->622->623->1412->1411->1413->623->278->277->252->1273->1275->1274->252->251->2262->2935->2937->2936->2262->2261->2260->2476->2477->2478->2260->251->250->104->1385->1386->2886->2884->2885->1386->1816->1817->1818->1386->1384->104->105->183->181->182->379->381->380->182->2932->2934->2984->2985->2983->2934->2933->182->105->103->886->1479->1478->1477->1636->1637->1638->1477->886->888->887->103->1850->1880->1879->1881->2529->2528->2527->1881->1850->1849->2362->2363->2364->1849->1851->2730->2728->2729->1851->103->87->85->142->143->938->939->937->143->288->815->816->814->288->722->881->880->882->722->1220->1221->1219->722->723->721->288->353->2750->2830->2832->2831->2750->2749->2751->353->352->354->288->2653->2655->2654->288->287->286->143->144->206->412->414->420->1750->2203->2205->2204->1750->1751->1752->420->418->717->920->1311->1310->1309->920->921->942->2565->2563->2564->942->941->2586->2585->2584->941->940->2098->2100->2099->940->921->919->717->1676->1677->1809->1808->1807->1677->1675->2089->2090->2091->2482->2484->2483->2091->2186->2185->2187->2091->1675->717->716->1340->1341->1339->716->715->2643->2642->2641->715->418->2541->2539->2540->418->419->414->2717->2716->2718->414->413->206->207->2645->2646->2644->207->205->359->360->642->744->2120->2119->2121->744->743->742->642->2064->2063->2062->2111->2112->2110->2062->642->640->1289->1290->1288->640->641->360->1361->1362->1360->360->358->1982->1981->1983->358->205->290->289->306->305->394->395->396->610->928->929->930->2892->2890->2891->930->610->612->611->396->305->304->289->2809->2811->2810->289->2573->2572->2574->289->291->205->275->276->274->205->144->85->30->826->828->1106->1107->1989->2000->1999->2001->1989->1988->1987->1107->1105->828->827->30->2405->2406->2404->30->28->1199->1200->1198->28->29->141->760->762->1304->1601->2851->2853->2852->1601->1600->1602->2636->2637->2635->1602->1887->2224->2721->2719->2720->2224->2226->2225->1887->1885->1886->1897->2695->2696->2697->1897->1898->1899->1886->1602->1304->1303->1305->762->761->1051->2214->2349->2347->2348->2214->2212->2213->1051->1053->1052->761->141->139->140->29->13->293->292->294->13->176->595->596->597->2763->2761->2762->597->176->1205->1666->1667->2606->2607->2605->1667->1915->1916->1917->1667->1668->1205->1206->2590->2591->2592->1206->1204->2837->2836->2838->1204->2800->2802->2801->1204->2277->2276->2842->2844->2843->2276->2775->2773->2774->2276->2275->1204->176->175->177->13->10->12->117->116->632->1742->1743->2702->2703->2701->1743->1741->632->631->2722->2723->2724->631->633->116->115->171->169->204->202->639->1100->1101->1099->639->638->637->957->956->955->1763->1762->1764->955->637->2393->2392->2394->637->202->203->169->170->1785->1929->1928->1927->1785->1783->1784->170->115->145->443->490->491->492->1574->1575->1604->1605->1943->1944->1942->1605->1603->1575->1573->492->443->1865->1866->1864->443->1162->1164->1163->443->442->444->984->1779->1777->1778->984->982->983->1739->1738->1795->1796->1797->1738->1740->983->444->615->613->1840->1841->1842->613->614->444->548->549->547->444->145->2144->2371->2373->2372->2144->2143->2145->145->147->243->2510->2509->2511->243->242->383->384->382->242->1692->1890->1888->2183->2184->2613->2612->2611->2184->2182->1888->1889->1692->1691->1690->242->241->147->146->115->130->996->1007->1423->1425->1424->1007->1006->1008->2608->2609->2610->1008->996->994->995->2384->2383->2385->995->130->857->1187->1332->1330->1974->1972->1973->1330->1331->1187->1186->1614->1612->1613->1186->1188->2829->2827->2828->1188->857->858->2283->2878->2879->2880->2283->2282->2281->858->856->1284->1282->1283->856->130->132->131->1431->1430->1429->131->115->12->3->1->497->496->498->1->2982->2980->2981->1->21->493->494->1122->1121->1120->494->495->1962->1960->2106->2216->2370->2368->2369->2216->2217->2215->2106->2105->2104->1960->1961->2069->2068->2070->1961->495->1718->1719->2726->2725->2727->1719->1717->495->1371->1369->1370->495->21->44->191->192->190->1920->1919->1918->190->44->45->1327->1328->1329->45->43->441->1755->1753->1754->441->440->1514->1513->1515->440->439->43->235->236->578->855->2992->2993->2994->855->853->854->1768->1770->1769->854->1714->2032->2034->2055->2672->2671->2673->2055->2054->2053->2034->2033->1714->1715->1716->854->578->579->1902->1900->1901->579->577->1013->1012->1014->577->236->321->392->840->1996->1998->2864->2865->2863->1998->1997->840->838->839->392->391->552->550->1805->2927->2928->2926->1805->1806->1827->1825->1826->1806->1804->550->1480->1481->1482->550->551->2124->2123->2686->2687->2688->2123->2249->2250->2248->2123->2122->551->391->393->859->2543->2542->2589->2746->2747->2748->2589->2588->2587->2542->2544->859->861->860->1033->2223->2221->2361->2359->2360->2221->2222->1033->1034->1323->1321->1322->2792->2791->2793->1322->1034->1035->860->393->321->320->1644->1643->1642->320->1063->2296->2297->2298->1063->1064->1065->2492->2493->2491->1065->1609->1610->1611->1065->320->319->993->992->2358->2356->2357->992->991->319->236->237->1046->1045->1047->237->43->21->42->40->2085->2285->2286->2284->2085->2083->2084->40->41->607->608->953->954->1583->1582->1584->954->952->608->609->1620->1619->2272->2274->2273->1619->1618->609->41->56->73->75->1252->1253->2060->2059->2117->2118->2116->2059->2061->1253->1254->75->74->502->503->504->74->2427->2425->2426->2430->2429->2428->2426->74->56->55->92->167->168->753->751->752->799->1169->1168->1170->2135->2136->2134->1170->799->800->1484->1729->1730->1731->1484->1485->1483->800->801->752->168->166->92->91->93->691->2504->2503->2505->691->692->693->93->55->343->344->425->426->424->2945->2946->2944->424->344->345->975->1379->1957->1958->2236->2237->2238->1958->1959->2415->2414->2413->1959->1379->1469->1470->1468->2495->2494->2496->1468->1379->1380->1378->975->973->974->345->55->2818->2819->2820->55->194->195->193->2771->2770->2772->193->55->122->123->303->302->346->2470->2471->2472->346->348->356->357->421->422->423->357->1908->1906->1907->357->355->794->900->899->898->2974->2976->2975->898->1517->1518->2092->2093->2094->2766->2765->2764->2094->1518->1516->898->794->2044->2860->2861->2862->2044->2045->2046->794->793->1836->2022->2021->2020->1836->1835->1834->793->1498->1500->2516->2517->2515->1500->1499->793->795->355->2443->2445->2444->355->348->2845->2847->2846->348->347->302->301->435->471->470->469->435->434->2824->2826->2825->434->433->2023->2024->2025->433->301->123->121->55->108->326->325->327->108->1028->1027->1029->1533->1532->1531->1029->108->107->807->805->806->2374->2376->2375->806->107->399->694->833->834->832->1474->1700->1699->1701->1474->1475->1476->832->694->822->820->1010->1009->1011->820->821->944->945->943->821->694->696->2614->2616->2615->696->1421->2234->2233->2995->2997->2996->2233->2235->1421->1420->1557->1556->1555->1420->1422->696->695->399->398->397->2549->2550->2548->397->107->106->427->2228->2229->2227->427->429->566->1129->1131->1130->566->565->567->429->1563->2781->2779->2780->1563->1562->1561->429->428->1239->1271->2211->2209->2210->1271->1272->1270->2917->2918->2919->1270->1239->1237->1238->428->106->162->160->773->774->772->1565->1566->1564->772->160->2050->2466->2464->2465->2050->2051->2193->2192->2191->2051->2052->160->161->2408->2407->2409->161->106->55->57->41->401->402->2965->2967->2966->402->400->1085->1084->1086->2925->2923->2924->1086->400->41->21->174->1261->1262->1263->174->172->671->1756->1757->1758->671->1192->1194->1193->2304->2302->2303->1193->671->672->670->1947->2256->2255->2254->1947->1946->1945->670->1713->1711->1712->670->1570->1571->1572->670->172->173->1040->1041->1039->173->21->20->389->1892->2874->2872->2873->1892->1891->2129->2173->2174->2175->2129->2130->2128->2778->2777->2776->2128->1891->1893->389->390->2954->2953->2955->390->388->20->226->628->629->907->908->909->1059->1057->1058->909->1024->1026->1025->909->629->630->666->665->664->630->226->2170->2171->2172->226->227->2417->2416->2418->227->228->20->197->196->764->763->765->914->913->999->2469->2467->2468->999->998->997->913->915->765->2157->2156->2155->765->196->198->2603->2604->2602->198->20->19->1->0->149->148->150->456->455->675->673->674->1344->1342->1343->674->455->454->150->0'" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = read_input('rosalind_ba3f.txt')\n", "def eulerianCycle(g):\n", " \"\"\" Input: The adjacency list of an Eulerian directed graph.\n", " Output: An Eulerian cycle in this graph.\"\"\"\n", " tour = []\n", " src = g.iterkeys().next() # pick arbitrary starting node\n", " def visit(n):\n", " while len(g[n]) > 0:\n", " dst = g[n].pop()\n", " visit(dst)\n", " tour.append(n)\n", " visit(src)\n", " tour = tour[::-1] # reverse and then take all but last node\n", " return tour\n", "tour = eulerianCycle(g)\n", "'->'.join(str(i) for i in tour)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "def eulerianCycle(graph):\n", " # init cycle with any node\n", " cycle = [graph.keys()[0]]\n", " ##### move all edges from graph to cycle\n", " while len(graph) > 0:\n", " # if cycle is closed shift to a node with remaining targets\n", " if cycle[0] == cycle[-1]:\n", " while not cycle[0] in graph:\n", " cycle.pop(0)\n", " cycle.append(cycle[0])\n", " ##### source is the last node of cycle\n", " source = cycle[-1]\n", " ##### move one target from graph to cycle\n", " cycle.append(graph[source].pop())\n", " ##### clean sources without targets\n", " if len(graph[source]) == 0: del graph[source]\n", " return cycle" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[0, 1, 2, 3, 6, 7, 8, 9]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}\n", "nodes = g.keys()\n", "nodes" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def outdegree(g,node):\n", " \"\"\"Returns the number of outgoing edges from a node\"\"\"\n", " try:\n", " return len(g[node])\n", " except:\n", " return 0\n", "outdegree(g,4)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def indegree(g,node):\n", " \"\"\"Returns the number of incoming edges from a node\"\"\"\n", " incoming = [k for k,v in g.iteritems() if node in v]\n", " return len(incoming)\n", "indegree(g,6)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def isSemiBalanced(g,node):\n", " return abs(indegree(g,node) - outdegree(g,node)) == 1" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def isBalanced(g,node):\n", " return indegree(g,node) == outdegree(g,node)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def head(g):\n", " \"\"\"Returns the head of a graph in case of Eulerian Walk\"\"\"\n", " nodes = g.keys()\n", " head_node = None\n", " for node in nodes:\n", " if indegree(g,node) == outdegree(g,node) - 1:\n", " head_node = node\n", " break\n", " return head_node" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def tail(g):\n", " \"\"\"Returns the tail of a graph in case of Eulerian Walk\"\"\"\n", " neighbors = [v for k in g for v in g[k]]\n", " tail_node = None\n", " for node in neighbors:\n", " if indegree(g,node) == outdegree(g,node) + 1:\n", " tail_node = node\n", " break\n", " return tail_node" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6\n", "4\n", "{0: [2], 1: [3], 2: [1], 3: [0, 4], 4: [6], 6: [3, 7], 7: [8], 8: [9], 9: [6]}\n", "src 0\n", "tour [0, 3, 6, 9, 8, 7, 6, 4, 3, 1, 2, 0]\n", "tour [0, 2, 1, 3, 4, 6, 7, 8, 9, 6, 3]\n", "sti 5\n", "tour [6, 7, 8, 9, 6, 3, 0, 2, 1, 3, 4]\n" ] } ], "source": [ "g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}\n", "#kmers = ['CTTA', 'ACCA', 'TACC', 'GGCT', 'GCTT', 'TTAC']\n", "#kmers = [line.strip() for line in open('dataset_203_6.txt', 'r')]\n", "#g = debruijn_from_kmer(kmers)\n", "#setdefault is useful for setting defaults while or after filling the dict.\n", "initialNode = head(g)\n", "print initialNode\n", "terminalNode = tail(g)\n", "print terminalNode\n", "g.setdefault(terminalNode, []).append(initialNode)\n", "print g\n", "# graph g has an Eulerian cycle\n", "tour = []\n", "src = g.iterkeys().next() # pick arbitrary starting node\n", "print \"src\", src\n", "def visit(n):\n", " while len(g[n]) > 0:\n", " dst = g[n].pop()\n", " visit(dst)\n", " tour.append(n)\n", " \n", "visit(src)\n", "print \"tour\", tour\n", "tour = tour[::-1][:-1] # reverse and then take all but last node\n", "print \"tour\", tour\n", "sti = tour.index(initialNode)\n", "print \"sti\", sti\n", "tour = tour[sti:] + tour[:sti]\n", "print \"tour\", tour" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def eulerianWalk(g):\n", " initialNode = head(g)\n", " terminalNode = tail(g)\n", " #setdefault is useful for setting defaults while or after filling the dict.\n", " g.setdefault(terminalNode, []).append(initialNode)\n", " # graph g has an Eulerian cycle\n", " tour = []\n", " src = g.iterkeys().next() # pick arbitrary starting node\n", " def visit(n):\n", " while len(g[n]) > 0:\n", " dst = g[n].pop()\n", " visit(dst)\n", " tour.append(n)\n", " \n", " visit(src)\n", " tour = tour[::-1][:-1] # reverse and then take all but last node\n", " sti = tour.index(initialNode)\n", " tour = tour[sti:] + tour[:sti]\n", " return tour" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'6->7->8->9->6->3->0->2->1->3->4'" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}\n", "#g = read_input('dataset_203_5.txt')\n", "'->'.join(str(i) for i in eulerianWalk(g))" ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "defaultdict(, {'CTT': ['TTA'], 'ACC': ['CCA'], 'GCT': ['CTT'], 'GGC': ['GCT'], 'TAC': ['ACC'], 'TTA': ['TAC']})\n" ] }, { "data": { "text/plain": [ "['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#kmers = [line.strip() for line in open('dataset_198_3.txt', 'r')]\n", "kmers = ['CTTA', 'ACCA', 'TACC', 'GGCT', 'GCTT', 'TTAC']\n", "g = debruijn_from_kmer(kmers)\n", "print g\n", "eulerianWalk(g)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def stringReconstruction(kmers):\n", " \"\"\"Input: An integer k followed by a list of k-mers Patterns.\n", " Output: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may\n", " return any one.)\"\"\"\n", " g = debruijn_from_kmer(kmers)\n", " walk = eulerianWalk(g)\n", " return walk[0] + ''.join(map(lambda x: x[-1], walk[1:])) " ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'GGCTTACCA'" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stringReconstruction(kmers)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'CAAATGCATCATACGCTCACCCAG'" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kmers = ['AAAT','AATG','ACCC','ACGC','ATAC','ATCA','ATGC','CAAA','CACC','CATA','CATC','CCAG','CCCA','CGCT','CTCA','GCAT',\n", " 'GCTC','TACG','TCAC','TCAT','TGCA']\n", "stringReconstruction(kmers)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'GGCTTACCA'" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#map() will apply its lambda function to the elements of the argument lists, i.e. it first applies to the elements with the 0th index, then to the elements with the 1st index until the n-th index is reached\n", "walk = ['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']\n", "walk[0] + ''.join(map(lambda x: x[-1], walk[1:])) " ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sys\n", "sys.setrecursionlimit(2500)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'AATAAAAGTCAGTATCAGGTGTTCACCGTTATCTCCTGCCGGGCACGGGGGAGGTACCGTGCGTATCTAGGGAGAAGCCCGTCGAGTTCAGGGCCCAATGGTAGGTACTCCCGGTCCACTAATGACCCAACTAGACCACAAGGGGACTACGTACCCTACTCGCGAGCCTAGCTAACTAACTGCCCTGGCTTACGCATGACATACTGCGGTTTGCAGGGTGATGCTAGTATACATGGATGTGATAGCAATCTTTCAACCAAGCATTAGTACCATCGCACTGGAGCGATTTAGGTCGAAACTCATATTCGCTTTATGTGTCCTGGACGTGTATACAAGTCCTTCCTTACTTTACACAATAGCTTGGTACAGTGTCTCGACTCAACTTAAGGTCTTCACCATTTGATCTGTGCTCGTGTGCAGTTAGAGGAGTAGCCCCCACTTTCGTTTCGACGAGTAGACCCACAGCGTTGCGACGTCCACCACCTTTCATCACCACAGTCTCTTCTGGACTTAACAGCATTTATAATACCAGAGGCGCGGTATCACTGGATCTTCAACTGCGAATGTGAGGAGGTCCTGGATTATAGTGTCCTGCAACATGGATGCACGACCCGGAAACACCGGGATTGATCCAGACGAGATTGACTATTTAACGAGGAAGGTTCGTAATAGCATCGGGAGAGGATATGGCTCACCAGTCTTCGGTATATGGTTGACTGCTGACAGACTGTCGAGCAGCGATCGTTATAGAAGCTCCCTTTTAGCGCTCCGTGCCCCAGCTGATTGGTCGGTATATAATGCCTAGCGGGCTGCTTTACTCACGTTCCGCTTGTGGCGCCACAGACCCACGGGCAAAAGAGGGGGTCACCGCAAGTCTCCGTAGGCGCGAGTAACCTTATATTCACTTAACTTTTTTTCAAGGAAATCACAGCTTGGGACTACGAGCGACGACGCAATTGCATTGACGGCTAGTTGTGATATGCCCATTACACTGCGAGATAAGGGGAAACACATCACGGAATCCAAACCTCTACATTCCTTATAGGGCAAGTGATGTTCGGATCCCAAAGTATCGTGCATGAGTAGAAGCCGTGAGGGTATTCGAATCAGTGAATTACGGCGCGCGGAGGCAACCATACATATGTCAGACTATGCCCCGATGGATGAGGCCCATCAGGCTAAGCGATCATTCGGACGGGCCCCGATTCCTAACGCAGCCTTGTATAGCTGATGAAAGTGAGAGACGCACTCGATGCACATGTCCTGTCGTGTTATGGACCCAACGCGAGAAGAAGCACGTGAAACGTACAAGGGGCCGGACTGCAGCAAGGTGTTCGGTGTATCAAACACGTGTCAGCCCAGAGAAGTAGCCAACCGAGAGTATGAAACATCAGTAACAAGCTTACTCGGTTGAAGCGATGATCCCTATCAAGGGTCATCGGCACCACCTTTGAAGTAGCGGTAAGACCCCTCAGTTAAGAGTGGCGTTAGGTGGGACGGCATATGATCCTATTATTAGGTACACAGGGACAACAGAGTAGACTCATGAGTCATGCCAAACCGCTTCCGCTATGGTTTATTACAAAAAGACGCGGCCCTAAATTAAGCCTATGTGGAGACCATGGACCAATGACAAGATGTGAGTTGAGGCGTGGGTAGAGTGCTATGAATCAACGTTTAACTCGGGTAATGGCTCTGACACAACCGACTAATATCCCCGTTAGACAGACTGCCACGACGAGGACCTCCACCGATGGGGTGGAGTGCGACCTCTGGCGGGCAATGCGTAAGCTACAATTGATCTGCCTTAATCTGTCACCGGCAGTTATCCTCATATGAAGCGCTGCGGCAATGCCGCCTGATGAAGGTGCCGGATGACTGCGCTCTATACGAGGTTCTACGGACACGTTCCGTTACTCCAGACTACCCGTCAAACCTCCCGTCCGTCCACGCCATAATAGGTCCACTGAATGTGGATATTGCTAAATTGGAACAGATGTCGGTTCTAACAATATATAAAGAGGTTAAGTTACTTTGTAGTGCGAATTAAGTCCGGCATATGAGATACATCTTCAGCCATTGCCAGTTGTGCGTTATGCCCAACTCGGTTAGCTGTTATCCTTTTGATTACCAGTCTTCGGACCCCCCAGTATCGTAGCCTTTAGATAGATGCGGTCCACCTGGGTGGTTCCTCTGTGTAAATCGAAAAGAGTATGGCGACGCCGGGTTTCACTTCAGAA'" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kmers = [line.strip() for line in open('dataset_203_6.txt', 'r')]\n", "stringReconstruction(kmers)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## k universal circular string" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['000', '001', '010', '011', '100', '101', '110', '111']" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import itertools\n", "[''.join(binary) for binary in itertools.product('01', repeat = 3)]\n" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#CODE CHALLENGE: Solve the k-Universal Circular String Problem.\n", "import itertools\n", "def k_universal_circular_string(k):\n", " \"\"\"Input: An integer k.\n", " Output: A k-universal circular string.\"\"\"\n", " binary_kmers = [''.join(binary) for binary in itertools.product('01', repeat = k)]\n", " g = debruijn_from_kmer(binary_kmers)\n", " cycle = eulerianCycle(g)\n", " return cycle[0] + ''.join(map(lambda x: x[-1], cycle[1:-(k-1)]))" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'0101111010011000'" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "k_universal_circular_string(4)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'11101000'" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cycle = ['11', '11', '10', '01', '10', '00', '00', '01', '11']\n", "cycle[0] + ''.join(map(lambda x: x[-1], cycle[1:-2]))" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'0001011111111011111010111101101111001111110010111011101010111001101110001111100010110110101101001111010010110011101100101011000110110000111100001010101001101010001110100010100100111001001010000110100000111000001001100110001001000110010000100010000001100000'" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "k_universal_circular_string(8)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['11', '10', '01', '10', '00', '00']" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cycle = ['11', '11', '10', '01', '10', '00', '00', '01', '11']\n", "cycle[1:-2]" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "CODE CHALLENGE: Implement StringSpelledByGappedPatterns.\n", " Input: Integers k and d followed by a sequence of (k, d)-mers (a1|b1), … , (an|bn) such that\n", " Suffix(ai|bi) = Prefix(ai+1|bi+1) for 1 ≤ i ≤ n-1.\n", " Output: A string Text of length k + d + k + n - 1 such that the i-th (k, d)-mer in Text is equal to\n", " (ai|bi) for 1 ≤ i ≤ n (if such a string exists).\n", "Sample Input:\n", "4 2\n", "GACC|GCGC\n", "ACCG|CGCC\n", "CCGA|GCCG\n", "CGAG|CCGG\n", "GAGC|CGGA\n", "Sample Output:\n", "GACCGAGCGCCGGA" ] }, { "cell_type": "code", "execution_count": 117, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def gappedPatterns(f):\n", " firstPatterns = []\n", " secondPatterns = []\n", " for lines in f:\n", " lines = lines.strip()\n", " lines = lines.split('|')\n", " firstPatterns.append(lines[0])\n", " secondPatterns.append(lines[1])\n", " return firstPatterns, secondPatterns\n", "def stringSpelledByGappedPatterns(firstPatterns, secondPatterns, k, d):\n", " prefixString = string_from_genome_path(firstPatterns)\n", " suffixString = string_from_genome_path(secondPatterns)\n", " for i in range(k+d, len(prefixString)):\n", " if prefixString[i] != suffixString[i-k-d]:\n", " return \"there is no string spelled by the gapped patterns\"\n", " return prefixString + suffixString[-(k+d):] " ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'GACCGAGCGCCGGA'" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f = open('StringSpelledByGappedPatterns.txt', 'r')\n", "firstPatterns, secondPatterns = gappedPatterns(f)\n", "stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 4, 2)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'there is no string spelled by the gapped patterns'" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f = open('dataset_204_14.txt', 'r')\n", "firstPatterns, secondPatterns = gappedPatterns(f)\n", "stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 50, 200)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Solve the String Reconstruction from Read-Pairs Problem.\n", "#Input: Integers k and d followed by a collection of paired k-mers PairedReads.\n", "#Output: A string Text with (k, d)-mer composition equal to PairedReads." ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "collapsed": false }, "outputs": [], "source": [ "read_pairs = [('TAA','GCC'), ('AAT','CCA'), ('ATG','CAT'), ('TGC','ATG'), ('GCC','TGG'), ('CCA','GGG'), ('CAT','GGA'), ('ATG','GAT'), ('TGG','ATG'), ('GGG','TGT'), ('GGA','GTT')]\n", "#read_pairs = [('GACC','GCGC'),('ACCG','CGCC'),('CCGA','GCCG'),('CGAG','CCGG'),('GAGC','CGGA')]" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import defaultdict\n", "def debruijn_from_readpairs(read_pairs):\n", " '''Input: A collection of k-mers Patterns.\n", " Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''\n", " edges = defaultdict(list)\n", " for pair in read_pairs:\n", " edges[(pair[0][:-1], pair[1][:-1])].append((pair[0][1:], pair[1][1:]))\n", " return edges" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "defaultdict(list,\n", " {('AA', 'CC'): [('AT', 'CA')],\n", " ('AT', 'CA'): [('TG', 'AT')],\n", " ('AT', 'GA'): [('TG', 'AT')],\n", " ('CA', 'GG'): [('AT', 'GA')],\n", " ('CC', 'GG'): [('CA', 'GG')],\n", " ('GC', 'TG'): [('CC', 'GG')],\n", " ('GG', 'GT'): [('GA', 'TT')],\n", " ('GG', 'TG'): [('GG', 'GT')],\n", " ('TA', 'GC'): [('AA', 'CC')],\n", " ('TG', 'AT'): [('GC', 'TG'), ('GG', 'TG')]})" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = debruijn_from_readpairs(read_pairs)\n", "g" ] }, { "cell_type": "code", "execution_count": 121, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('TA', 'GC'),\n", " ('AA', 'CC'),\n", " ('AT', 'CA'),\n", " ('TG', 'AT'),\n", " ('GC', 'TG'),\n", " ('CC', 'GG'),\n", " ('CA', 'GG'),\n", " ('AT', 'GA'),\n", " ('TG', 'AT'),\n", " ('GG', 'TG'),\n", " ('GG', 'GT'),\n", " ('GA', 'TT')]" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "walk = eulerianWalk(g)\n", "walk" ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['TA', 'AA', 'AT', 'TG', 'GC', 'CC', 'CA', 'AT', 'TG', 'GG', 'GG', 'GA']\n", "['GC', 'CC', 'CA', 'AT', 'TG', 'GG', 'GG', 'GA', 'AT', 'TG', 'GT', 'TT']\n" ] } ], "source": [ "firstPatterns = [a for (a,b) in walk]\n", "secondPatterns = [b for (a,b) in walk]\n", "print firstPatterns\n", "print secondPatterns" ] }, { "cell_type": "code", "execution_count": 123, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'TAATGCCATGGGATGTT'" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 3, 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Quiz que 3" ] }, { "cell_type": "code", "execution_count": 128, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "defaultdict(list,\n", " {('AC', 'AT'): [('CC', 'TA'), ('CT', 'TT')],\n", " ('AT', 'TG'): [('TA', 'GA'), ('TT', 'GA')],\n", " ('CA', 'GA'): [('AC', 'AT')],\n", " ('CC', 'TA'): [('CG', 'AC')],\n", " ('CG', 'AC'): [('GA', 'CT')],\n", " ('CT', 'AG'): [('TG', 'GC')],\n", " ('CT', 'TT'): [('TG', 'TC')],\n", " ('GA', 'CT'): [('AA', 'TT'), ('AT', 'TG'), ('AT', 'TG')],\n", " ('TA', 'GA'): [('AC', 'AT')],\n", " ('TC', 'AA'): [('CT', 'AG')],\n", " ('TG', 'GC'): [('GA', 'CT')],\n", " ('TG', 'TC'): [('GA', 'CT')],\n", " ('TT', 'GA'): [('TC', 'AA')]})" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "read_pairs = [('ACC','ATA'),('ACT','ATT'),('ATA','TGA'),('ATT','TGA'),('CAC','GAT'),('CCG','TAC'),('CGA','ACT'),\n", " ('CTG','AGC'),('CTG','TTC'),('GAA','CTT'),('GAT','CTG'),('GAT','CTG'),('TAC','GAT'),('TCT','AAG'),\n", " ('TGA','GCT'),('TGA','TCT'),('TTC','GAA')]\n", "g = debruijn_from_readpairs(read_pairs)\n", "g" ] }, { "cell_type": "code", "execution_count": 129, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('AT', 'TG'),\n", " ('TT', 'GA'),\n", " ('TC', 'AA'),\n", " ('CT', 'AG'),\n", " ('TG', 'GC'),\n", " ('GA', 'CT'),\n", " ('AT', 'TG'),\n", " ('TA', 'GA'),\n", " ('AC', 'AT'),\n", " ('CC', 'TA'),\n", " ('CG', 'AC'),\n", " ('GA', 'CT'),\n", " ('CT', 'TT'),\n", " ('TG', 'TC'),\n", " ('GA', 'CT'),\n", " ('AA', 'TT'),\n", " ('AT', 'TG')]" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "walk = eulerianWalk(g)\n", "walk" ] }, { "cell_type": "code", "execution_count": 126, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['AT', 'TT', 'TC', 'CT', 'TG', 'GA', 'AT', 'TA', 'AC', 'CC', 'CG', 'GA', 'CT', 'TG', 'GA', 'AA', 'AT']\n", "['TG', 'GA', 'AA', 'AG', 'GC', 'CT', 'TG', 'GA', 'AT', 'TA', 'AC', 'CT', 'TT', 'TC', 'CT', 'TT', 'TG']\n" ] } ], "source": [ "x = [a for (a,b) in walk]\n", "y = [b for (a,b) in walk]\n", "print x\n", "print y" ] }, { "cell_type": "code", "execution_count": 127, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'there is no string spelled by the gapped patterns'" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#ANSWER SHOULD BE: CACCGATACTGATTCTGAAGCTT\n", "stringSpelledByGappedPatterns(x, y, 3, 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Problem set" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Process the readpairs into list of tuples\n", "f = open('dataset_204_14.txt', 'r')\n", "read_pairs = []\n", "for lines in f:\n", " lines = lines.strip()\n", " lines = lines.split('|')\n", " read_pairs.append((lines[0],lines[1]))" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'CTCTACGAGCATTGGCATAACCGTAGCACCTCAAGCGTATTCTCGCTCGCCGATTGTCAGTGCCGCCAGGAGGCCCTTGACAAACGTACAACTAATGAGGCTTCCTAATCCCTCGTTTGGTAATGAGGAGTATCATGTTGATATCGGCCGCCTGCCGACTAACAGGTACTTTGGCTGCTGTACGAGGTGAAAATAGCGGCTGACATTGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCGATATAAATTGCATTACGCTACTAGGACCGATTGACTCGGGGGGTGCCAGTTGGTGACCCACTACAATCTACGCCCTTGCGACTTTGCCTAACATACTCGTCAGGGCAAAGTGTTGGGTACATGTGTTTCACCACAATGCTCTTGGGTCTTGACAACAGGCAAAGGTCTTTAATGCGTTCCCTTGGCGAGTGTGGCGCGGTAATCGTCGAATCGCATGAGACGAGCAGGGAAAAACTTGGACGTTACATCCGGCAATAGATTCCCACGATCTGCCCTTGCCCATATGCACAGATGATATTACAGCTAGGATCCCCAGGAGCGGCATGTGTATCACTAGTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCAAGGCTTGACCGCACCTCATAGACTCTCGTAACGTAGTAAATCTGTATTAACAGGAGCTATTCAAACTGCTCCTTGGGTGCTTCCTTGCACGAGCCGAAATCGCCTTCTAGTCAATGAAAACTAGGAGAGACTCCTAGCATCTGCGATACCGCCCCTTGATTTACCAACGAGGTAAGATGTCGAATTTCTTTTCTGTGGGCTACCCCGGAGTAGCAAGAGGTGAAGAGTATTCTTTGATCTAGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTCGATTTAGACAGTTAACTGATGGATCTTAAGGGTTAGAGCCATTCCCAGATTCATGTCGGGACTTTCTGTAGGTCACGGCCCCAACGGACAACCGGTCGAACTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCTCTAAATCCAGGTGGCTATTGAGGTAAGCGGAGCTTTGATCTCGGCACGTCGACGGGAACCACATATATCTTCCGCCGGTACCTTCGGTAGCGCTGACCCGGAGGATCGGCGTGGAATCTCAGCTTAAGCCGTGTCTGAGTGGTTAGGCCTTTTAACCTGGCCACTCCCTCGACGGAAATTTTTGGACGCTAGCAAGAGTCAGACTAACGTACCCTAGACAGGGCAAAGTCGGAATCGACTCTGAGGTCTTTGTATCATATTGAGGGTGCAACACAAAGCGTCTTAGTCACTGACAGTTGGTTGCACTGTTTACCCCCAATCTCGTGCGGACTTGGAATTCGTAGGCACAGTTCTTGGCAGCCGGGAAAAAGAGCCACTTTCATCGTCGAAACTCTCGATTGGTCAGCCCCGGTTTGAACCACGCCGACGTCGCCCTCCCAACAGAACACCCCACCCGCTGAGGCAAGTGGACCGTTTGCGCTTCCGCACGCCGCTATCGCGTACGCAGTTGAGAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCACCAATAAAAAGCACCGACTTTATGCGAGTACGAACCAGAGGAACGGGGATGATCGATCGTGTACTGGGGTACGTCAAAAGGAGGTAACGGCCGTTACTGACCTGGTTGCAATGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGGGCGCAGGTGAGCGCGTCCCGTTGTCTCTATGATACCAACAAAGCACCCAACCTCTGGTTTTTTCCCTGTTATCGACCTCGGCATTGGAGTCTTTTAAGTTGAATAGCTACACAAAACCAAAAGAATTATGCCGCCGTAGTGCGGGTGGCCCGCCCGCACCGAAGCGGACGCGTTTTCATGAGTGCGGAGAGGCGAGCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCACTTTTCCCTTCATTGTACCTTTCATCTCCGCATAGCTATTTGAGATCAGTATGTCGTACTGATTTTAAAGATCACCAAAGCATCGGAGGACACATTATGAGTGTCTCAGGAGCTATAGATATGCTAGGCAGGGTCGGTCAGTCTCTGTCCGACGGAGCACAAGGCACCTCACCCCCAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTGTTTCTCGATTGAATAAACCTGGCCCGTCAATTACTGAAGTAGCTTAAACGTTACGACGGCAGTCCTCATCACCATCCTGGCTCGAACTACCGAGCAGCGACGAGTTGCGCTCAACACCTTTTTTTGATACATCCAGATAGTCTACAGAATGCACGGATTGAGAATTCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGTGAGACTGTTCTACACATAAGCTTAAACACCATAAAGTTGCTTGAAGGGTCCTTTCTGCGCGAGTTCGGAAAATATGCGCCCAGGTAGCGTTGGCTTTTCATATAGCGGAGCCCGACTAACTGTTCCACCCCCGTCGAAACGCCATTGAAGCGGACGCAAGTTCATCCCTGCCAAATTGAGACAGGTTTGTACGTCCTAAGAAGGGGTTTGTACCTTTGAGATTGTTTATGATCTATCTTGGCAGCCCGAGCAGAGTATCTTGTTTTAATATACGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCTGGCACACGGTACAATGTGCCAGGAGTTATTCTCATCGATGCTTCGACATTGACTTCGCTCTTTGAGTTAACAACTCGCGGAGGCACCTTCTCAGTTGCATGACTTGATGTCGTCATGAAAATGCTCGGTGGCTTCTCACGGGGACTCACCCAACTAAAACCCCGACGAACGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGACCGCCAGCGACAAATATAATTACGGATGTCGGGGGCTGCTTCGCTGTTAAACGATGAAGAATCTGAGATTTGACAAGGCAAGTCCATAATCGTATGGTTCCTCCTGAGGCGGGGATCATCAGGGACTTTGGTCCTGCCTTAGGGACGTATTAAATTAAATTATCTCGGCCGCGGCTCAGGTTCATTAAAAAAGCGGGGCTTGCTATTGTCCCCCAACCCTTCGTGTTGAGAAACGAGAGAATAGGGACGAGCGCACGACGACACATAGAGATTGTGTGCCACCAGTGCTCCGAGTACCCCTTATTCGTCTCACTTTGAGTTTAATCCTACTATGTTTGGTACAACAGATGTCATAGGTTACGTTGAACCCGATAGCTAGTTCCAGCACGAACCCGCTAAATGAGACACCCAAGAATCGATCCCCTAGTTCGCGCCATGTAGGCTTTAGTAAGTCGAGGTCATACTTTACTAAAAAAAATTTTAGGGTACTACTTCGAGCATGCCCCTTGTCCCTTACCCTTATACGTCAGGATCTTGATTGGTCAGTGGGCCACAAGGATGGCTAATCTTAAGAGCGTCGCTCTATTTCCGGAACGGCAACAAATTCAAACCGGTACGCCTTGTCTACATATCTACAAGCGATAAGGAATAGAAGCCGGTTCAGTGTGTCCAGTTCCCCACAGCGTCATCCGCTTGAGTCGGCGCAAGCCCGGGTCACAACTATGAGAAATCCGATATAACCTCTCATTTGTATACCTCAATCAATACGGCCGAATTTGTTTTGGCTCCCGAAACGCCACACACACGCGTCACCCAGCCAATTACTCCGTTGGGGGTAGATCGGGGCGTGATACCAGTAAAGTTCAACTGCAAGGGTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGCCAATCTAAGGTGGGTTAGGTGAGCCCATTTGATGCCTCGCAAAGGATTATTTGCAAGATTGATCCTCGGGTTAACTTGTAATCTCACCGGTTGAAAGTAAACCTAGATACGCAGTATCACTTACTCCTCTACCGTTCCTAGAGGTAAGTTTTTGAACCAGTAACAACAATTAGACAATCACATAAAATTCTATCAATTTCTTAACGCTTAGATCCTTCTTCTACCTTATAACTGTTCCCACGCTACCGTGCTTGTTCGGTCGGCCACGCGAAACCTCTTGCGGCTAACCAGTATATCCCGGGCTTCGAGCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGCATTTAGCGCGCGTTACCGCTCCACTTCACTCTCCGTGGGGATCTCTCATGAGCTGGCGGCTCCCAGATTGCCCTCCACAGTCAGAATGGGTGCGCAGAGCATGACGACAAAACTCGAACTGCTACACCTCTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGGATTTTCCCCTCTTTTGGTCATGCTGCCGCCATAATATAAGCCCAGTGCGATCTCATTTGCCATTTCGTTTTTCGCTTAGCAACCTATCTGAGTGCTCGTGTAGCAAATGACGCCGAATGTTATGAGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCGGTTATCGTCTCGAGGATGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTCGGTCGATTATTGACTGACAGTTGATTCCCTGCATATGGCGGGTATAGGACACCCTCAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCCATACCGTATAACGGGTGGTAGAGAGCATCTCTTCTATGCTTGGGGCCGATTAAGGTAGACTGAGGAGTCAATCCGATCCTAGTATATAAGGGCGAGCTCGGCTACTGTGACGACCTAGTTACATAATACTCGCTCTTCTATTATTCAGTAACTTCATTATAGTTCAGACTTGTTGCACGCTTGCTATCTACGAAAAGTAGCCTCCTATCGCTAGGAGCACGTTTCCAGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCCTGTACATCTGATGGCATTGTCTGAGTCCCGCCGAAACACTTCTGAAAAGGTTGCCGCTATCTTTAATTCCGTACCCGACGCTATTGTGTGAAATTGGATTTCAAAAGATAAACTGTCGCTCAACTGAACTTTCTTGAACTCGGTACCGGCGTACTATAGTAGGACTTTCTAGGCTGATACGTGACCGACCTATGCAATTACTATAAGTTCGCTGACGCGTCCGCCCGTGACCTTATCTGTGCCTGCCACGGTCGTCTGATGTGAGCGTTGTGGAAGGGAGAAATCGGGCACTCCTGTAACCCAAGTAAAGCCTTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGCAGTTTACGCGATTCTAACTTGACTGGAGATTGCTCGGGACTCCACGAACCAAGTGTCACTCTCGCAGTGTTGCCTCGGCATGATGATGCGTATCGTCCCTTTTTTCTCCCGAAACCGCCGTTACCGCAACCCGCCAACCTTCGGAGTAGGGTCAACGCATTCGATACGTGGATAGAACTCATACTAATGTTTAGCGCCGCAATTGGCGGGTTCTCATGAATCAGCTCGCAATGTTGAACCTTTTTGCTTCTTGACATGTTTTATCCGCTGAGTAGAAGATCGCAATCCTAAAGAGACAGATGACATTCCGAGAATTTAGCGATAGCAGTGTCGTCGCCGGGAGAAGTTACACTAGGTTACGCTTCGACCACCTCCGGGCAGACGCGTTGG'" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import sys\n", "sys.setrecursionlimit(10000) #To avoid maximum recursion depth exceeded\n", "g = debruijn_from_readpairs(read_pairs) # Constructs prefix and suffix for each pair in the read. De bruijnize the read pairs.\n", "walk = eulerianWalk(g)\n", "firstPatterns = [a for (a,b) in walk]\n", "secondPatterns = [b for (a,b) in walk]\n", "stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 50, 200) #Constructs the string from paired eulerian walk." ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.5" } }, "nbformat": 4, "nbformat_minor": 0 }