{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"### CODE CHALLENGE: Solve the String Composition Problem.\n",
" \n",
"def kmer_composition(text,k):\n",
" \"\"\"Input: An integer k and a string Text.\n",
" Output: Compositionk(Text) (the k-mers can be provided in any order).\"\"\"\n",
" composition = []\n",
" for i in range(len(text)-k+1):\n",
" composition.append(text[i:i+k])\n",
" return composition "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['CAATC', 'AATCC', 'ATCCA', 'TCCAA', 'CCAAC']\n"
]
},
{
"data": {
"text/plain": [
"'CAATC AATCC ATCCA TCCAA CCAAC'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = 'CAATCCAAC'\n",
"x = kmer_composition(text,5)\n",
"print x\n",
"' '.join(str(i) for i in x)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"###String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.\n",
"def string_from_genome_path(kmers):\n",
" return kmers[0] + ''.join(map(lambda x: x[-1], kmers[1:])) "
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'ACCGAAGCT'"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kmers = ['ACCGA', 'CCGAA', 'CGAAG', 'GAAGC', 'AAGCT']\n",
"string_from_genome_path(kmers)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'AGACTGAAAGTCAATGCAGGTTGTCAAGAGCTGGTCGGAGGAGCCATTATTGTTAACGGGTCCCGGTAATCCGTCATGTGTATCCTACATGTCAATAGAACGACACAAAGGCTAAAATGATCTGTCGATGATACACATGTCCTGCAAGACAACTGACGCTAAGGTTTGAACTGCAGAGGGTTTGTCCAACACACATAATCTCGGCACATCTTATTAAGCGCCCCGAATCGCTTTGTTGGATTAACACGCCTCGACGCCTGAATAGGAACCACATCCCCGTAACTGAGCTCGTGCACAGAGCTTCATCGCATAGGTCACTCAAGACTTGTGGCTTGTTCATCGTAGGCCGCATTACAAGGCAATAGCATGTCGTGGTGTGTACAGCAGAATCCCCGTAGACCAAAAGAGAGAAGTGCCCCGGCGTCCTAACTCTAGCGCTTCATTACTAAGGGACGCAAACATTGAACGTCCTGTAATTTAAAGCATGGAGCTTTATGGTAGAATATAAACCCTGTGGCTTTCAGCTTAAAAATTGTCCCAATAGCCGGTTTACTTATCGGCCGCGTGTCGCCAACTGTGGCTTGCGTAGGTCGCGTTGGTAAGGGGCACAGCGTTTTGGTCCTCAATCCGTGCGGTCTGTCTTAGTACCAAGCAAAGCAGTGCCTTGCAGATCTCCGTGCACACAAAGCGAGTGTAACGCAACTGCCAATCCCCGAGTCCATAGGAGGTTAGACTTCCCAAAAAAACTACCAATGTGGATATGTCTCCGTTTGACCCTGATCAATTCGCACTTGCTTGTCATTCCCACGACCTCCATTTTTAGCCATAGACAGCGTACTGTTCCCGATGATAAGCGTTGCCACGATGGCTAGACTAATCCACCGCTAAAGTGTGAAAAAGCGAGCTGAAGGTGATATGATAGTCCAGTACAGTGTGCTACTACGGGGTAGAGGATCCAATAGGTTTCAGCCGAATGAAGAATAATCATCGCAGCTGGCTTGATCCGGGCGCGTAACTGCTGCGAGAGAAGTCAACTGGTGGTGACCTCGCGGACTTTCCCTCAGATAGACGCATACGTTCACTAGACAGCAGCTGAACACGTATTGTTGATGGGGATTTAGGCGGTTTATCGTACGCCGCATGGAATGCAGCAACCGACAATGAAAGACGAGCACGAGCCGATAGCTCGCTCATAATTTCCGTGGTGTTGATAGAATGTGCTTAAGCTACCAGGCGTCTTTTACATCCTATTCCGTTCACTTTTATTACACTTACTCCGGCGTATCCCAGGGAGCATACCACAGCTTGGATCATAGAGATCTGTAGCCGGCTTGCATGGGCAGAAGCTATGAGGGAAAGAAAGGTGACGAAACATACGTACAGTAAAGGCACAAGCTCTAGCCTTCTAGCATGTTGCACGACAGTCTGCGTGTACTTCCACACCACACTCGTTTGGTTTCATCTCACGTAAAGCGTACAACATCCGTTTCGGCACGGGCGTTTACGTTAAAACGCGCCGCGTAACCTTAATACAGTCGGTTCAGGTGCATTGCGACCGACACATTAACTCAAGGTGATGGCTAGAAGGGGCTCAGTTCGGGCCTAACGCCTACTGAACCTACTGCGGTCGACTCAGAGGGTATATTGATTATGCCATCGTGGACATGTCACAATTTCGGACGAGGCGACCCCAAGGGCTGCGGTCCAATGCCTTGACTTGAGCACACCTGGAGCATTCGCTGTCCTCTACCCTAGAGTTCAAGTCGTGGGACGGTACTCGACCTTCCGCCTCCGCTTGGTGGCAGACTCTGATCGCCCTCACATCCTACATGGTACCTTAGGATTAAGCTTCCGAAAGGGGATATACTCACCCGTATCCAACCTCTATTTATAGGTGCAACTCGCGGCATGAATCGCCATCAAGAGTCTGCGACACTATACCCTACGCCCTCGAGGAAGCAACTCAGAAGCATACTGTGCTAGTGAGATATTAATGGAGTAACCATCCTACGGGTCATTCCAATTACTCTGTACGAATATGTGCTCTAAGCTTCTAGCGGGTCCACAAGAATACCGTCGGCCTCGTCCTGGTCCTGACGCCCCAGCTCGCGGATAATTGCCCATTAGCGGACGAGGTTCAGTCCCACCGACCCCACAGATTTTCCAGACCATCGGGGCCCGATCGCTCGCTGAGCACCTAGTCTCGGGGGATCTGTACCGATGGAAAACCCGACACAGGCAGAGGTCCCGGCGACAGCCGCACTTAATGTACCTCTCTCCAAGTTCATGAATCTGACGGGACGATGCAGACCTAATAACCAAGCCGAGCTTCTATTAGGAAAGAGGATCCGCCTTGGGCACCTGGATGACTCACAGAAAATTATGGGTTCTATGGACGCGTCTTGTATACAGATAGTCGGAAGACACCATGGTGGACAGCGGCGAACGAGGTACGATTTTTCGACATCCACGCGTTCCTCAGGGGGGCTTGGTAGGCCATGATTGAAGGACACATCAGGGGAGTCAACCAATGGTGCAGGTGGGACAAGCGTTGGGCAGGCTTGGACAGCAGGGAATCGCCACCAGGTAATACGTAGCGTAACAGCTTAAGAGGTTGGCCGTAGAACCGGCATTCTTAACTATGAGCATATACCAACCCCCCCTCCATCCGAGAGGAGCCTATCCGATCCGTCGTATCACATATGGGCGCATGCATGGACTACGGGACGGCTAGATATCTCCAATGGTAGGGGCGTGATGGTAAGTCTCTGATAGAGCTTTTCTTTTTATCGAGTACACGTCTAAGTAGCAGTGGCAGACCCATGGAGTCGCCGGTCATAAAGCCATTACCCGTCGTGCATGTGTGAGCATTTTGGAGCCTCTCGGATGATTATTAGTATCCGCCAGCATGATACAAAAGGCTAGTAGAGTGGCAGATACTAAGCGGTATGGAGCAAACTATACCTACTGAACCATAGGATTGATTTCTTCGTGTCAAGGCCGGTATGGCTTCAGAATTCTTTTGTTATAGTATGACTCGATGTATGATGGTCCGCGCCCTACCCCCCGGAGTGTACCTAAAAGAAATCTGCAGCGCGCCGAGTTATGACCTAGGTGAACCTACAATGCTGGCCGCGTGACTCGGAAGCAACTACCGTTAAATAATCATTCTCTAATGGAGCTAAAAGGCCATGCTCTAGGACTCGGTCGCGGCAAAGTCCCGGCTAAACTTATCCCCCATAGTGCGCCCCAACTAGACCTGCCAATCGGACGTTTTGAGGAGCACCGTATCATTGTCACGACTGCCGTGGGCGCTAAACCTAACTAGTCACACATGGATGTACACTTGAGGTGGCGTTACTCTGCACGGCCTATTGAAAGCGCGCGCCTGCTAAGTGCCCTTAGCGCTCTTAATTTTGGCGCGAGCATGTGTAGGTATCAATCCAGCGCCAACGCTTATTCGGCTATTGCGGAAAGTTAAGCTAACTACACTCGCTATCACCCAGACCATTGATTAGGATGATGTTACTAGCAACCGATCGGCAACCCTGCCGTCATGGTTTGTGTGGGTTTGGCTAGTTCCAAAACCTATTGTAACTTGAACTGGTGAGTGACTGGCCTGCATGAGGCTGTGCTTCTGGCAGAAAGGAGCGATCCAGGATATGAAAATAAACGAATCGCCACGTGAGCTCACTAAGTCTTCACCCATGTAGCACGAGTCTGGCATTTAATACCTACACGTTAGAGTCCACGATCCTATAACATGGGTAGCTGCCGTGAGAAGTCAATCATGGCACCACCATGCTTTAGCAAGCAGGGTCCAAATGAGAAGACTCCACTTACAGATATCAAAACCGAGCCTTAAGCACCTGAACGCGCCTTTCTTGCGTATTCCACATCCCACTTGGCTGCTTGAGAGAGTCGATATCGAAGTTAGGTCCGATTGTAATTCGCACGGGGTAGAGTGCCCAAGGCCAAAACTGATTAGTTTCCATAGAATCGGTGCTAGTAGTAGACAAGGGTGCGGCGGAGTCATCGTTTATGACTCTCTCCCGCAACATTGAAGCCGGCAAAGTAGATAAGGCGTTTATAAGACTGAGACTACGTGGTCCCTTACTATTTTTGCTGGAGGAGGACGCCGCACGCAGCATGCTTATAAGCACGTTGATGGATAGTGCCCACCCTTCAGTGGAAGAGTGTACGATTCGCCAGAGCGGAGGAACAGATTTCTCCGTGTCTGTATACTTTAGCGCCGTCACCCCACTGACTCATCGGTTAGTACAGCACGAATGGGGCCTTGGGACGCCGTAGTACCAGACGCATTTTTGAGAGTCTCGCTATATCGGCCCTCTGATACTCAGAGTGTCGTTAATCGTAGCTACTTATGAAAAGCTATTACGATCCTCAAATATTCGGACTTGAAGATCAGTTAAGCCTCAGTCGCCGGTCGTTAAAACTAACTCCTGGCTCTTAGGACGGAGAGTAGCAATATACGTGACATTGGCTTCACACTCTGTATTAGCTACACGTGCAGCTGTACAATTTTAGCCTTACAGAGTTTATGCGAAGTTTCACCCTTGTTGCCGTACAAGTAGGTAGTAAGTTTGCCACGTACTCAGAAGGCCTAAGCCTCAACACTGTTCGCTGCCTCCATGGGGGGGAGGCTTTGTTCCAGGCTCTGCTTTGTTTCCAACTAATTCTGAATGCTGAGCTAGGCATATTCGTGTATCCGGGGCAAACTCTTACGGTCTAAGTATATAGAACAGTGTCGTGCCAACCGGCCAGGTGTTCCCTTTGTCCGGTGGTGTATATTTAGCTCCAACAGACATCTGCAGCGGAATTTAAAGTCGTTATTTATTTCTTCGCGTAAGCCCATAATATTGGCATACCGGCCCAGTAATCGCGGCAAGTGATAGTGGCAACGCACAGACAGGGTTTAGCTCAAACTGACCGCCCACACGTAGATGGCTGTTATAGAAGGTTTTACGATTCAGACCTTC'"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kmers = [line.strip() for line in open('dataset_198_3.txt', 'r')]\n",
"string_from_genome_path(kmers)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def overlap(a, b, min_length=3):\n",
" \"\"\" Return length of longest suffix of 'a' matching\n",
" a prefix of 'b' that is at least 'min_length'\n",
" characters long. If no such overlap exists,\n",
" return 0. \"\"\"\n",
" start = 0 # start all the way at the left\n",
" while True:\n",
" start = a.find(b[:min_length], start) # look for b's prefix in a\n",
" if start == -1: # no more occurrences to right\n",
" return 0\n",
" # found occurrence; check for full suffix/prefix match\n",
" if b.startswith(a[start:]):\n",
" return len(a)-start\n",
" start += 1 # move just past previous match"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from itertools import permutations\n",
"list(permutations([1,2,3],3))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('ACGGATC', 'GATCAAGT'),\n",
" ('ACGGATC', 'TTCACGGA'),\n",
" ('GATCAAGT', 'ACGGATC'),\n",
" ('GATCAAGT', 'TTCACGGA'),\n",
" ('TTCACGGA', 'ACGGATC'),\n",
" ('TTCACGGA', 'GATCAAGT')]"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reads = ['ACGGATC', 'GATCAAGT', 'TTCACGGA']\n",
"list(permutations(reads, 2))"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Solve the Overlap Graph Problem (restated below).\n",
"from itertools import permutations\n",
"from collections import defaultdict\n",
"def overlap_graph(reads):\n",
" '''Input: A collection Patterns of k-mers.\n",
" Output: The overlap graph Overlap(Patterns), in the form of an adjacency list. (You may return the edges in any order.)'''\n",
" olaps = defaultdict(list)\n",
" for a, b in permutations(reads, 2):\n",
" if a[1:] == b[:-1]:\n",
" olaps[a].append(b)\n",
" return olaps"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AGGCA -> GGCAT\n",
"GGCAT -> GCATG\n",
"GCATG -> CATGC\n",
"CATGC -> ATGCG\n"
]
}
],
"source": [
"reads = ['ATGCG','GCATG','CATGC','AGGCA','GGCAT']\n",
"graph = (overlap_graph(reads))\n",
"for x,y in graph.iteritems():\n",
" print x,'->', ' '.join(y)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a -> b\n",
"b -> c\n"
]
}
],
"source": [
"d = {}\n",
"d['a'] = 'b'\n",
"d['b'] = 'c'\n",
"d\n",
"for x,y in d.iteritems():\n",
" print x,'->',y"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Solve the De Bruijn Graph from a String Problem.\n",
"from collections import defaultdict\n",
"def debruijnize(st, k):\n",
" \"\"\" Return a adjacency list, for each k-mer, its left\n",
" k-1-mer and its right k-1-mer in a pair \"\"\"\n",
" edges = defaultdict(list)\n",
" for i in range(len(st) - k + 1):\n",
" kmer = st[i:i+k]\n",
" edges[kmer[:-1]].append(kmer[1:])\n",
" return edges"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GC -> CG\n",
"AC -> CG\n",
"GT -> TC\n",
"CG -> GC,GT\n",
"TC -> CG\n"
]
}
],
"source": [
"edges = debruijnize(\"ACGCGTCG\", 3)\n",
"for x,y in edges.iteritems():\n",
" print x,'->', ','.join(y)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def de_bruijn_ize(st, k):\n",
" \"\"\" Return a list holding, for each k-mer, its left\n",
" k-1-mer and its right k-1-mer in a pair \"\"\"\n",
" edges = []\n",
" nodes = set()\n",
" for i in range(len(st) -k + 1):\n",
" edges.append((st[i:i+k-1], st[i+1:i+k]))\n",
" nodes.add(st[i:i+k-1])\n",
" nodes.add(st[i+1:i+k])\n",
" return nodes, edges"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def visualize_de_bruijn(st, k):\n",
" \"\"\" Visualize a directed multigraph using graphviz \"\"\"\n",
" nodes, edges = de_bruijn_ize(st, k)\n",
" dot_str = 'digraph \"DeBruijn graph\" {\\n'\n",
" for node in nodes:\n",
" dot_str += ' %s [label=\"%s\"] ;\\n' % (node, node)\n",
" for src, dst in edges:\n",
" dot_str += ' %s -> %s ;\\n' % (src, dst)\n",
" return dot_str + '}\\n'"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The gvmagic extension is already loaded. To reload it, use:\n",
" %reload_ext gvmagic\n"
]
}
],
"source": [
"%load_ext gvmagic"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%dotstr visualize_de_bruijn(\"TAATGCCATGGGATGTT\", 2)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%dotstr visualize_de_bruijn(\"TAATGCCATGGGATGTT\", 3)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%dotstr visualize_de_bruijn(\"TAATGCCATGGGATGTT\", 4)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%dotstr visualize_de_bruijn(\"TAATGGGATGCCATGTT\", 3)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"def debruijn_from_kmer(kmers):\n",
" '''Input: A collection of k-mers Patterns.\n",
" Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''\n",
" edges = defaultdict(list)\n",
" for kmer in kmers:\n",
" edges[kmer[:-1]].append(kmer[1:])\n",
" return edges"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AGG -> GGG\n",
"CAG -> AGG,AGG\n",
"GAG -> AGG\n",
"GGA -> GAG\n",
"GGG -> GGG,GGA\n"
]
}
],
"source": [
"kmers = ['GAGG','CAGG','GGGG','GGGA','CAGG','AGGG','GGAG']\n",
"for x,y in sorted(debruijn_from_kmer(kmers).iteritems()):\n",
" print x,'->', ','.join(y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Eulerian cycle and eulerian walk"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def generate_edges(graph):\n",
" edges = []\n",
" for node in graph:\n",
" for neighbour in graph[node]:\n",
" edges.append((node, neighbour))\n",
"\n",
" return edges"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 3), (1, 0), (2, 1), (2, 6), (3, 2), (4, 2), (5, 4), (6, 5), (6, 8), (7, 9), (8, 7), (9, 6)]\n"
]
}
],
"source": [
"graph = {0:[3],1:[0],2:[1,6],3:[2], 4:[2], 5:[4], 6:[5,8], 7:[9], 8:[7], 9:[6]}\n",
"edges = generate_edges(graph)\n",
"print edges"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def nodes_degrees(edges):\n",
" ''' This returns a dictionary of nodes and their degrees. The degree of a vertex is the number of edges connecting\n",
" it, i.e. the number of adjacent vertices. Loops are counted double, i.e. every occurence of vertex in the list \n",
" of adjacent vertices. '''\n",
"\n",
" degrees = dict() \n",
" for a, b in edges:\n",
" degrees[a] = degrees.get(a, 0) + 1\n",
" degrees[b] = degrees.get(b, 0) + 1\n",
" return degrees"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{0: 2, 1: 2, 2: 4, 3: 2, 4: 2, 5: 2, 6: 4, 7: 2, 8: 2, 9: 2}"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"edges = generate_edges(graph)\n",
"nodes_degrees(edges)"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def highest_degree_node(edges):\n",
" degrees = nodes_degrees(edges)\n",
" return max(degrees, key=degrees.get)"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"highest_degree_node(edges)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"g = {0:[3],1:[0],2:[1,6],3:[2], 4:[2], 5:[4], 6:[5,8], 7:[9], 8:[7], 9:[6]}\n",
"def eulerianCycle(g):\n",
" \"\"\" Input: The adjacency list of an Eulerian directed graph.\n",
" Output: An Eulerian cycle in this graph.\"\"\"\n",
" tour = []\n",
" src = g.iterkeys().next() # pick arbitrary starting node\n",
" def visit(n):\n",
" while len(g[n]) > 0:\n",
" dst = g[n].pop()\n",
" visit(dst)\n",
" tour.append(n)\n",
" visit(src)\n",
" tour = tour[::-1] # reverse and then take all but last node\n",
" return tour"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0->3->2->6->8->7->9->6->5->4->2->1->0'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tour = eulerianCycle(g)\n",
"'->'.join(str(i) for i in tour)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def read_input(filename):\n",
" f = open(filename, 'r')\n",
" g = dict()\n",
" for line in f:\n",
" line = line.strip()\n",
" line = line.split(' -> ')\n",
" g[int(line[0])] = [int(x) for x in line[1].split(',')]\n",
" return g"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0->2->6->541->1215->1214->1213->541->542->543->6->31->32->771->1355->2441->2442->2440->1355->1356->1354->771->770->769->32->33->156->911->910->1203->1201->1202->1674->2189->2190->2188->1674->1672->1673->1202->910->912->1232->2903->2902->2904->1232->1873->1874->1875->1232->1231->1233->912->156->154->553->555->554->932->931->933->554->2488->2489->2490->554->2201->2200->2202->554->2088->2086->2087->554->1184->1183->1185->2103->2102->2101->2396->2395->2397->2101->1185->554->154->155->33->6->4->8->83->231->1723->1725->1800->1799->1798->1725->1724->231->229->230->83->82->84->8->39->48->62->688->1088->1586->1585->1587->1088->1089->1087->688->690->967->1250->1249->1251->967->968->969->690->1486->1487->1488->690->689->62->63->61->48->1365->1363->2030->2031->2029->1363->1364->48->47->514->516->627->625->626->516->515->876->875->874->515->2323->2325->2324->515->47->483->482->481->530->1438->1440->1439->530->529->680->1860->1858->1859->2041->2042->2956->2957->2958->2042->2043->1859->680->679->681->529->531->926->927->1111->1113->1112->927->925->531->481->47->46->39->37->215->2057->2056->2058->215->216->1661->1660->1662->216->1015->1971->1970->1969->1015->1017->1016->216->214->2558->2559->2652->2651->2650->2559->2571->2570->2569->2559->2557->214->37->38->8->1621->1622->2452->2454->2453->1622->1623->2433->2432->2431->1623->8->126->124->478->480->962->1128->1167->1165->1166->1171->1930->1931->1932->1171->1173->1172->1166->1128->1127->1126->962->961->1037->1038->1036->1581->1579->1580->1639->1640->1641->1580->1036->961->963->480->1257->1256->2513->2512->2514->1256->1255->480->479->1749->1748->1747->479->124->125->238->240->1278->1276->1277->240->239->2018->2938->2940->2939->2018->2536->2537->2538->2018->2017->2019->239->125->8->7->9->24->462->949->950->2684->2683->2685->950->951->462->1655->1656->1654->462->460->1446->1444->1445->2337->2335->2336->1445->460->461->24->1839->1837->1838->24->22->96->1146->1145->1144->96->95->114->261->323->322->324->465->511->2920->2921->2922->511->512->513->465->1216->1217->1497->1495->1496->1217->1218->465->464->520->865->867->866->986->985->987->1681->1683->1682->987->866->520->1510->1511->1512->520->522->521->464->463->324->261->260->259->1319->1318->1320->259->114->113->268->878->879->877->268->1180->1389->2241->2240->2239->2386->2387->2388->2239->1389->1387->1388->1180->1182->1181->268->270->1626->1625->1624->270->269->363->361->362->269->113->218->894->893->892->218->217->2290->2326->2328->2882->2881->2883->2328->2327->2290->2291->2292->217->1631->1632->1630->217->219->113->112->330->329->728->727->1018->1019->1020->727->729->1062->1061->1060->2230->2232->2889->2888->2887->2232->2231->1060->729->329->328->525->2690->2691->2689->525->1407->1405->1406->1771->1977->1975->1976->1771->1773->1772->1775->1774->1776->1772->1406->525->523->524->1191->1705->1854->1853->1852->1705->1706->1707->1191->1190->1189->1649->1648->1650->2141->2140->2142->1650->1189->524->328->2553->2551->2552->328->112->213->2149->2151->2150->213->211->351->349->2343->2835->2833->2834->2343->2342->2341->349->350->2678->2677->2679->350->211->212->341->564->897->896->895->1043->1118->1117->1119->1043->1044->1179->1994->1995->1993->1179->1177->1178->1044->1042->895->564->562->1334->2839->2841->2840->1334->1909->1911->1910->1334->1335->1333->1419->1417->1418->2524->2525->2526->1418->1333->562->1285->1287->1286->562->1109->1108->1110->1761->1759->1760->1110->562->563->341->366->364->365->341->1281->1280->1279->341->340->452->451->798->797->2457->2455->2456->797->796->2159->2768->2769->2767->2159->2158->2160->796->451->545->602->755->754->756->602->603->601->545->544->546->585->2620->2622->2898->2896->2897->2622->2621->585->584->1695->1693->1905->2752->2754->2753->1905->1904->1903->1693->1694->584->583->978->977->976->583->546->451->453->616->676->1964->2894->2893->2895->1964->1965->2705->2706->2704->2947->2948->2949->2704->1965->1963->676->678->677->616->618->617->453->1528->2067->2065->2066->1528->1529->1530->453->340->342->376->377->2658->2656->2657->377->378->342->1030->1032->1135->1137->1136->1032->1031->1767->1765->2530->2531->2532->1765->1766->1031->342->212->112->95->94->22->78->264->263->262->78->1539->1537->1538->78->137->971->972->2519->2520->2518->972->970->137->606->604->685->686->687->1242->1240->1241->687->604->605->137->136->586->587->588->136->138->650->651->649->668->669->667->649->1021->1022->2345->2344->2346->1022->1023->2257->2259->2258->1023->649->138->1507->1508->1509->138->1393->1395->1394->138->78->77->76->367->662->663->661->367->369->2219->2379->2378->2377->2219->2218->2220->369->368->2306->2305->2307->368->76->22->432->2535->2533->2534->432->1844->1845->1843->432->1543->1545->1544->432->430->431->22->23->311->1050->2419->2421->2581->2582->2583->2421->2420->1050->1049->1048->311->310->312->1519->1520->2877->2876->2875->1520->1829->1828->2715->2714->2713->1828->1830->1520->1521->312->1459->1461->1460->312->23->101->256->842->841->843->2125->2127->2126->843->256->1195->1196->1197->256->258->298->299->300->2450->2449->2451->300->1569->1567->1568->300->1494->1492->1493->300->258->257->845->2015->2014->2016->845->844->1134->1966->1967->1968->1134->1132->1133->844->846->257->101->100->936->934->935->100->102->23->9->4->2900->2901->2899->4->27->129->285->296->568->570->1592->1688->1689->1687->1592->1593->2546->2545->2547->1593->1591->2161->2163->2162->1591->570->569->713->1534->1536->2199->2198->2322->2659->2660->2661->2988->2986->2987->2661->2322->2321->2320->2332->2333->2334->2352->2351->2350->2334->2320->2198->2197->1536->1535->713->712->2562->2560->2561->712->714->569->1096->2301->2299->2300->1096->1098->1097->569->296->537->536->1399->1401->1400->2365->2367->2366->1400->536->535->1746->1744->1745->535->296->1736->1735->1737->2252->2253->2251->1737->296->295->297->285->284->283->1652->1653->1651->283->129->127->1299->1298->1297->127->128->2522->2521->2523->128->2007->2006->2005->128->27->26->49->50->53->201->1156->1158->1157->1936->1938->1937->1157->201->200->2004->2003->2002->200->199->53->52->406->407->2264->2265->2263->407->408->52->337->338->339->52->2310->2308->2309->52->54->209->208->558->2280->2422->2448->2447->2446->2422->2424->2423->2280->2279->2278->558->556->557->208->232->233->748->750->749->233->254->1710->1708->1709->254->253->255->916->918->917->1404->1832->1833->1831->1404->1402->2668->2670->2669->1402->1877->1876->1878->1402->1403->917->255->267->266->265->487->890->1244->1823->2575->2577->2576->1823->1824->1822->2382->2380->2381->1822->1244->1243->1505->1504->1551->1550->1549->1504->1506->1243->1245->890->889->1541->1722->1721->1720->2206->2207->2208->1720->1541->1542->1540->889->891->487->2630->2629->2631->487->2080->2082->2081->487->1861->1863->1862->487->488->620->621->2625->2624->2623->621->619->488->489->265->255->1872->1870->2463->2462->2461->1870->1871->255->233->234->208->210->699->2154->2153->2152->699->697->698->210->657->655->656->210->54->1160->1161->1159->1435->1437->1436->1159->54->50->1347->1345->1346->1703->1702->1704->1346->50->51->1357->1358->1359->2808->2807->2806->1359->51->26->224->223->225->26->25->35->309->307->1001->1002->2097->2095->2096->1002->1000->1381->1382->1383->1000->307->308->35->34->220->947->948->946->1373->1374->1680->2786->2787->2785->1680->1679->1678->1374->1372->946->220->2664->2662->2663->220->221->726->725->724->221->571->572->1139->1138->1140->572->573->221->2675->2674->2676->221->222->34->153->1142->1143->1141->153->151->186->185->538->2460->2458->2459->538->540->539->185->184->151->152->34->36->25->244->371->825->823->824->1812->1810->1811->824->371->370->372->2711->2712->2710->372->2177->2579->2580->2578->2177->2178->2176->372->2137->2139->2271->2270->2269->2139->2138->372->244->1124->1306->1307->1308->2037->2036->2035->2916->2915->2914->2035->1308->1124->1125->1235->1325->1326->1324->2077->2079->2078->1324->1728->1727->1726->1324->1235->1312->1313->1314->1235->1236->1247->1248->2789->2788->2790->1248->1457->2166->2164->2165->1457->1456->1458->1664->1663->3000->2999->2998->1663->1665->1815->1813->1814->1665->1458->1248->1246->1236->1234->1941->1940->1939->2617->2626->2628->2627->2617->2618->2619->1939->1234->1627->1628->2871->2869->2870->1628->1803->1802->1801->1628->1629->1234->1125->1123->1634->1635->1633->1123->244->246->245->25->4->18->16->67->559->561->560->1091->1452->1450->1451->1091->1092->1090->560->67->317->318->404->785->784->786->404->405->486->485->703->705->704->1294->1296->2497->2498->2963->2962->2990->2991->2989->2962->2964->2498->2499->2501->2500->2502->2499->1296->1295->704->485->484->1066->1068->1067->484->405->403->318->316->684->2246->2245->2247->684->683->1427->1426->2931->2929->2930->1426->1428->683->682->1398->2133->2131->2132->1398->1397->2740->2742->2741->1397->1396->682->1148->1789->1791->1790->1148->1147->1149->682->316->67->179->781->1645->2146->2147->2148->1645->1646->1647->781->782->783->179->644->741->739->2196->2195->2194->2634->2859->2857->2858->2634->2632->2633->2194->739->740->1523->1524->1522->740->644->643->645->737->736->738->645->179->417->415->416->518->2071->2073->2072->518->519->1410->1409->1408->1503->2971->2972->2973->1503->1576->1577->2038->2040->2039->2854->2855->2856->2960->2961->2959->2856->2039->1577->1578->1503->1501->1502->1408->519->517->416->1471->2968->2970->2969->1471->1472->1473->1950->1948->2107->2108->2109->1948->1949->1473->416->179->178->180->67->119->332->2475->2474->2473->332->333->331->959->960->958->331->810->808->809->331->119->118->792->790->791->118->120->67->69->157->158->2401->2403->2942->2943->2941->2403->2402->158->159->804->803->802->159->69->68->574->576->575->68->16->17->646->647->648->17->4->165->164->1366->2010->2008->2009->1366->1368->1367->1697->1698->1696->1367->164->163->386->387->385->2979->2978->2977->385->2316->2315->2318->2317->2319->2315->2314->385->163->4->5->2745->2743->2744->5->2->3->98->97->501->500->499->97->1658->1659->1657->97->1155->1154->1925->1924->1926->2340->2339->2338->1926->1154->1153->97->111->109->2047->2912->2911->2913->2047->2049->2048->109->110->829->831->1265->1264->2797->2798->2799->1264->1266->831->830->2391->2390->2389->830->110->450->1227->2732->2731->2733->1227->1226->1225->1732->1733->1734->1225->450->449->448->110->97->99->1376->1377->1935->1933->1934->1377->1375->99->3->11->248->249->634->635->636->249->247->11->2168->2167->2700->2699->2698->2167->2169->2601->2599->2600->2169->11->10->965->966->964->1685->1684->1686->964->1353->1351->1352->964->10->66->65->64->271->272->468->533->534->592->594->593->734->733->735->902->2076->2398->2399->2400->2076->2074->2075->902->903->2011->2013->2012->2906->2905->2907->2012->903->901->735->593->534->532->468->466->2113->2115->2114->466->1115->1114->1116->466->467->2709->2708->2707->467->272->313->746->837->835->1315->1466->1465->1467->1315->1316->1317->835->1078->1464->1463->1462->1078->1080->1079->2593->2594->2595->1079->835->836->746->747->745->313->2639->2640->2638->313->314->436->447->598->707->708->706->598->2329->2331->2330->598->600->1856->1855->1857->600->599->731->1448->1447->1449->731->732->730->599->447->446->445->1590->1589->1588->2354->2355->2353->1588->445->436->437->869->868->870->1176->1175->1174->2268->2267->2266->1174->870->437->776->777->775->437->438->314->315->1490->1489->1491->315->272->1546->1548->1547->272->273->1821->1820->1819->2647->2648->2649->1819->273->64->10->60->59->58->72->133->2680->2681->2682->133->134->188->457->1293->1292->1291->457->1083->1350->1348->1349->2815->2816->2817->1349->1083->1082->1081->457->458->1527->1525->1526->458->459->759->1258->2735->2734->2736->1258->2567->2568->2566->1258->1599->1598->1597->1258->1259->2665->2666->2667->1259->1882->1884->2411->2412->2410->1884->1883->1259->1260->759->757->758->459->188->374->702->701->819->923->924->922->2507->2506->2508->922->1670->1669->1671->922->819->1453->1455->1454->819->817->850->852->851->817->818->1985->1984->1986->818->701->720->719->718->701->700->1616->1615->1617->700->374->373->409->410->580->581->582->410->411->473->2757->2756->2755->473->474->508->510->863->864->862->1895->1894->1896->862->510->1336->1338->1337->2950->2952->2951->1337->510->1102->1103->1104->510->509->787->789->788->509->474->472->872->1230->1951->1953->2437->2439->2438->1953->1952->1230->1229->1228->872->871->1607->1608->1606->871->1073->1074->1979->1980->1978->1074->1072->871->873->472->411->373->375->780->1596->2027->2028->2026->2814->2812->2813->2026->1596->1594->1595->780->778->779->1868->1867->1869->779->375->591->2759->2758->2760->591->589->590->375->1211->1302->1301->1300->2179->2180->2784->2783->2782->2180->2181->1300->1211->1210->1212->1788->1787->1786->2486->2554->2555->2556->2486->2485->2487->1786->1212->375->1003->1004->1005->375->188->189->884->883->1267->1268->1269->883->885->189->187->134->135->659->658->660->135->72->70->71->81->475->477->1392->1391->1390->477->476->847->848->849->476->1150->2694->2692->2909->2910->2908->2692->2693->1150->1152->1151->476->1077->1076->1075->476->81->1432->1780->1991->1992->1990->1780->1782->1781->1432->1434->1433->81->79->282->711->812->811->1558->1560->1559->811->1554->1553->1552->811->813->711->709->710->2821->2822->2823->710->1846->1847->1848->710->1095->1093->1094->710->282->281->280->2436->2434->2435->2739->2737->2738->2435->280->79->1414->1415->1416->79->80->71->58->10->14->88->335->768->767->766->335->336->334->2848->2850->2849->334->1223->1224->1222->334->88->90->89->527->528->1441->1442->1443->2598->2596->2597->1443->528->526->89->14->15->13->654->1069->2244->2242->2243->1069->1071->1070->654->652->905->988->989->2313->2311->2312->989->990->905->904->906->652->1923->1921->1922->652->653->1208->1209->1956->1955->1954->1209->1207->1794->1792->2295->2294->2293->1792->1912->1913->1914->1792->1793->1207->653->13->30->86->87->2796->2795->2794->87->104->252->980->2481->2479->2480->980->981->979->2288->2287->2289->979->252->505->506->2803->2805->2804->506->507->252->279->1054->1055->1056->2867->2868->2866->1056->279->278->624->622->623->1412->1411->1413->623->278->277->252->1273->1275->1274->252->251->2262->2935->2937->2936->2262->2261->2260->2476->2477->2478->2260->251->250->104->1385->1386->2886->2884->2885->1386->1816->1817->1818->1386->1384->104->105->183->181->182->379->381->380->182->2932->2934->2984->2985->2983->2934->2933->182->105->103->886->1479->1478->1477->1636->1637->1638->1477->886->888->887->103->1850->1880->1879->1881->2529->2528->2527->1881->1850->1849->2362->2363->2364->1849->1851->2730->2728->2729->1851->103->87->85->142->143->938->939->937->143->288->815->816->814->288->722->881->880->882->722->1220->1221->1219->722->723->721->288->353->2750->2830->2832->2831->2750->2749->2751->353->352->354->288->2653->2655->2654->288->287->286->143->144->206->412->414->420->1750->2203->2205->2204->1750->1751->1752->420->418->717->920->1311->1310->1309->920->921->942->2565->2563->2564->942->941->2586->2585->2584->941->940->2098->2100->2099->940->921->919->717->1676->1677->1809->1808->1807->1677->1675->2089->2090->2091->2482->2484->2483->2091->2186->2185->2187->2091->1675->717->716->1340->1341->1339->716->715->2643->2642->2641->715->418->2541->2539->2540->418->419->414->2717->2716->2718->414->413->206->207->2645->2646->2644->207->205->359->360->642->744->2120->2119->2121->744->743->742->642->2064->2063->2062->2111->2112->2110->2062->642->640->1289->1290->1288->640->641->360->1361->1362->1360->360->358->1982->1981->1983->358->205->290->289->306->305->394->395->396->610->928->929->930->2892->2890->2891->930->610->612->611->396->305->304->289->2809->2811->2810->289->2573->2572->2574->289->291->205->275->276->274->205->144->85->30->826->828->1106->1107->1989->2000->1999->2001->1989->1988->1987->1107->1105->828->827->30->2405->2406->2404->30->28->1199->1200->1198->28->29->141->760->762->1304->1601->2851->2853->2852->1601->1600->1602->2636->2637->2635->1602->1887->2224->2721->2719->2720->2224->2226->2225->1887->1885->1886->1897->2695->2696->2697->1897->1898->1899->1886->1602->1304->1303->1305->762->761->1051->2214->2349->2347->2348->2214->2212->2213->1051->1053->1052->761->141->139->140->29->13->293->292->294->13->176->595->596->597->2763->2761->2762->597->176->1205->1666->1667->2606->2607->2605->1667->1915->1916->1917->1667->1668->1205->1206->2590->2591->2592->1206->1204->2837->2836->2838->1204->2800->2802->2801->1204->2277->2276->2842->2844->2843->2276->2775->2773->2774->2276->2275->1204->176->175->177->13->10->12->117->116->632->1742->1743->2702->2703->2701->1743->1741->632->631->2722->2723->2724->631->633->116->115->171->169->204->202->639->1100->1101->1099->639->638->637->957->956->955->1763->1762->1764->955->637->2393->2392->2394->637->202->203->169->170->1785->1929->1928->1927->1785->1783->1784->170->115->145->443->490->491->492->1574->1575->1604->1605->1943->1944->1942->1605->1603->1575->1573->492->443->1865->1866->1864->443->1162->1164->1163->443->442->444->984->1779->1777->1778->984->982->983->1739->1738->1795->1796->1797->1738->1740->983->444->615->613->1840->1841->1842->613->614->444->548->549->547->444->145->2144->2371->2373->2372->2144->2143->2145->145->147->243->2510->2509->2511->243->242->383->384->382->242->1692->1890->1888->2183->2184->2613->2612->2611->2184->2182->1888->1889->1692->1691->1690->242->241->147->146->115->130->996->1007->1423->1425->1424->1007->1006->1008->2608->2609->2610->1008->996->994->995->2384->2383->2385->995->130->857->1187->1332->1330->1974->1972->1973->1330->1331->1187->1186->1614->1612->1613->1186->1188->2829->2827->2828->1188->857->858->2283->2878->2879->2880->2283->2282->2281->858->856->1284->1282->1283->856->130->132->131->1431->1430->1429->131->115->12->3->1->497->496->498->1->2982->2980->2981->1->21->493->494->1122->1121->1120->494->495->1962->1960->2106->2216->2370->2368->2369->2216->2217->2215->2106->2105->2104->1960->1961->2069->2068->2070->1961->495->1718->1719->2726->2725->2727->1719->1717->495->1371->1369->1370->495->21->44->191->192->190->1920->1919->1918->190->44->45->1327->1328->1329->45->43->441->1755->1753->1754->441->440->1514->1513->1515->440->439->43->235->236->578->855->2992->2993->2994->855->853->854->1768->1770->1769->854->1714->2032->2034->2055->2672->2671->2673->2055->2054->2053->2034->2033->1714->1715->1716->854->578->579->1902->1900->1901->579->577->1013->1012->1014->577->236->321->392->840->1996->1998->2864->2865->2863->1998->1997->840->838->839->392->391->552->550->1805->2927->2928->2926->1805->1806->1827->1825->1826->1806->1804->550->1480->1481->1482->550->551->2124->2123->2686->2687->2688->2123->2249->2250->2248->2123->2122->551->391->393->859->2543->2542->2589->2746->2747->2748->2589->2588->2587->2542->2544->859->861->860->1033->2223->2221->2361->2359->2360->2221->2222->1033->1034->1323->1321->1322->2792->2791->2793->1322->1034->1035->860->393->321->320->1644->1643->1642->320->1063->2296->2297->2298->1063->1064->1065->2492->2493->2491->1065->1609->1610->1611->1065->320->319->993->992->2358->2356->2357->992->991->319->236->237->1046->1045->1047->237->43->21->42->40->2085->2285->2286->2284->2085->2083->2084->40->41->607->608->953->954->1583->1582->1584->954->952->608->609->1620->1619->2272->2274->2273->1619->1618->609->41->56->73->75->1252->1253->2060->2059->2117->2118->2116->2059->2061->1253->1254->75->74->502->503->504->74->2427->2425->2426->2430->2429->2428->2426->74->56->55->92->167->168->753->751->752->799->1169->1168->1170->2135->2136->2134->1170->799->800->1484->1729->1730->1731->1484->1485->1483->800->801->752->168->166->92->91->93->691->2504->2503->2505->691->692->693->93->55->343->344->425->426->424->2945->2946->2944->424->344->345->975->1379->1957->1958->2236->2237->2238->1958->1959->2415->2414->2413->1959->1379->1469->1470->1468->2495->2494->2496->1468->1379->1380->1378->975->973->974->345->55->2818->2819->2820->55->194->195->193->2771->2770->2772->193->55->122->123->303->302->346->2470->2471->2472->346->348->356->357->421->422->423->357->1908->1906->1907->357->355->794->900->899->898->2974->2976->2975->898->1517->1518->2092->2093->2094->2766->2765->2764->2094->1518->1516->898->794->2044->2860->2861->2862->2044->2045->2046->794->793->1836->2022->2021->2020->1836->1835->1834->793->1498->1500->2516->2517->2515->1500->1499->793->795->355->2443->2445->2444->355->348->2845->2847->2846->348->347->302->301->435->471->470->469->435->434->2824->2826->2825->434->433->2023->2024->2025->433->301->123->121->55->108->326->325->327->108->1028->1027->1029->1533->1532->1531->1029->108->107->807->805->806->2374->2376->2375->806->107->399->694->833->834->832->1474->1700->1699->1701->1474->1475->1476->832->694->822->820->1010->1009->1011->820->821->944->945->943->821->694->696->2614->2616->2615->696->1421->2234->2233->2995->2997->2996->2233->2235->1421->1420->1557->1556->1555->1420->1422->696->695->399->398->397->2549->2550->2548->397->107->106->427->2228->2229->2227->427->429->566->1129->1131->1130->566->565->567->429->1563->2781->2779->2780->1563->1562->1561->429->428->1239->1271->2211->2209->2210->1271->1272->1270->2917->2918->2919->1270->1239->1237->1238->428->106->162->160->773->774->772->1565->1566->1564->772->160->2050->2466->2464->2465->2050->2051->2193->2192->2191->2051->2052->160->161->2408->2407->2409->161->106->55->57->41->401->402->2965->2967->2966->402->400->1085->1084->1086->2925->2923->2924->1086->400->41->21->174->1261->1262->1263->174->172->671->1756->1757->1758->671->1192->1194->1193->2304->2302->2303->1193->671->672->670->1947->2256->2255->2254->1947->1946->1945->670->1713->1711->1712->670->1570->1571->1572->670->172->173->1040->1041->1039->173->21->20->389->1892->2874->2872->2873->1892->1891->2129->2173->2174->2175->2129->2130->2128->2778->2777->2776->2128->1891->1893->389->390->2954->2953->2955->390->388->20->226->628->629->907->908->909->1059->1057->1058->909->1024->1026->1025->909->629->630->666->665->664->630->226->2170->2171->2172->226->227->2417->2416->2418->227->228->20->197->196->764->763->765->914->913->999->2469->2467->2468->999->998->997->913->915->765->2157->2156->2155->765->196->198->2603->2604->2602->198->20->19->1->0->149->148->150->456->455->675->673->674->1344->1342->1343->674->455->454->150->0'"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = read_input('rosalind_ba3f.txt')\n",
"def eulerianCycle(g):\n",
" \"\"\" Input: The adjacency list of an Eulerian directed graph.\n",
" Output: An Eulerian cycle in this graph.\"\"\"\n",
" tour = []\n",
" src = g.iterkeys().next() # pick arbitrary starting node\n",
" def visit(n):\n",
" while len(g[n]) > 0:\n",
" dst = g[n].pop()\n",
" visit(dst)\n",
" tour.append(n)\n",
" visit(src)\n",
" tour = tour[::-1] # reverse and then take all but last node\n",
" return tour\n",
"tour = eulerianCycle(g)\n",
"'->'.join(str(i) for i in tour)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"def eulerianCycle(graph):\n",
" # init cycle with any node\n",
" cycle = [graph.keys()[0]]\n",
" ##### move all edges from graph to cycle\n",
" while len(graph) > 0:\n",
" # if cycle is closed shift to a node with remaining targets\n",
" if cycle[0] == cycle[-1]:\n",
" while not cycle[0] in graph:\n",
" cycle.pop(0)\n",
" cycle.append(cycle[0])\n",
" ##### source is the last node of cycle\n",
" source = cycle[-1]\n",
" ##### move one target from graph to cycle\n",
" cycle.append(graph[source].pop())\n",
" ##### clean sources without targets\n",
" if len(graph[source]) == 0: del graph[source]\n",
" return cycle"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0, 1, 2, 3, 6, 7, 8, 9]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}\n",
"nodes = g.keys()\n",
"nodes"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def outdegree(g,node):\n",
" \"\"\"Returns the number of outgoing edges from a node\"\"\"\n",
" try:\n",
" return len(g[node])\n",
" except:\n",
" return 0\n",
"outdegree(g,4)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def indegree(g,node):\n",
" \"\"\"Returns the number of incoming edges from a node\"\"\"\n",
" incoming = [k for k,v in g.iteritems() if node in v]\n",
" return len(incoming)\n",
"indegree(g,6)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def isSemiBalanced(g,node):\n",
" return abs(indegree(g,node) - outdegree(g,node)) == 1"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def isBalanced(g,node):\n",
" return indegree(g,node) == outdegree(g,node)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def head(g):\n",
" \"\"\"Returns the head of a graph in case of Eulerian Walk\"\"\"\n",
" nodes = g.keys()\n",
" head_node = None\n",
" for node in nodes:\n",
" if indegree(g,node) == outdegree(g,node) - 1:\n",
" head_node = node\n",
" break\n",
" return head_node"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def tail(g):\n",
" \"\"\"Returns the tail of a graph in case of Eulerian Walk\"\"\"\n",
" neighbors = [v for k in g for v in g[k]]\n",
" tail_node = None\n",
" for node in neighbors:\n",
" if indegree(g,node) == outdegree(g,node) + 1:\n",
" tail_node = node\n",
" break\n",
" return tail_node"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6\n",
"4\n",
"{0: [2], 1: [3], 2: [1], 3: [0, 4], 4: [6], 6: [3, 7], 7: [8], 8: [9], 9: [6]}\n",
"src 0\n",
"tour [0, 3, 6, 9, 8, 7, 6, 4, 3, 1, 2, 0]\n",
"tour [0, 2, 1, 3, 4, 6, 7, 8, 9, 6, 3]\n",
"sti 5\n",
"tour [6, 7, 8, 9, 6, 3, 0, 2, 1, 3, 4]\n"
]
}
],
"source": [
"g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}\n",
"#kmers = ['CTTA', 'ACCA', 'TACC', 'GGCT', 'GCTT', 'TTAC']\n",
"#kmers = [line.strip() for line in open('dataset_203_6.txt', 'r')]\n",
"#g = debruijn_from_kmer(kmers)\n",
"#setdefault is useful for setting defaults while or after filling the dict.\n",
"initialNode = head(g)\n",
"print initialNode\n",
"terminalNode = tail(g)\n",
"print terminalNode\n",
"g.setdefault(terminalNode, []).append(initialNode)\n",
"print g\n",
"# graph g has an Eulerian cycle\n",
"tour = []\n",
"src = g.iterkeys().next() # pick arbitrary starting node\n",
"print \"src\", src\n",
"def visit(n):\n",
" while len(g[n]) > 0:\n",
" dst = g[n].pop()\n",
" visit(dst)\n",
" tour.append(n)\n",
" \n",
"visit(src)\n",
"print \"tour\", tour\n",
"tour = tour[::-1][:-1] # reverse and then take all but last node\n",
"print \"tour\", tour\n",
"sti = tour.index(initialNode)\n",
"print \"sti\", sti\n",
"tour = tour[sti:] + tour[:sti]\n",
"print \"tour\", tour"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def eulerianWalk(g):\n",
" initialNode = head(g)\n",
" terminalNode = tail(g)\n",
" #setdefault is useful for setting defaults while or after filling the dict.\n",
" g.setdefault(terminalNode, []).append(initialNode)\n",
" # graph g has an Eulerian cycle\n",
" tour = []\n",
" src = g.iterkeys().next() # pick arbitrary starting node\n",
" def visit(n):\n",
" while len(g[n]) > 0:\n",
" dst = g[n].pop()\n",
" visit(dst)\n",
" tour.append(n)\n",
" \n",
" visit(src)\n",
" tour = tour[::-1][:-1] # reverse and then take all but last node\n",
" sti = tour.index(initialNode)\n",
" tour = tour[sti:] + tour[:sti]\n",
" return tour"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'6->7->8->9->6->3->0->2->1->3->4'"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = {0:[2], 1:[3], 2:[1], 3:[0,4], 6:[3,7], 7:[8], 8:[9], 9:[6]}\n",
"#g = read_input('dataset_203_5.txt')\n",
"'->'.join(str(i) for i in eulerianWalk(g))"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"defaultdict(, {'CTT': ['TTA'], 'ACC': ['CCA'], 'GCT': ['CTT'], 'GGC': ['GCT'], 'TAC': ['ACC'], 'TTA': ['TAC']})\n"
]
},
{
"data": {
"text/plain": [
"['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#kmers = [line.strip() for line in open('dataset_198_3.txt', 'r')]\n",
"kmers = ['CTTA', 'ACCA', 'TACC', 'GGCT', 'GCTT', 'TTAC']\n",
"g = debruijn_from_kmer(kmers)\n",
"print g\n",
"eulerianWalk(g)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def stringReconstruction(kmers):\n",
" \"\"\"Input: An integer k followed by a list of k-mers Patterns.\n",
" Output: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may\n",
" return any one.)\"\"\"\n",
" g = debruijn_from_kmer(kmers)\n",
" walk = eulerianWalk(g)\n",
" return walk[0] + ''.join(map(lambda x: x[-1], walk[1:])) "
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'GGCTTACCA'"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stringReconstruction(kmers)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'CAAATGCATCATACGCTCACCCAG'"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kmers = ['AAAT','AATG','ACCC','ACGC','ATAC','ATCA','ATGC','CAAA','CACC','CATA','CATC','CCAG','CCCA','CGCT','CTCA','GCAT',\n",
" 'GCTC','TACG','TCAC','TCAT','TGCA']\n",
"stringReconstruction(kmers)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'GGCTTACCA'"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#map() will apply its lambda function to the elements of the argument lists, i.e. it first applies to the elements with the 0th index, then to the elements with the 1st index until the n-th index is reached\n",
"walk = ['GGC', 'GCT', 'CTT', 'TTA', 'TAC', 'ACC', 'CCA']\n",
"walk[0] + ''.join(map(lambda x: x[-1], walk[1:])) "
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.setrecursionlimit(2500)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'AATAAAAGTCAGTATCAGGTGTTCACCGTTATCTCCTGCCGGGCACGGGGGAGGTACCGTGCGTATCTAGGGAGAAGCCCGTCGAGTTCAGGGCCCAATGGTAGGTACTCCCGGTCCACTAATGACCCAACTAGACCACAAGGGGACTACGTACCCTACTCGCGAGCCTAGCTAACTAACTGCCCTGGCTTACGCATGACATACTGCGGTTTGCAGGGTGATGCTAGTATACATGGATGTGATAGCAATCTTTCAACCAAGCATTAGTACCATCGCACTGGAGCGATTTAGGTCGAAACTCATATTCGCTTTATGTGTCCTGGACGTGTATACAAGTCCTTCCTTACTTTACACAATAGCTTGGTACAGTGTCTCGACTCAACTTAAGGTCTTCACCATTTGATCTGTGCTCGTGTGCAGTTAGAGGAGTAGCCCCCACTTTCGTTTCGACGAGTAGACCCACAGCGTTGCGACGTCCACCACCTTTCATCACCACAGTCTCTTCTGGACTTAACAGCATTTATAATACCAGAGGCGCGGTATCACTGGATCTTCAACTGCGAATGTGAGGAGGTCCTGGATTATAGTGTCCTGCAACATGGATGCACGACCCGGAAACACCGGGATTGATCCAGACGAGATTGACTATTTAACGAGGAAGGTTCGTAATAGCATCGGGAGAGGATATGGCTCACCAGTCTTCGGTATATGGTTGACTGCTGACAGACTGTCGAGCAGCGATCGTTATAGAAGCTCCCTTTTAGCGCTCCGTGCCCCAGCTGATTGGTCGGTATATAATGCCTAGCGGGCTGCTTTACTCACGTTCCGCTTGTGGCGCCACAGACCCACGGGCAAAAGAGGGGGTCACCGCAAGTCTCCGTAGGCGCGAGTAACCTTATATTCACTTAACTTTTTTTCAAGGAAATCACAGCTTGGGACTACGAGCGACGACGCAATTGCATTGACGGCTAGTTGTGATATGCCCATTACACTGCGAGATAAGGGGAAACACATCACGGAATCCAAACCTCTACATTCCTTATAGGGCAAGTGATGTTCGGATCCCAAAGTATCGTGCATGAGTAGAAGCCGTGAGGGTATTCGAATCAGTGAATTACGGCGCGCGGAGGCAACCATACATATGTCAGACTATGCCCCGATGGATGAGGCCCATCAGGCTAAGCGATCATTCGGACGGGCCCCGATTCCTAACGCAGCCTTGTATAGCTGATGAAAGTGAGAGACGCACTCGATGCACATGTCCTGTCGTGTTATGGACCCAACGCGAGAAGAAGCACGTGAAACGTACAAGGGGCCGGACTGCAGCAAGGTGTTCGGTGTATCAAACACGTGTCAGCCCAGAGAAGTAGCCAACCGAGAGTATGAAACATCAGTAACAAGCTTACTCGGTTGAAGCGATGATCCCTATCAAGGGTCATCGGCACCACCTTTGAAGTAGCGGTAAGACCCCTCAGTTAAGAGTGGCGTTAGGTGGGACGGCATATGATCCTATTATTAGGTACACAGGGACAACAGAGTAGACTCATGAGTCATGCCAAACCGCTTCCGCTATGGTTTATTACAAAAAGACGCGGCCCTAAATTAAGCCTATGTGGAGACCATGGACCAATGACAAGATGTGAGTTGAGGCGTGGGTAGAGTGCTATGAATCAACGTTTAACTCGGGTAATGGCTCTGACACAACCGACTAATATCCCCGTTAGACAGACTGCCACGACGAGGACCTCCACCGATGGGGTGGAGTGCGACCTCTGGCGGGCAATGCGTAAGCTACAATTGATCTGCCTTAATCTGTCACCGGCAGTTATCCTCATATGAAGCGCTGCGGCAATGCCGCCTGATGAAGGTGCCGGATGACTGCGCTCTATACGAGGTTCTACGGACACGTTCCGTTACTCCAGACTACCCGTCAAACCTCCCGTCCGTCCACGCCATAATAGGTCCACTGAATGTGGATATTGCTAAATTGGAACAGATGTCGGTTCTAACAATATATAAAGAGGTTAAGTTACTTTGTAGTGCGAATTAAGTCCGGCATATGAGATACATCTTCAGCCATTGCCAGTTGTGCGTTATGCCCAACTCGGTTAGCTGTTATCCTTTTGATTACCAGTCTTCGGACCCCCCAGTATCGTAGCCTTTAGATAGATGCGGTCCACCTGGGTGGTTCCTCTGTGTAAATCGAAAAGAGTATGGCGACGCCGGGTTTCACTTCAGAA'"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kmers = [line.strip() for line in open('dataset_203_6.txt', 'r')]\n",
"stringReconstruction(kmers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## k universal circular string"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['000', '001', '010', '011', '100', '101', '110', '111']"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import itertools\n",
"[''.join(binary) for binary in itertools.product('01', repeat = 3)]\n"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#CODE CHALLENGE: Solve the k-Universal Circular String Problem.\n",
"import itertools\n",
"def k_universal_circular_string(k):\n",
" \"\"\"Input: An integer k.\n",
" Output: A k-universal circular string.\"\"\"\n",
" binary_kmers = [''.join(binary) for binary in itertools.product('01', repeat = k)]\n",
" g = debruijn_from_kmer(binary_kmers)\n",
" cycle = eulerianCycle(g)\n",
" return cycle[0] + ''.join(map(lambda x: x[-1], cycle[1:-(k-1)]))"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0101111010011000'"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"k_universal_circular_string(4)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'11101000'"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cycle = ['11', '11', '10', '01', '10', '00', '00', '01', '11']\n",
"cycle[0] + ''.join(map(lambda x: x[-1], cycle[1:-2]))"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0001011111111011111010111101101111001111110010111011101010111001101110001111100010110110101101001111010010110011101100101011000110110000111100001010101001101010001110100010100100111001001010000110100000111000001001100110001001000110010000100010000001100000'"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"k_universal_circular_string(8)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['11', '10', '01', '10', '00', '00']"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cycle = ['11', '11', '10', '01', '10', '00', '00', '01', '11']\n",
"cycle[1:-2]"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"CODE CHALLENGE: Implement StringSpelledByGappedPatterns.\n",
" Input: Integers k and d followed by a sequence of (k, d)-mers (a1|b1), … , (an|bn) such that\n",
" Suffix(ai|bi) = Prefix(ai+1|bi+1) for 1 ≤ i ≤ n-1.\n",
" Output: A string Text of length k + d + k + n - 1 such that the i-th (k, d)-mer in Text is equal to\n",
" (ai|bi) for 1 ≤ i ≤ n (if such a string exists).\n",
"Sample Input:\n",
"4 2\n",
"GACC|GCGC\n",
"ACCG|CGCC\n",
"CCGA|GCCG\n",
"CGAG|CCGG\n",
"GAGC|CGGA\n",
"Sample Output:\n",
"GACCGAGCGCCGGA"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def gappedPatterns(f):\n",
" firstPatterns = []\n",
" secondPatterns = []\n",
" for lines in f:\n",
" lines = lines.strip()\n",
" lines = lines.split('|')\n",
" firstPatterns.append(lines[0])\n",
" secondPatterns.append(lines[1])\n",
" return firstPatterns, secondPatterns\n",
"def stringSpelledByGappedPatterns(firstPatterns, secondPatterns, k, d):\n",
" prefixString = string_from_genome_path(firstPatterns)\n",
" suffixString = string_from_genome_path(secondPatterns)\n",
" for i in range(k+d, len(prefixString)):\n",
" if prefixString[i] != suffixString[i-k-d]:\n",
" return \"there is no string spelled by the gapped patterns\"\n",
" return prefixString + suffixString[-(k+d):] "
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'GACCGAGCGCCGGA'"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f = open('StringSpelledByGappedPatterns.txt', 'r')\n",
"firstPatterns, secondPatterns = gappedPatterns(f)\n",
"stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 4, 2)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'there is no string spelled by the gapped patterns'"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f = open('dataset_204_14.txt', 'r')\n",
"firstPatterns, secondPatterns = gappedPatterns(f)\n",
"stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 50, 200)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Solve the String Reconstruction from Read-Pairs Problem.\n",
"#Input: Integers k and d followed by a collection of paired k-mers PairedReads.\n",
"#Output: A string Text with (k, d)-mer composition equal to PairedReads."
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"read_pairs = [('TAA','GCC'), ('AAT','CCA'), ('ATG','CAT'), ('TGC','ATG'), ('GCC','TGG'), ('CCA','GGG'), ('CAT','GGA'), ('ATG','GAT'), ('TGG','ATG'), ('GGG','TGT'), ('GGA','GTT')]\n",
"#read_pairs = [('GACC','GCGC'),('ACCG','CGCC'),('CCGA','GCCG'),('CGAG','CCGG'),('GAGC','CGGA')]"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"def debruijn_from_readpairs(read_pairs):\n",
" '''Input: A collection of k-mers Patterns.\n",
" Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''\n",
" edges = defaultdict(list)\n",
" for pair in read_pairs:\n",
" edges[(pair[0][:-1], pair[1][:-1])].append((pair[0][1:], pair[1][1:]))\n",
" return edges"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(list,\n",
" {('AA', 'CC'): [('AT', 'CA')],\n",
" ('AT', 'CA'): [('TG', 'AT')],\n",
" ('AT', 'GA'): [('TG', 'AT')],\n",
" ('CA', 'GG'): [('AT', 'GA')],\n",
" ('CC', 'GG'): [('CA', 'GG')],\n",
" ('GC', 'TG'): [('CC', 'GG')],\n",
" ('GG', 'GT'): [('GA', 'TT')],\n",
" ('GG', 'TG'): [('GG', 'GT')],\n",
" ('TA', 'GC'): [('AA', 'CC')],\n",
" ('TG', 'AT'): [('GC', 'TG'), ('GG', 'TG')]})"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = debruijn_from_readpairs(read_pairs)\n",
"g"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('TA', 'GC'),\n",
" ('AA', 'CC'),\n",
" ('AT', 'CA'),\n",
" ('TG', 'AT'),\n",
" ('GC', 'TG'),\n",
" ('CC', 'GG'),\n",
" ('CA', 'GG'),\n",
" ('AT', 'GA'),\n",
" ('TG', 'AT'),\n",
" ('GG', 'TG'),\n",
" ('GG', 'GT'),\n",
" ('GA', 'TT')]"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walk = eulerianWalk(g)\n",
"walk"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['TA', 'AA', 'AT', 'TG', 'GC', 'CC', 'CA', 'AT', 'TG', 'GG', 'GG', 'GA']\n",
"['GC', 'CC', 'CA', 'AT', 'TG', 'GG', 'GG', 'GA', 'AT', 'TG', 'GT', 'TT']\n"
]
}
],
"source": [
"firstPatterns = [a for (a,b) in walk]\n",
"secondPatterns = [b for (a,b) in walk]\n",
"print firstPatterns\n",
"print secondPatterns"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'TAATGCCATGGGATGTT'"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 3, 1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Quiz que 3"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(list,\n",
" {('AC', 'AT'): [('CC', 'TA'), ('CT', 'TT')],\n",
" ('AT', 'TG'): [('TA', 'GA'), ('TT', 'GA')],\n",
" ('CA', 'GA'): [('AC', 'AT')],\n",
" ('CC', 'TA'): [('CG', 'AC')],\n",
" ('CG', 'AC'): [('GA', 'CT')],\n",
" ('CT', 'AG'): [('TG', 'GC')],\n",
" ('CT', 'TT'): [('TG', 'TC')],\n",
" ('GA', 'CT'): [('AA', 'TT'), ('AT', 'TG'), ('AT', 'TG')],\n",
" ('TA', 'GA'): [('AC', 'AT')],\n",
" ('TC', 'AA'): [('CT', 'AG')],\n",
" ('TG', 'GC'): [('GA', 'CT')],\n",
" ('TG', 'TC'): [('GA', 'CT')],\n",
" ('TT', 'GA'): [('TC', 'AA')]})"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"read_pairs = [('ACC','ATA'),('ACT','ATT'),('ATA','TGA'),('ATT','TGA'),('CAC','GAT'),('CCG','TAC'),('CGA','ACT'),\n",
" ('CTG','AGC'),('CTG','TTC'),('GAA','CTT'),('GAT','CTG'),('GAT','CTG'),('TAC','GAT'),('TCT','AAG'),\n",
" ('TGA','GCT'),('TGA','TCT'),('TTC','GAA')]\n",
"g = debruijn_from_readpairs(read_pairs)\n",
"g"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('AT', 'TG'),\n",
" ('TT', 'GA'),\n",
" ('TC', 'AA'),\n",
" ('CT', 'AG'),\n",
" ('TG', 'GC'),\n",
" ('GA', 'CT'),\n",
" ('AT', 'TG'),\n",
" ('TA', 'GA'),\n",
" ('AC', 'AT'),\n",
" ('CC', 'TA'),\n",
" ('CG', 'AC'),\n",
" ('GA', 'CT'),\n",
" ('CT', 'TT'),\n",
" ('TG', 'TC'),\n",
" ('GA', 'CT'),\n",
" ('AA', 'TT'),\n",
" ('AT', 'TG')]"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"walk = eulerianWalk(g)\n",
"walk"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['AT', 'TT', 'TC', 'CT', 'TG', 'GA', 'AT', 'TA', 'AC', 'CC', 'CG', 'GA', 'CT', 'TG', 'GA', 'AA', 'AT']\n",
"['TG', 'GA', 'AA', 'AG', 'GC', 'CT', 'TG', 'GA', 'AT', 'TA', 'AC', 'CT', 'TT', 'TC', 'CT', 'TT', 'TG']\n"
]
}
],
"source": [
"x = [a for (a,b) in walk]\n",
"y = [b for (a,b) in walk]\n",
"print x\n",
"print y"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'there is no string spelled by the gapped patterns'"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#ANSWER SHOULD BE: CACCGATACTGATTCTGAAGCTT\n",
"stringSpelledByGappedPatterns(x, y, 3, 1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Problem set"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Process the readpairs into list of tuples\n",
"f = open('dataset_204_14.txt', 'r')\n",
"read_pairs = []\n",
"for lines in f:\n",
" lines = lines.strip()\n",
" lines = lines.split('|')\n",
" read_pairs.append((lines[0],lines[1]))"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'CTCTACGAGCATTGGCATAACCGTAGCACCTCAAGCGTATTCTCGCTCGCCGATTGTCAGTGCCGCCAGGAGGCCCTTGACAAACGTACAACTAATGAGGCTTCCTAATCCCTCGTTTGGTAATGAGGAGTATCATGTTGATATCGGCCGCCTGCCGACTAACAGGTACTTTGGCTGCTGTACGAGGTGAAAATAGCGGCTGACATTGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCGATATAAATTGCATTACGCTACTAGGACCGATTGACTCGGGGGGTGCCAGTTGGTGACCCACTACAATCTACGCCCTTGCGACTTTGCCTAACATACTCGTCAGGGCAAAGTGTTGGGTACATGTGTTTCACCACAATGCTCTTGGGTCTTGACAACAGGCAAAGGTCTTTAATGCGTTCCCTTGGCGAGTGTGGCGCGGTAATCGTCGAATCGCATGAGACGAGCAGGGAAAAACTTGGACGTTACATCCGGCAATAGATTCCCACGATCTGCCCTTGCCCATATGCACAGATGATATTACAGCTAGGATCCCCAGGAGCGGCATGTGTATCACTAGTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCAAGGCTTGACCGCACCTCATAGACTCTCGTAACGTAGTAAATCTGTATTAACAGGAGCTATTCAAACTGCTCCTTGGGTGCTTCCTTGCACGAGCCGAAATCGCCTTCTAGTCAATGAAAACTAGGAGAGACTCCTAGCATCTGCGATACCGCCCCTTGATTTACCAACGAGGTAAGATGTCGAATTTCTTTTCTGTGGGCTACCCCGGAGTAGCAAGAGGTGAAGAGTATTCTTTGATCTAGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTCGATTTAGACAGTTAACTGATGGATCTTAAGGGTTAGAGCCATTCCCAGATTCATGTCGGGACTTTCTGTAGGTCACGGCCCCAACGGACAACCGGTCGAACTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCTCTAAATCCAGGTGGCTATTGAGGTAAGCGGAGCTTTGATCTCGGCACGTCGACGGGAACCACATATATCTTCCGCCGGTACCTTCGGTAGCGCTGACCCGGAGGATCGGCGTGGAATCTCAGCTTAAGCCGTGTCTGAGTGGTTAGGCCTTTTAACCTGGCCACTCCCTCGACGGAAATTTTTGGACGCTAGCAAGAGTCAGACTAACGTACCCTAGACAGGGCAAAGTCGGAATCGACTCTGAGGTCTTTGTATCATATTGAGGGTGCAACACAAAGCGTCTTAGTCACTGACAGTTGGTTGCACTGTTTACCCCCAATCTCGTGCGGACTTGGAATTCGTAGGCACAGTTCTTGGCAGCCGGGAAAAAGAGCCACTTTCATCGTCGAAACTCTCGATTGGTCAGCCCCGGTTTGAACCACGCCGACGTCGCCCTCCCAACAGAACACCCCACCCGCTGAGGCAAGTGGACCGTTTGCGCTTCCGCACGCCGCTATCGCGTACGCAGTTGAGAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCACCAATAAAAAGCACCGACTTTATGCGAGTACGAACCAGAGGAACGGGGATGATCGATCGTGTACTGGGGTACGTCAAAAGGAGGTAACGGCCGTTACTGACCTGGTTGCAATGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGGGCGCAGGTGAGCGCGTCCCGTTGTCTCTATGATACCAACAAAGCACCCAACCTCTGGTTTTTTCCCTGTTATCGACCTCGGCATTGGAGTCTTTTAAGTTGAATAGCTACACAAAACCAAAAGAATTATGCCGCCGTAGTGCGGGTGGCCCGCCCGCACCGAAGCGGACGCGTTTTCATGAGTGCGGAGAGGCGAGCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCACTTTTCCCTTCATTGTACCTTTCATCTCCGCATAGCTATTTGAGATCAGTATGTCGTACTGATTTTAAAGATCACCAAAGCATCGGAGGACACATTATGAGTGTCTCAGGAGCTATAGATATGCTAGGCAGGGTCGGTCAGTCTCTGTCCGACGGAGCACAAGGCACCTCACCCCCAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTGTTTCTCGATTGAATAAACCTGGCCCGTCAATTACTGAAGTAGCTTAAACGTTACGACGGCAGTCCTCATCACCATCCTGGCTCGAACTACCGAGCAGCGACGAGTTGCGCTCAACACCTTTTTTTGATACATCCAGATAGTCTACAGAATGCACGGATTGAGAATTCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGTGAGACTGTTCTACACATAAGCTTAAACACCATAAAGTTGCTTGAAGGGTCCTTTCTGCGCGAGTTCGGAAAATATGCGCCCAGGTAGCGTTGGCTTTTCATATAGCGGAGCCCGACTAACTGTTCCACCCCCGTCGAAACGCCATTGAAGCGGACGCAAGTTCATCCCTGCCAAATTGAGACAGGTTTGTACGTCCTAAGAAGGGGTTTGTACCTTTGAGATTGTTTATGATCTATCTTGGCAGCCCGAGCAGAGTATCTTGTTTTAATATACGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCTGGCACACGGTACAATGTGCCAGGAGTTATTCTCATCGATGCTTCGACATTGACTTCGCTCTTTGAGTTAACAACTCGCGGAGGCACCTTCTCAGTTGCATGACTTGATGTCGTCATGAAAATGCTCGGTGGCTTCTCACGGGGACTCACCCAACTAAAACCCCGACGAACGGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGACCGCCAGCGACAAATATAATTACGGATGTCGGGGGCTGCTTCGCTGTTAAACGATGAAGAATCTGAGATTTGACAAGGCAAGTCCATAATCGTATGGTTCCTCCTGAGGCGGGGATCATCAGGGACTTTGGTCCTGCCTTAGGGACGTATTAAATTAAATTATCTCGGCCGCGGCTCAGGTTCATTAAAAAAGCGGGGCTTGCTATTGTCCCCCAACCCTTCGTGTTGAGAAACGAGAGAATAGGGACGAGCGCACGACGACACATAGAGATTGTGTGCCACCAGTGCTCCGAGTACCCCTTATTCGTCTCACTTTGAGTTTAATCCTACTATGTTTGGTACAACAGATGTCATAGGTTACGTTGAACCCGATAGCTAGTTCCAGCACGAACCCGCTAAATGAGACACCCAAGAATCGATCCCCTAGTTCGCGCCATGTAGGCTTTAGTAAGTCGAGGTCATACTTTACTAAAAAAAATTTTAGGGTACTACTTCGAGCATGCCCCTTGTCCCTTACCCTTATACGTCAGGATCTTGATTGGTCAGTGGGCCACAAGGATGGCTAATCTTAAGAGCGTCGCTCTATTTCCGGAACGGCAACAAATTCAAACCGGTACGCCTTGTCTACATATCTACAAGCGATAAGGAATAGAAGCCGGTTCAGTGTGTCCAGTTCCCCACAGCGTCATCCGCTTGAGTCGGCGCAAGCCCGGGTCACAACTATGAGAAATCCGATATAACCTCTCATTTGTATACCTCAATCAATACGGCCGAATTTGTTTTGGCTCCCGAAACGCCACACACACGCGTCACCCAGCCAATTACTCCGTTGGGGGTAGATCGGGGCGTGATACCAGTAAAGTTCAACTGCAAGGGTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGAGCCAATCTAAGGTGGGTTAGGTGAGCCCATTTGATGCCTCGCAAAGGATTATTTGCAAGATTGATCCTCGGGTTAACTTGTAATCTCACCGGTTGAAAGTAAACCTAGATACGCAGTATCACTTACTCCTCTACCGTTCCTAGAGGTAAGTTTTTGAACCAGTAACAACAATTAGACAATCACATAAAATTCTATCAATTTCTTAACGCTTAGATCCTTCTTCTACCTTATAACTGTTCCCACGCTACCGTGCTTGTTCGGTCGGCCACGCGAAACCTCTTGCGGCTAACCAGTATATCCCGGGCTTCGAGCGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGCATTTAGCGCGCGTTACCGCTCCACTTCACTCTCCGTGGGGATCTCTCATGAGCTGGCGGCTCCCAGATTGCCCTCCACAGTCAGAATGGGTGCGCAGAGCATGACGACAAAACTCGAACTGCTACACCTCTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGGATTTTCCCCTCTTTTGGTCATGCTGCCGCCATAATATAAGCCCAGTGCGATCTCATTTGCCATTTCGTTTTTCGCTTAGCAACCTATCTGAGTGCTCGTGTAGCAAATGACGCCGAATGTTATGAGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCGGTTATCGTCTCGAGGATGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGTCGGTCGATTATTGACTGACAGTTGATTCCCTGCATATGGCGGGTATAGGACACCCTCAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGCCATACCGTATAACGGGTGGTAGAGAGCATCTCTTCTATGCTTGGGGCCGATTAAGGTAGACTGAGGAGTCAATCCGATCCTAGTATATAAGGGCGAGCTCGGCTACTGTGACGACCTAGTTACATAATACTCGCTCTTCTATTATTCAGTAACTTCATTATAGTTCAGACTTGTTGCACGCTTGCTATCTACGAAAAGTAGCCTCCTATCGCTAGGAGCACGTTTCCAGGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAAGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGCCTGTACATCTGATGGCATTGTCTGAGTCCCGCCGAAACACTTCTGAAAAGGTTGCCGCTATCTTTAATTCCGTACCCGACGCTATTGTGTGAAATTGGATTTCAAAAGATAAACTGTCGCTCAACTGAACTTTCTTGAACTCGGTACCGGCGTACTATAGTAGGACTTTCTAGGCTGATACGTGACCGACCTATGCAATTACTATAAGTTCGCTGACGCGTCCGCCCGTGACCTTATCTGTGCCTGCCACGGTCGTCTGATGTGAGCGTTGTGGAAGGGAGAAATCGGGCACTCCTGTAACCCAAGTAAAGCCTTGTTATTTGGAGCAGAGAGACGGAATTCCGAACACCTGCTAAAGGAACCTGGGCAGTTTACGCGATTCTAACTTGACTGGAGATTGCTCGGGACTCCACGAACCAAGTGTCACTCTCGCAGTGTTGCCTCGGCATGATGATGCGTATCGTCCCTTTTTTCTCCCGAAACCGCCGTTACCGCAACCCGCCAACCTTCGGAGTAGGGTCAACGCATTCGATACGTGGATAGAACTCATACTAATGTTTAGCGCCGCAATTGGCGGGTTCTCATGAATCAGCTCGCAATGTTGAACCTTTTTGCTTCTTGACATGTTTTATCCGCTGAGTAGAAGATCGCAATCCTAAAGAGACAGATGACATTCCGAGAATTTAGCGATAGCAGTGTCGTCGCCGGGAGAAGTTACACTAGGTTACGCTTCGACCACCTCCGGGCAGACGCGTTGG'"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"sys.setrecursionlimit(10000) #To avoid maximum recursion depth exceeded\n",
"g = debruijn_from_readpairs(read_pairs) # Constructs prefix and suffix for each pair in the read. De bruijnize the read pairs.\n",
"walk = eulerianWalk(g)\n",
"firstPatterns = [a for (a,b) in walk]\n",
"secondPatterns = [b for (a,b) in walk]\n",
"stringSpelledByGappedPatterns(firstPatterns, secondPatterns, 50, 200) #Constructs the string from paired eulerian walk."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}