{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Read input data\n", "#kmers = [line.strip() for line in open('dataset_205_5.txt', 'r')]\n", "kmers = ['ATG','ATG','TGT','TGG','CAT','GGA','GAT','AGA']" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Construct a dictionary of edges\n", "from collections import defaultdict\n", "def debruijn_from_kmer(kmers):\n", " '''Input: A collection of k-mers Patterns.\n", " Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''\n", " edges = defaultdict(list)\n", " for kmer in kmers:\n", " edges[kmer[:-1]].append(kmer[1:])\n", " return edges" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "defaultdict(list,\n", " {'AG': ['GA'],\n", " 'AT': ['TG', 'TG'],\n", " 'CA': ['AT'],\n", " 'GA': ['AT'],\n", " 'GG': ['GA'],\n", " 'TG': ['GT', 'GG']})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = debruijn_from_kmer(kmers)\n", "g" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def classify_nodes(g):\n", " \"\"\"Returns balanced and unbalanced nodes as separate lists from a graph of edges\"\"\"\n", " balanced, unbalanced = [], []\n", " out = reduce(lambda a,b: a+b, g.values())\n", " for node in set(out + g.keys()):\n", " indegrees = out.count(node)\n", " if node in g:\n", " outdegrees = len(g[node])\n", " else:\n", " outdegrees = 0\n", " \n", " if indegrees == outdegrees == 1:\n", " balanced.append(node)\n", " else:\n", " unbalanced.append(node)\n", " return balanced, unbalanced" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "balanced ['GG']\n", "unbalanced ['GT', 'AG', 'CA', 'AT', 'GA', 'TG']\n" ] } ], "source": [ "balanced, unbalanced = classify_nodes(g)\n", "print 'balanced', balanced\n", "print 'unbalanced', unbalanced" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def maximalNonBranchingPaths(g):\n", " \"\"\" Input: The adjacency list of a graph whose nodes are integers.\n", " Output: The collection of all maximal nonbranching paths in this graph.\"\"\"\n", " balanced, unbalanced = classify_nodes(g)\n", " paths = []\n", " for node in g:\n", " if node in unbalanced:\n", " if len(g[node]) > 0:\n", " while len(g[node]) > 0:\n", " w = g[node].pop()\n", " nonbranchingPath = [node,w]\n", " while w in balanced:\n", " w = g[w].pop()\n", " nonbranchingPath.append(w)\n", " paths.append(nonbranchingPath)\n", " #Find isolated cycles and add to paths\n", " for node in g:\n", " if len(g[node]) > 0:\n", " if node in balanced:\n", " cycle = [node]\n", " w = g[node].pop()\n", " while w in balanced:\n", " cycle.append(w)\n", " if cycle[0] == cycle[-1]: \n", " break\n", " w = g[w].pop()\n", " paths.append(cycle)\n", " return paths" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['AG', 'GA'],\n", " ['CA', 'AT'],\n", " ['AT', 'TG'],\n", " ['AT', 'TG'],\n", " ['GA', 'AT'],\n", " ['TG', 'GG', 'GA'],\n", " ['TG', 'GT']]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "paths = maximalNonBranchingPaths(g)\n", "paths" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "###String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.\n", "def string_from_genome_path(kmers):\n", " return kmers[0] + ''.join(map(lambda x: x[-1], kmers[1:])) " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['AGA', 'ATG', 'ATG', 'CAT', 'GAT', 'TGGA', 'TGT']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_contigs = [string_from_genome_path(contig) for contig in paths]\n", "contigs = sorted(get_contigs)\n", "contigs" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AGA\n", "ATG\n", "ATG\n", "CAT\n", "GAT\n", "TGGA\n", "TGT\n" ] } ], "source": [ "print '\\n'.join(contigs)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAACAATGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATAC\n", "AAAATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGAC\n", "AAAATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGAC\n", "AAACCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGT\n", "AAACCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGT\n", "AAACTATTTGTAATTTTGCAGTCGTCTATTCGCCAGGGGTGCTGCCAAGACTCAGTTGGATAGTT\n", "AAACTATTTGTAATTTTGCAGTCGTCTATTCGCCAGGGGTGCTGCCAAGACTCAGTTGGATAGTT\n", "AAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCTAGT\n", "AAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCTAGT\n", "AAATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGACA\n", "AAATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGACA\n", "AACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCAGGATGTCTCGGGCTGTTCCTTCTAAATGCTCTGGCCACCCAGACTGACTTAGCCACCCCAATCTCA\n", "AACATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCACT\n", "AACATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCACT\n", "AACCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGTC\n", "AACCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGTC\n", "AACGGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGT\n", "AACGGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGT\n", "AACTATTTGTAATTTTGCAGTCGTCTATTCGCCAGGGGTGCTGCCAAGACTCAGTTGGATAGTTG\n", "AACTATTTGTAATTTTGCAGTCGTCTATTCGCCAGGGGTGCTGCCAAGACTCAGTTGGATAGTTG\n", "AAGAAACTTAGATAGACCCGCGCATTTGAGCCTTTAATTAGTAAGGCAAAACATTAGCCGAGAAC\n", "AAGAAACTTAGATAGACCCGCGCATTTGAGCCTTTAATTAGTAAGGCAAAACATTAGCCGAGAAC\n", "AAGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCTTGT\n", "AAGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCTTGT\n", "AAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAGATAACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCA\n", "AATGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATACGA\n", "AATGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATACGA\n", "AATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCTAGTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGT\n", "AATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGACAA\n", "AATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGACAA\n", "AATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTACAT\n", "AATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTACAT\n", "ACAGAAATATGTTGGGGTGAGACGGGAACTTCACTCCTGAACGCTGGTTCAGCCCGAGCCAACCA\n", "ACAGAAATATGTTGGGGTGAGACGGGAACTTCACTCCTGAACGCTGGTTCAGCCCGAGCCAACCA\n", "ACAGCGCCAGCCTTTAAGCAATTACTTTCACGTTATACATCCGGCAGGGTGTACCGAAGTTTCCA\n", "ACAGCGCCAGCCTTTAAGCAATTACTTTCACGTTATACATCCGGCAGGGTGTACCGAAGTTTCCA\n", "ACATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCACTC\n", "ACATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCACTC\n", "ACATCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAA\n", "ACATCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAA\n", "ACCAAGATAGCCGCGCATTAGCGTGTAACAGCACTCCAACCCGCACTGCGATCTAGTTTAAAAGT\n", "ACCAAGATAGCCGCGCATTAGCGTGTAACAGCACTCCAACCCGCACTGCGATCTAGTTTAAAAGT\n", "ACCAGAAACAACCATCATTCCAGCGCCTAGAACTGACCGTAGACCTGCCGTAACCAGTAACTACT\n", "ACCAGAAACAACCATCATTCCAGCGCCTAGAACTGACCGTAGACCTGCCGTAACCAGTAACTACT\n", "ACCCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACTT\n", "ACCCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACTT\n", "ACCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGTCA\n", "ACCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGTCA\n", "ACCGTAGGGCGACAAAACCTCTTTAACCGTTACTTCCCATTGAGGGGTGTCGGTCGAATGGGGTT\n", "ACCGTAGGGCGACAAAACCTCTTTAACCGTTACTTCCCATTGAGGGGTGTCGGTCGAATGGGGTT\n", "ACGGATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACATC\n", "ACGGATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACATC\n", "ACGGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGTG\n", "ACGGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGTG\n", "ACGTCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCA\n", "ACGTCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCA\n", "ACGTGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAG\n", "ACGTGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAG\n", "ACTATTTGTAATTTTGCAGTCGTCTATTCGCCAGGGGTGCTGCCAAGACTCAGTTGGATAGTTGGGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACG\n", "ACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGACATGGATAGTTTGGAATGGAGTGCGAACGTTCCCATCTAGGGCAGATTAATATGGAGTAACACTG\n", "ACTGTTTCGTATTTAACGAGAGGTCACGTGCGAAGCGCTTCGCGATATTAATGGCTGGCAACTAG\n", "ACTGTTTCGTATTTAACGAGAGGTCACGTGCGAAGCGCTTCGCGATATTAATGGCTGGCAACTAG\n", "ACTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAGAAA\n", "ACTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAGAAA\n", "AGAAACTTAGATAGACCCGCGCATTTGAGCCTTTAATTAGTAAGGCAAAACATTAGCCGAGAACG\n", "AGAAACTTAGATAGACCCGCGCATTTGAGCCTTTAATTAGTAAGGCAAAACATTAGCCGAGAACG\n", "AGAATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTAC\n", "AGAATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTAC\n", "AGACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGACAT\n", "AGACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGACAT\n", "AGATAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAG\n", "AGATAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAG\n", "AGATATGCGACAGGACAGCTAGCCATCCGAACTCAAAATATAGTTGACTGATTGCATCTTGATAA\n", "AGATATGCGACAGGACAGCTAGCCATCCGAACTCAAAATATAGTTGACTGATTGCATCTTGATAA\n", "AGCGCCAGCCTTTAAGCAATTACTTTCACGTTATACATCCGGCAGGGTGTACCGAAGTTTCCAGAATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTA\n", "AGCGGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATTT\n", "AGCGGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATTT\n", "AGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCTTGTTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGT\n", "AGGAAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCT\n", "AGGAAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCT\n", "AGGGTGTGCTTCATAACATAAGCCTACTTGAAGGGGCCCGACCCGGCCGGTTAGTCTCTCCGGGGATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGG\n", "AGTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTTCT\n", "AGTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTTCT\n", "AGTGATGGCGGTCAGCTATTCTTCCCTGCATGAGAGAGGATCCTTTCGAAGAGTTTCCAGCTTAT\n", "AGTGATGGCGGTCAGCTATTCTTCCCTGCATGAGAGAGGATCCTTTCGAAGAGTTTCCAGCTTAT\n", "ATAACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCAGG\n", "ATAACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCAGG\n", "ATAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAGAT\n", "ATAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAGAT\n", "ATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCACTCAAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAA\n", "ATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGGATT\n", "ATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGGATT\n", "ATCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAACG\n", "ATCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAACG\n", "ATGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATACGAG\n", "ATGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATACGAG\n", "ATGGATAGTTTGGAATGGAGTGCGAACGTTCCCATCTAGGGCAGATTAATATGGAGTAACACTGC\n", "ATGGATAGTTTGGAATGGAGTGCGAACGTTCCCATCTAGGGCAGATTAATATGGAGTAACACTGC\n", "ATGGTCTCTCTCAAAACCTTACAGTTTCGCGTCCACCCTTCTCTAATGGGTCCAACCTTTTTCAC\n", "ATGGTCTCTCTCAAAACCTTACAGTTTCGCGTCCACCCTTCTCTAATGGGTCCAACCTTTTTCAC\n", "ATGTCACGCTCGATCGTAGCAGGAGCCGCTGCTCTCTTTTGTAGGTAACCGCGACAGTATCGAAA\n", "ATGTCACGCTCGATCGTAGCAGGAGCCGCTGCTCTCTTTTGTAGGTAACCGCGACAGTATCGAAA\n", "ATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGACAAGAAACTTAGATAGACCCGCGCATTTGAGCCTTTAATTAGTAAGGCAAAACATTAGCCGAGAA\n", "ATGTCTCGGGCTGTTCCTTCTAAATGCTCTGGCCACCCAGACTGACTTAGCCACCCCAATCTCAT\n", "ATGTCTCGGGCTGTTCCTTCTAAATGCTCTGGCCACCCAGACTGACTTAGCCACCCCAATCTCAT\n", "ATGTCTTCGGTTTAAACGTCGGGCTATACACCGGTTGCGAAAGTGATGACCCCCGAGTCTCTAAA\n", "ATGTCTTCGGTTTAAACGTCGGGCTATACACCGGTTGCGAAAGTGATGACCCCCGAGTCTCTAAA\n", "ATGTTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAA\n", "ATGTTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAA\n", "ATTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCGTT\n", "ATTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCGTT\n", "ATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTACATCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCA\n", "CAAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAACAA\n", "CAAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAACAA\n", "CAACATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCAC\n", "CAACATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCAC\n", "CAATGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATACG\n", "CAATGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATACG\n", "CACCAGAAACAACCATCATTCCAGCGCCTAGAACTGACCGTAGACCTGCCGTAACCAGTAACTAC\n", "CACCAGAAACAACCATCATTCCAGCGCCTAGAACTGACCGTAGACCTGCCGTAACCAGTAACTAC\n", "CACCCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACT\n", "CACCCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACT\n", "CAGAAATATGTTGGGGTGAGACGGGAACTTCACTCCTGAACGCTGGTTCAGCCCGAGCCAACCACACCAGAAACAACCATCATTCCAGCGCCTAGAACTGACCGTAGACCTGCCGTAACCAGTAACTA\n", "CAGCGCCAGCCTTTAAGCAATTACTTTCACGTTATACATCCGGCAGGGTGTACCGAAGTTTCCAG\n", "CAGCGCCAGCCTTTAAGCAATTACTTTCACGTTATACATCCGGCAGGGTGTACCGAAGTTTCCAG\n", "CAGGGTGTGCTTCATAACATAAGCCTACTTGAAGGGGCCCGACCCGGCCGGTTAGTCTCTCCGGG\n", "CAGGGTGTGCTTCATAACATAAGCCTACTTGAAGGGGCCCGACCCGGCCGGTTAGTCTCTCCGGG\n", "CATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCACTCA\n", "CATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCACTCA\n", "CATCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAAC\n", "CATCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAAC\n", "CATGGTCTCTCTCAAAACCTTACAGTTTCGCGTCCACCCTTCTCTAATGGGTCCAACCTTTTTCA\n", "CATGGTCTCTCTCAAAACCTTACAGTTTCGCGTCCACCCTTCTCTAATGGGTCCAACCTTTTTCA\n", "CCAAGATAGCCGCGCATTAGCGTGTAACAGCACTCCAACCCGCACTGCGATCTAGTTTAAAAGTTACGCGGCTCAGAGCAGGGAACCTGCCCCGTTGAAACTTCATCACGGGCCGTGAGTTGGCCATGCT\n", "CCAGAAACAACCATCATTCCAGCGCCTAGAACTGACCGTAGACCTGCCGTAACCAGTAACTACTCGGAGGTAGAGTATCGCCGAACGAGAGCTAGGTCCGGACCGGTCTGTTGAGTTACATAGCCTCC\n", "CCCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACTTA\n", "CCCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACTTA\n", "CCGCTGTGCAGCTCTGACAAGGCTCCACTTACGCTCTGCGGACTAGGGTGCCAAATGGATCGTTT\n", "CCGCTGTGCAGCTCTGACAAGGCTCCACTTACGCTCTGCGGACTAGGGTGCCAAATGGATCGTTT\n", "CCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGTCAT\n", "CCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGTCAT\n", "CCGTAGGGCGACAAAACCTCTTTAACCGTTACTTCCCATTGAGGGGTGTCGGTCGAATGGGGTTCCGCTGTGCAGCTCTGACAAGGCTCCACTTACGCTCTGCGGACTAGGGTGCCAAATGGATCGT\n", "CCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACTTAA\n", "CCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACTTAA\n", "CGACGGCAGACGTTGGCGCCGCGCCTGACTGATTATAGGAATTTTAATCACTGTTATTTGTGAGATAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAA\n", "CGACTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAGA\n", "CGACTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAGA\n", "CGAGACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGAC\n", "CGAGACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGAC\n", "CGCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGGGCG\n", "CGCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGGGCG\n", "CGCTGTGCAGCTCTGACAAGGCTCCACTTACGCTCTGCGGACTAGGGTGCCAAATGGATCGTTTCGCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGG\n", "CGGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATTTAA\n", "CGGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATTTAA\n", "CGGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTCAAC\n", "CGGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTCAAC\n", "CGGAGGTAGAGTATCGCCGAACGAGAGCTAGGTCCGGACCGGTCTGTTGAGTTACATAGCCTCCA\n", "CGGAGGTAGAGTATCGCCGAACGAGAGCTAGGTCCGGACCGGTCTGTTGAGTTACATAGCCTCCA\n", "CGGATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACATCA\n", "CGGATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACATCA\n", "CGGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGTGC\n", "CGGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGTGC\n", "CGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCGTCATGGTCTCTCTCAAAACCTTACAGTTTCGCGTCCACCCTTCTCTAATGGGTCCAACCTTTTTC\n", "CGTAACACGCGCTTACCGAACTATGAACTGTCCGCCTTGGTCGCTTGCCTTGTTTTGCCCGCCCT\n", "CGTAACACGCGCTTACCGAACTATGAACTGTCCGCCTTGGTCGCTTGCCTTGTTTTGCCCGCCCT\n", "CGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGACTTAAGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCT\n", "CGTCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCAC\n", "CGTCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCAC\n", "CGTGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAGG\n", "CGTGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAGG\n", "CTAGTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTT\n", "CTAGTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTT\n", "CTCAAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAAC\n", "CTCAAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAAC\n", "CTGTTTCGTATTTAACGAGAGGTCACGTGCGAAGCGCTTCGCGATATTAATGGCTGGCAACTAGT\n", "CTGTTTCGTATTTAACGAGAGGTCACGTGCGAAGCGCTTCGCGATATTAATGGCTGGCAACTAGT\n", "CTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCACCCGTCTCCTCTGGTGGAGATCCTGGGCGTAAGCTCAACGATTCAAATAAACCCCAGGTGAC\n", "CTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAGAAAATGTCGACTCACATCTTCCTCAGCTTAGGCTTCATTACGATACGCGTCCGATCGCTAGCGA\n", "CTTGCGTATTCTCTACCTTTTTCTAGACGTTGACTATATGTAGTATTTGTTGTTACGACGTGGCG\n", "CTTGCGTATTCTCTACCTTTTTCTAGACGTTGACTATATGTAGTATTTGTTGTTACGACGTGGCG\n", "CTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAACGGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAG\n", "GAAACTTAGATAGACCCGCGCATTTGAGCCTTTAATTAGTAAGGCAAAACATTAGCCGAGAACGTGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGA\n", "GAAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCTAG\n", "GAAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCTAG\n", "GAATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTACA\n", "GAATTCCAAAATTGATGCAGTGTAGGGATTTATACGCCCGTAAAACCAACGGTACCCAGTGTACA\n", "GACGGTAATATTCAGAACTTGCCGGTCTACACACGCAGACAACACGGTACTGCTAATTCACCTCGACTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAG\n", "GACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGACATG\n", "GACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGACATG\n", "GACTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAGAA\n", "GACTTCACGGCCATAAACATGCGTACGCACGGGCCTCCGCAACATATAGATCATGCAACGCAGAA\n", "GAGACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGACA\n", "GAGACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGACA\n", "GAGCAAGAGCATATGTCTCACATGTCGACATGAGGGCGAGGTTAATAGCCCCAGTAGATGCTACAACTGTTTCGTATTTAACGAGAGGTCACGTGCGAAGCGCTTCGCGATATTAATGGCTGGCAACTA\n", "GATAACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCAG\n", "GATAACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCAG\n", "GATAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAGA\n", "GATAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAGA\n", "GATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACATCATGGGATCGACTTGCTGACTCCCCGTACTTTAGGATGCTAGACGACACCCAACAGCGCGGCCGAA\n", "GATATGCGACAGGACAGCTAGCCATCCGAACTCAAAATATAGTTGACTGATTGCATCTTGATAACGGATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACAT\n", "GATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGGAT\n", "GATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGGAT\n", "GATGTCTTCGGTTTAAACGTCGGGCTATACACCGGTTGCGAAAGTGATGACCCCCGAGTCTCTAA\n", "GATGTCTTCGGTTTAAACGTCGGGCTATACACCGGTTGCGAAAGTGATGACCCCCGAGTCTCTAA\n", "GATTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCGT\n", "GATTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCGT\n", "GCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGGGCGCGACGGCAGACGTTGGCGCCGCGCCTGACTGATTATAGGAATTTTAATCACTGTTATTTGTGA\n", "GCGACGGCAGACGTTGGCGCCGCGCCTGACTGATTATAGGAATTTTAATCACTGTTATTTGTGAG\n", "GCGACGGCAGACGTTGGCGCCGCGCCTGACTGATTATAGGAATTTTAATCACTGTTATTTGTGAG\n", "GCGGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATTTA\n", "GCGGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATTTA\n", "GCGGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTCAA\n", "GCGGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTCAA\n", "GCGTAACACGCGCTTACCGAACTATGAACTGTCCGCCTTGGTCGCTTGCCTTGTTTTGCCCGCCC\n", "GCGTAACACGCGCTTACCGAACTATGAACTGTCCGCCTTGGTCGCTTGCCTTGTTTTGCCCGCCC\n", "GGAAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCTA\n", "GGAAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTCTA\n", "GGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATTTAAACTATTTGTAATTTTGCAGTCGTCTATTCGCCAGGGGTGCTGCCAAGACTCAGTTGGATAGT\n", "GGACATCAGGAGTTTTTGTCTCGCACGTGTTCACACCCGCTGATAGAGGATTACCAAGGAAGCCATGACGGTAATATTCAGAACTTGCCGGTCTACACACGCAGACAACACGGTACTGCTAATTCACCT\n", "GGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTCAACATAGCGAGAACCGCTATACTGTAGTTTAACGGTAAGCACTCACCCCCAGTAATAAATGCA\n", "GGAGGTAGAGTATCGCCGAACGAGAGCTAGGTCCGGACCGGTCTGTTGAGTTACATAGCCTCCAAAGATATGCGACAGGACAGCTAGCCATCCGAACTCAAAATATAGTTGACTGATTGCATCTTGATA\n", "GGATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACATCAT\n", "GGATACGAACACGGATACTTCATCCGTACTCTCGTTATAGCACATACACGTGATTAGAACATCAT\n", "GGATAGTTTGGAATGGAGTGCGAACGTTCCCATCTAGGGCAGATTAATATGGAGTAACACTGCGCGTAACACGCGCTTACCGAACTATGAACTGTCCGCCTTGGTCGCTTGCCTTGTTTTGCCCGCC\n", "GGATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGGA\n", "GGATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGGA\n", "GGATTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCG\n", "GGATTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCG\n", "GGGATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGG\n", "GGGATCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGG\n", "GGGATCGACTTGCTGACTCCCCGTACTTTAGGATGCTAGACGACACCCAACAGCGCGGCCGAAGGAAATGGGTTCCGTTACGCGGACCTCCCGGTACCGATGATTCGTGTAGAGACTGCCAGGCTC\n", "GGGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGTTA\n", "GGGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGTTA\n", "GGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGTGCG\n", "GGTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGTGCG\n", "GGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGTTAC\n", "GGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGTTAC\n", "GTAACACGCGCTTACCGAACTATGAACTGTCCGCCTTGGTCGCTTGCCTTGTTTTGCCCGCCCTACCAAGATAGCCGCGCATTAGCGTGTAACAGCACTCCAACCCGCACTGCGATCTAGTTTAAAAG\n", "GTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTTCTT\n", "GTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTTCTT\n", "GTCACGCTCGATCGTAGCAGGAGCCGCTGCTCTCTTTTGTAGGTAACCGCGACAGTATCGAAAGCGGAATTTGACAATCACTATAAGGAACGGAACAACGGGGACGTTATGAGTCTGCGCCCCATT\n", "GTCCACGGACTTCCGTCTGCGGTATGCAGGGTTCAGAAGGAAGCATACTGCATAACCGAGTGCGGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTC\n", "GTCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCACC\n", "GTCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCACC\n", "GTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGTTACCGTAGGGCGACAAAACCTCTTTAACCGTTACTTCCCATTGAGGGGTGTCGGTCGAATGGGG\n", "GTGATGGCGGTCAGCTATTCTTCCCTGCATGAGAGAGGATCCTTTCGAAGAGTTTCCAGCTTATTGGACATCAGGAGTTTTTGTCTCGCACGTGTTCACACCCGCTGATAGAGGATTACCAAGGAAGC\n", "GTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAGGATGTCTTCGGTTTAAACGTCGGGCTATACACCGGTTGCGAAAGTGATGACCCCCGAGTCTCTA\n", "GTGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAGGA\n", "GTGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAGGA\n", "GTTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTAAT\n", "GTTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTAAT\n", "GTTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAAGA\n", "GTTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAAGA\n", "TAACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCAGGA\n", "TAACAATTCCTCCTTAACACCCATAGTATGCTCATAGGCGCACCCGTCAGCAACCATGACCAGGA\n", "TAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTAATGTCACGCTCGATCGTAGCAGGAGCCGCTGCTCTCTTTTGTAGGTAACCGCGACAGTATCGAA\n", "TAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCGTTTTATGCAGTAAGATCCGTTTTGTACGGGCTCTGCGAAGATGATTCGGAGCGCTGAACCTTGGA\n", "TAAGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCTTG\n", "TAAGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCTTG\n", "TAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAGATA\n", "TAAGGCTCCTACCAAGACTGAGTCTTAAAGAAGTGTCGAGCATCTAGGATAGTTGGGTAAAGATA\n", "TACCGTAGGGCGACAAAACCTCTTTAACCGTTACTTCCCATTGAGGGGTGTCGGTCGAATGGGGT\n", "TACCGTAGGGCGACAAAACCTCTTTAACCGTTACTTCCCATTGAGGGGTGTCGGTCGAATGGGGT\n", "TAGTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTTC\n", "TAGTATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTTC\n", "TAGTGATGGCGGTCAGCTATTCTTCCCTGCATGAGAGAGGATCCTTTCGAAGAGTTTCCAGCTTA\n", "TAGTGATGGCGGTCAGCTATTCTTCCCTGCATGAGAGAGGATCCTTTCGAAGAGTTTCCAGCTTA\n", "TATCTTTCACAAAGCACGCCAAATCAGGAATGGTGTAACAAGATGAAACTATGAGTGTGTTCTTGCGTATTCTCTACCTTTTTCTAGACGTTGACTATATGTAGTATTTGTTGTTACGACGTGGC\n", "TCAAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAACA\n", "TCAAAAATACGTAGCTAGGCTTACGACGCGTAACAAAACAGGGCCTAGTCGTACAAGATTCAACA\n", "TCCGCTGTGCAGCTCTGACAAGGCTCCACTTACGCTCTGCGGACTAGGGTGCCAAATGGATCGTT\n", "TCCGCTGTGCAGCTCTGACAAGGCTCCACTTACGCTCTGCGGACTAGGGTGCCAAATGGATCGTT\n", "TCCTTGCCTAAAGGCGAGGCGTGACCAAGGTGCACACTAGGCACATGCACATTTTTAGGGGATTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACC\n", "TCGCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGGGC\n", "TCGCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGGGC\n", "TCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCACCC\n", "TCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTCACCC\n", "TCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAACGG\n", "TCTTTGGCATAACAACAACCACATGTACACGGTCATAGCTTAGAATGCTTTTCAGAGCCCAACGG\n", "TGACGGTAATATTCAGAACTTGCCGGTCTACACACGCAGACAACACGGTACTGCTAATTCACCTC\n", "TGACGGTAATATTCAGAACTTGCCGGTCTACACACGCAGACAACACGGTACTGCTAATTCACCTC\n", "TGCGGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTCA\n", "TGCGGACTATTCGAGATCGAGTTGATGTCCCTAATTTCAGGACTAGGTATAATCGCAATACGTCA\n", "TGCGTATTCTCTACCTTTTTCTAGACGTTGACTATATGTAGTATTTGTTGTTACGACGTGGCGACAGCGCCAGCCTTTAAGCAATTACTTTCACGTTATACATCCGGCAGGGTGTACCGAAGTTTCC\n", "TGCTAAGTGGTCGTTCGTTTCTGAACTATCGTCAATCTGAAGAGGGATACGGAGCCTATACGAGACTGATGCATCTATCGGTTGACACAATTTCCCATGCCATGACTATCCGATGGATGATGGA\n", "TGGACATCAGGAGTTTTTGTCTCGCACGTGTTCACACCCGCTGATAGAGGATTACCAAGGAAGCC\n", "TGGACATCAGGAGTTTTTGTCTCGCACGTGTTCACACCCGCTGATAGAGGATTACCAAGGAAGCC\n", "TGGATAGTTTGGAATGGAGTGCGAACGTTCCCATCTAGGGCAGATTAATATGGAGTAACACTGCG\n", "TGGATAGTTTGGAATGGAGTGCGAACGTTCCCATCTAGGGCAGATTAATATGGAGTAACACTGCG\n", "TGGGATCGACTTGCTGACTCCCCGTACTTTAGGATGCTAGACGACACCCAACAGCGCGGCCGAAG\n", "TGGGATCGACTTGCTGACTCCCCGTACTTTAGGATGCTAGACGACACCCAACAGCGCGGCCGAAG\n", "TGGGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGTT\n", "TGGGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGTT\n", "TGGTCTCTCTCAAAACCTTACAGTTTCGCGTCCACCCTTCTCTAATGGGTCCAACCTTTTTCACCAGGGTGTGCTTCATAACATAAGCCTACTTGAAGGGGCCCGACCCGGCCGGTTAGTCTCTCCGG\n", "TGTCACGCTCGATCGTAGCAGGAGCCGCTGCTCTCTTTTGTAGGTAACCGCGACAGTATCGAAAG\n", "TGTCACGCTCGATCGTAGCAGGAGCCGCTGCTCTCTTTTGTAGGTAACCGCGACAGTATCGAAAG\n", "TGTCTCGGGCTGTTCCTTCTAAATGCTCTGGCCACCCAGACTGACTTAGCCACCCCAATCTCATGTTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAA\n", "TGTCTTCGGTTTAAACGTCGGGCTATACACCGGTTGCGAAAGTGATGACCCCCGAGTCTCTAAACCGGTTGGAGGTCCAGGTAGCCCATCGTCTCCATTCGGGTGAAGAAGTACCACCAACTTCG\n", "TGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAGGAT\n", "TGTGCGCAGTGTCAGCGGTTCTTTACAGCGAAGTCCCTCAGCGTTAATCTGGTAGTCACGAGGAT\n", "TGTTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTAA\n", "TGTTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTAA\n", "TGTTTCGTATTTAACGAGAGGTCACGTGCGAAGCGCTTCGCGATATTAATGGCTGGCAACTAGTAGTGATGGCGGTCAGCTATTCTTCCCTGCATGAGAGAGGATCCTTTCGAAGAGTTTCCAGCTT\n", "TGTTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAAG\n", "TGTTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAAG\n", "TTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTAATG\n", "TTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTAATG\n", "TTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCGTTT\n", "TTAACGCCGGTTGAGGTAGTGTAGGAGTGGACCATTAATACTTATAGGACTGTCACCAACCGTTT\n", "TTAAGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCTT\n", "TTAAGCTAACCCCTATACACGTATGGCACCGTAGGCGTGCGTAGTTAAATACGAATAGGCCCCTT\n", "TTATGCAGTAAGATCCGTTTTGTACGGGCTCTGCGAAGATGATTCGGAGCGCTGAACCTTGGACGTCTTAAAACATATCCGAATGAGTCAATCAGGTTGAGTTCAGTATGGCTTTCGTGGACACTC\n", "TTCGCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGGG\n", "TTCGCCTAAATTGCTCAAGAATCTTCGTGCATAAAATTACCGAAAGCACCATAAACTCCAATGGG\n", "TTGCGTATTCTCTACCTTTTTCTAGACGTTGACTATATGTAGTATTTGTTGTTACGACGTGGCGA\n", "TTGCGTATTCTCTACCTTTTTCTAGACGTTGACTATATGTAGTATTTGTTGTTACGACGTGGCGA\n", "TTGGGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGT\n", "TTGGGTGATGCGCTGCGAGGCGCGCCTTCCGGCGACTCGGCGGAGGGTCGCTCTCCACAAGACGT\n", "TTGTTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTA\n", "TTGTTAACCATCATCGTGTGTTAGCCTATAATCATTTCACCTGATCACACACAAACCGGACTGTA\n", "TTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAAGACAGAAATATGTTGGGGTGAGACGGGAACTTCACTCCTGAACGCTGGTTCAGCCCGAGCCAACC\n", "TTTATGCAGTAAGATCCGTTTTGTACGGGCTCTGCGAAGATGATTCGGAGCGCTGAACCTTGGAC\n", "TTTATGCAGTAAGATCCGTTTTGTACGGGCTCTGCGAAGATGATTCGGAGCGCTGAACCTTGGAC\n", "TTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAAGAC\n", "TTTTACGGTCCCTGGACCGGTATTTAAAGTGCAAAAACAGAAGATAGGCTTTCGACAATAAAGAC\n" ] } ], "source": [ "#Complete solution for Contig generation problem\n", "#Generate the contigs from a collection of reads (with imperfect coverage).\n", "#Input: A collection of k-mers Patterns. \n", "#Output: All contigs in DeBruijn(Patterns).\n", "\n", "#Read input data\n", "kmers = [line.strip() for line in open('input/dataset_205_5.txt', 'r')]\n", "\n", "#Construct a dictionary of edges\n", "from collections import defaultdict\n", "def debruijn_from_kmer(kmers):\n", " '''Input: A collection of k-mers Patterns.\n", " Output: The adjacency list of the de Bruijn graph DeBruijn(Patterns).'''\n", " edges = defaultdict(list)\n", " for kmer in kmers:\n", " edges[kmer[:-1]].append(kmer[1:])\n", " return edges\n", "\n", "def classify_nodes(g):\n", " \"\"\"Returns balanced and unbalanced nodes as seperate lists from a graph of edges\"\"\"\n", " balanced, unbalanced = [], []\n", " out = reduce(lambda a,b: a+b, g.values())\n", " for node in set(out + g.keys()):\n", " indegrees = out.count(node)\n", " if node in g:\n", " outdegrees = len(g[node])\n", " else:\n", " outdegrees = 0\n", " \n", " if indegrees == outdegrees == 1:\n", " balanced.append(node)\n", " else:\n", " unbalanced.append(node)\n", " return balanced, unbalanced\n", "\n", "def maximalNonBranchingPaths(g):\n", " \"\"\" Input: The adjacency list of a graph whose nodes are integers.\n", " Output: The collection of all maximal nonbranching paths in this graph.\"\"\"\n", " balanced, unbalanced = classify_nodes(g)\n", " paths = []\n", " for node in g:\n", " if node in unbalanced:\n", " if len(g[node]) > 0:\n", " while len(g[node]) > 0:\n", " w = g[node].pop()\n", " nonbranchingPath = [node,w]\n", " while w in balanced:\n", " w = g[w].pop()\n", " nonbranchingPath.append(w)\n", " paths.append(nonbranchingPath)\n", " #Find isolated cycles and add to paths\n", " for node in g:\n", " if len(g[node]) > 0:\n", " if node in balanced:\n", " cycle = [node]\n", " w = g[node].pop()\n", " while w in balanced:\n", " cycle.append(w)\n", " if cycle[0] == cycle[-1]: \n", " break\n", " w = g[w].pop()\n", " paths.append(cycle)\n", " return paths\n", "\n", "def string_from_genome_path(kmers):\n", " return kmers[0] + ''.join(map(lambda x: x[-1], kmers[1:])) \n", "\n", "g = debruijn_from_kmer(kmers)\n", "paths = maximalNonBranchingPaths(g)\n", "get_contigs = [string_from_genome_path(contig) for contig in paths]\n", "contigs = sorted(get_contigs)\n", "print '\\n'.join(contigs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.5" } }, "nbformat": 4, "nbformat_minor": 0 }