{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "def neighbors1mm(kmer, alpha):\n", " ''' Generate all neighbors at Hamming distance 1 from kmer '''\n", " neighbors = []\n", " for j in range(len(kmer)-1, -1, -1):\n", " oldc = kmer[j]\n", " for c in alpha:\n", " if c == oldc: continue\n", " neighbors.append(kmer[:j] + c + kmer[j+1:])\n", " return neighbors" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['CAA', 'CAC', 'CAG', 'CCT', 'CGT', 'CTT', 'AAT', 'GAT', 'TAT']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "neighbors1mm('CAT', 'ACGT')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def kmerHist(reads, k):\n", " ''' Return k-mer histogram and average # k-mer occurrences '''\n", " kmerhist = {}\n", " for read in reads:\n", " for kmer in [ read[i:i+k] for i in range(len(read)-(k-1)) ]:\n", " kmerhist[kmer] = kmerhist.get(kmer, 0) + 1\n", " return kmerhist" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ATC': 9, 'CAT': 10, 'TCA': 9}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "khist = kmerHist(['CAT' * 10], 3)\n", "khist" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def correct1mm(read, k, kmerhist, alpha, thresh):\n", " ''' Return an error-corrected version of read. k = k-mer length.\n", " kmerhist is kmer count map. alpha is alphabet. thresh is\n", " count threshold above which k-mer is considered correct. '''\n", " # Iterate over k-mers in read\n", " for i in range(len(read)-(k-1)):\n", " kmer = read[i:i+k]\n", " # If k-mer is infrequent...\n", " if kmerhist.get(kmer, 0) <= thresh:\n", " # Look for a frequent neighbor\n", " for newkmer in neighbors1mm(kmer, alpha):\n", " if kmerhist.get(newkmer, 0) > thresh:\n", " # Found a frequent neighbor; replace old kmer\n", " # with neighbor\n", " read = read[:i] + newkmer + read[i+k:]\n", " break\n", " # Return possibly-corrected read\n", " return read" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'CAT'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "correct1mm('CAT', 3, khist, 'ACGT', 2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'CAT'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "correct1mm('CTT', 3, khist, 'ACGT', 2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 1 }