{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import math\n", "from collections import defaultdict\n", "\n", "class MarkovChainOrder(object):\n", " ''' Simple higher-order Markov chain, specifically for DNA\n", " sequences. User provides training data (DNA strings). '''\n", " \n", " def __init__(self, examples, order=1):\n", " ''' Initialize the model given collection of example DNA\n", " strings. '''\n", " self.order = order\n", " self.mers = defaultdict(int)\n", " self.longestMer = longestMer = order + 1\n", " for ex in examples:\n", " # count up all k-mers of length 'longestMer' or shorter\n", " for i in range(len(ex) - longestMer + 1):\n", " for j in range(longestMer, -1, -1):\n", " self.mers[ex[i:i+j]] += 1\n", " \n", " def condProb(self, obs, given):\n", " ''' Return conditional probability of seeing nucleotide \"obs\"\n", " given we just saw nucleotide string \"given\". Length of\n", " \"given\" can't exceed model order. Return None if \"given\"\n", " was never observed. '''\n", " assert len(given) <= self.order\n", " ngiven = self.mers[given]\n", " if ngiven == 0: return None\n", " return float(self.mers[given + obs]) / self.mers[given]\n", " \n", " def jointProb(self, s):\n", " ''' Return joint probability of observing string s '''\n", " cum = 1.0\n", " for i in range(self.longestMer-1, len(s)):\n", " obs, given = s[i], s[i-self.longestMer+1:i]\n", " p = self.condProb(obs, given)\n", " if p is not None:\n", " cum *= p\n", " # include the smaller k-mers at the very beginning\n", " for j in range(self.longestMer-2, -1, -1):\n", " obs, given = s[j], s[:j]\n", " p = self.condProb(obs, given)\n", " if p is not None:\n", " cum *= p\n", " return cum\n", " \n", " def jointProbL(self, s):\n", " ''' Return log2 of joint probability of observing string s '''\n", " cum = 0.0\n", " for i in range(self.longestMer-1, len(s)):\n", " obs, given = s[i], s[i-self.longestMer+1:i]\n", " p = self.condProb(obs, given)\n", " if p is not None:\n", " cum += math.log(p, 2)\n", " # include the smaller k-mers at the very beginning\n", " for j in range(self.longestMer-2, -1, -1):\n", " obs, given = s[j], s[:j]\n", " p = self.condProb(obs, given)\n", " if p is not None:\n", " cum += math.log(p, 2)\n", " return cum" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "mc1 = MarkovChainOrder(['AC' * 10], 1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc1.condProb('A', 'C') # should be 1; C always followed by A" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc1.condProb('C', 'A') # should be 1; A always followed by C" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc1.condProb('G', 'A') # should be 0; A occurs but is never followed by G" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "mc2 = MarkovChainOrder(['AC' * 10], 2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc2.condProb('A', 'AC') # AC always followed by A" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc2.condProb('C', 'CA') # CA always followed by C" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc2.condProb('C', 'AA') is None # because AA doesn't occur" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "mc3 = MarkovChainOrder(['AAA1AAA2AAA2AAA3AAA3AAA3'], 3)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.3333333333333333" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc3.condProb('2', 'AAA') # 1/3" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mc3.condProb('3', 'AAA') # 1/2" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7619047619047619" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p1 = mc3.condProb('A', '')\n", "p1" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6875" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p2 = mc3.condProb('A', 'A')\n", "p2" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5454545454545454" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p3 = mc3.condProb('A', 'AA')\n", "p3" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.16666666666666666" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p4 = mc3.condProb('1', 'AAA')\n", "p4" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.0476190476190476, 0.04761904761904761)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p1 * p2 * p3 * p4, mc3.jointProb('AAA1') # should be equal" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(-4.392317422778761, -4.392317422778761)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import math\n", "math.log(mc3.jointProb('AAA1'), 2), mc3.jointProbL('AAA1') # should be equal" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 1 }