{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "from collections import defaultdict\n",
    "\n",
    "class MarkovChainOrder(object):\n",
    "    ''' Simple higher-order Markov chain, specifically for DNA\n",
    "        sequences.  User provides training data (DNA strings).  '''\n",
    "    \n",
    "    def __init__(self, examples, order=1):\n",
    "        ''' Initialize the model given collection of example DNA\n",
    "            strings. '''\n",
    "        self.order = order\n",
    "        self.mers = defaultdict(int)\n",
    "        self.longestMer = longestMer = order + 1\n",
    "        for ex in examples:\n",
    "            # count up all k-mers of length 'longestMer' or shorter\n",
    "            for i in range(len(ex) - longestMer + 1):\n",
    "                for j in range(longestMer, -1, -1):\n",
    "                    self.mers[ex[i:i+j]] += 1\n",
    "    \n",
    "    def condProb(self, obs, given):\n",
    "        ''' Return conditional probability of seeing nucleotide \"obs\"\n",
    "            given we just saw nucleotide string \"given\".  Length of\n",
    "            \"given\" can't exceed model order.  Return None if \"given\"\n",
    "            was never observed. '''\n",
    "        assert len(given) <= self.order\n",
    "        ngiven = self.mers[given]\n",
    "        if ngiven == 0: return None\n",
    "        return float(self.mers[given + obs]) / self.mers[given]\n",
    "    \n",
    "    def jointProb(self, s):\n",
    "        ''' Return joint probability of observing string s '''\n",
    "        cum = 1.0\n",
    "        for i in range(self.longestMer-1, len(s)):\n",
    "            obs, given = s[i], s[i-self.longestMer+1:i]\n",
    "            p = self.condProb(obs, given)\n",
    "            if p is not None:\n",
    "                cum *= p\n",
    "        # include the smaller k-mers at the very beginning\n",
    "        for j in range(self.longestMer-2, -1, -1):\n",
    "            obs, given = s[j], s[:j]\n",
    "            p = self.condProb(obs, given)\n",
    "            if p is not None:\n",
    "                cum *= p\n",
    "        return cum\n",
    "    \n",
    "    def jointProbL(self, s):\n",
    "        ''' Return log2 of joint probability of observing string s '''\n",
    "        cum = 0.0\n",
    "        for i in range(self.longestMer-1, len(s)):\n",
    "            obs, given = s[i], s[i-self.longestMer+1:i]\n",
    "            p = self.condProb(obs, given)\n",
    "            if p is not None:\n",
    "                cum += math.log(p, 2)\n",
    "        # include the smaller k-mers at the very beginning\n",
    "        for j in range(self.longestMer-2, -1, -1):\n",
    "            obs, given = s[j], s[:j]\n",
    "            p = self.condProb(obs, given)\n",
    "            if p is not None:\n",
    "                cum += math.log(p, 2)\n",
    "        return cum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "mc1 = MarkovChainOrder(['AC' * 10], 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc1.condProb('A', 'C') # should be 1; C always followed by A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc1.condProb('C', 'A') # should be 1; A always followed by C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc1.condProb('G', 'A') # should be 0; A occurs but is never followed by G"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "mc2 = MarkovChainOrder(['AC' * 10], 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc2.condProb('A', 'AC') # AC always followed by A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc2.condProb('C', 'CA') # CA always followed by C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc2.condProb('C', 'AA') is None # because AA doesn't occur"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "mc3 = MarkovChainOrder(['AAA1AAA2AAA2AAA3AAA3AAA3'], 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3333333333333333"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc3.condProb('2', 'AAA') # 1/3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mc3.condProb('3', 'AAA') # 1/2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7619047619047619"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1 = mc3.condProb('A', '')\n",
    "p1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6875"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p2 = mc3.condProb('A', 'A')\n",
    "p2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5454545454545454"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p3 = mc3.condProb('A', 'AA')\n",
    "p3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.16666666666666666"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p4 = mc3.condProb('1', 'AAA')\n",
    "p4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.0476190476190476, 0.04761904761904761)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1 * p2 * p3 * p4, mc3.jointProb('AAA1') # should be equal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(-4.392317422778761, -4.392317422778761)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import math\n",
    "math.log(mc3.jointProb('AAA1'), 2), mc3.jointProbL('AAA1') # should be equal"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}