{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Naive Bayes Probability Model\n",
    "\n",
    "참고: [데이타 사이언스 스쿨 노트북](https://www.datascienceschool.net/view-notebook/c19b48e3c7b048668f2bb0a113bd25f7/#다항-분포-나이브-베이즈-모형)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(0)\n",
    "X = np.random.randint(2, size=(10, 4))\n",
    "y = np.array([0,0,0,0,1,1,1,1,1,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 1, 1, 0],\n",
       "       [1, 1, 1, 1],\n",
       "       [1, 1, 1, 0],\n",
       "       [0, 1, 0, 0],\n",
       "       [0, 0, 0, 1],\n",
       "       [0, 1, 1, 0],\n",
       "       [0, 1, 1, 1],\n",
       "       [1, 0, 1, 0],\n",
       "       [1, 0, 1, 1],\n",
       "       [0, 1, 1, 0]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 다항 분포 나이브 베이즈"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "clf = MultinomialNB().fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4., 6.])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.class_count_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.51082562])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.intercept_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.91629073, -0.51082562])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.class_log_prior_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2., 4., 3., 1.],\n",
       "       [2., 3., 5., 3.]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fc = clf.feature_count_\n",
    "fc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.54044504, -1.02961942, -1.25276297, -1.94591015],\n",
       "       [-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.feature_log_prob_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4., 7., 8., 4.])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 각 y class의 X elements 합, axis는 값을 합산하는 축\n",
    "# axis=None(default): elements 전체 합\n",
    "# axis=0: y 축 기준 합 \n",
    "fc.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([10., 13.])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# axis=1: x 축 기준 합\n",
    "fc.sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[10.],\n",
       "       [13.]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# newaxis는 길이 1의 새로운 축 생성. None과 동일하다.\n",
    "fc.sum(axis=1)[:, np.newaxis]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[10., 10., 10., 10.],\n",
       "       [13., 13., 13., 13.]])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# x 축 기준 합을 반복 축 따라(axis=1) 4회 반복.\n",
    "denominator = np.repeat(fc.sum(axis=1)[:, np.newaxis], 4, axis=1)\n",
    "denominator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.2       , 0.4       , 0.3       , 0.1       ],\n",
       "       [0.15384615, 0.23076923, 0.38461538, 0.23076923]])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fc / denominator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 라플라스 스무딩\n",
    "clf.alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.54044504, -1.02961942, -1.25276297, -1.94591015],\n",
       "       [-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.log((fc + clf.alpha) / (denominator + clf.alpha * X.shape[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.54044504, -1.02961942, -1.25276297, -1.94591015],\n",
       "       [-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.feature_log_prob_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.55131629, 0.44868371]])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_new = np.array([1,1,0,0])\n",
    "clf.predict_proba([x_new])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-3.48635519, -3.69234566])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "log_p = clf.class_log_prior_ + np.sum(clf.feature_log_prob_ * x_new, axis=1)\n",
    "log_p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.55131629, 0.44868371])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.exp(log_p) / np.sum(np.exp(log_p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 베르누이 나이브 베이즈"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import BernoulliNB\n",
    "clf_bern = BernoulliNB().fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -0.18232156, -0.40546511, -1.09861229],\n",
       "       [-0.98082925, -0.69314718, -0.28768207, -0.69314718]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_bern.feature_log_prob_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -0.18232156, -0.40546511, -1.09861229],\n",
       "       [-0.98082925, -0.69314718, -0.28768207, -0.69314718]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 베르누이 `feature_log_prob_` 계산\n",
    "np.log((clf_bern.feature_count_ + 1) / \\\n",
    "       (clf_bern.class_count_.reshape(-1, 1) + 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.72480181, 0.27519819]])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_bern.predict_proba([x_new])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -1.79175947, -1.09861229, -0.40546511],\n",
       "       [-0.47000363, -0.69314718, -1.38629436, -0.69314718]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neg_prob = np.log(1 - np.exp(clf_bern.feature_log_prob_))\n",
    "neg_prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -0.18232156, -0.        , -0.        ],\n",
       "       [-0.98082925, -0.69314718, -0.        , -0.        ]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_bern.feature_log_prob_ * x_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.        , -0.        , -1.09861229, -0.40546511],\n",
       "       [-0.        , -0.        , -1.38629436, -0.69314718]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neg_prob * (1 - x_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-3.29583687, -4.2642436 ])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "log_p_bern = clf_bern.class_log_prior_ + \\\n",
    "np.sum(clf_bern.feature_log_prob_ * x_new + \\\n",
    "       neg_prob * (1 - x_new), axis=1)\n",
    "log_p_bern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.72480181, 0.27519819])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.exp(log_p_bern) / np.sum(np.exp(log_p_bern))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}