{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Naive Bayes Probability Model\n",
    "\n",
    "참고: [데이타 사이언스 스쿨 노트북](https://www.datascienceschool.net/view-notebook/c19b48e3c7b048668f2bb0a113bd25f7/#다항-분포-나이브-베이즈-모형)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
    "\n",
    "# Fixed seed so the toy data set (and every output below) is reproducible.\n",
    "np.random.seed(0)\n",
    "X = np.random.randint(2, size=(10, 4))\n",
    "y = np.array([0,0,0,0,1,1,1,1,1,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 1, 1, 0],\n",
       "       [1, 1, 1, 1],\n",
       "       [1, 1, 1, 0],\n",
       "       [0, 1, 0, 0],\n",
       "       [0, 0, 0, 1],\n",
       "       [0, 1, 1, 0],\n",
       "       [0, 1, 1, 1],\n",
       "       [1, 0, 1, 0],\n",
       "       [1, 0, 1, 1],\n",
       "       [0, 1, 1, 0]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 다항 분포 나이브 베이즈"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clf = MultinomialNB().fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4., 6.])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.class_count_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.51082562])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Log prior of the positive class (class 1). Older scikit-learn exposed this\n",
    "# as `clf.intercept_`; that attribute was deprecated in 0.24 and removed in 1.1.\n",
    "clf.class_log_prior_[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.91629073, -0.51082562])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.class_log_prior_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2., 4., 3., 1.],\n",
       "       [2., 3., 5., 3.]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fc = clf.feature_count_\n",
    "fc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.54044504, -1.02961942, -1.25276297, -1.94591015],\n",
       "       [-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.feature_log_prob_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature log-probabilities of the positive class (class 1). Older scikit-learn\n",
    "# exposed this as `clf.coef_`; that attribute was deprecated in 0.24 and removed in 1.1.\n",
    "clf.feature_log_prob_[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4., 7., 8., 4.])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `axis` names the axis that is summed over (collapsed).\n",
    "# axis=None (default): sum of all elements.\n",
    "# axis=0: sum down the rows (over the classes) -> one total per feature.\n",
    "fc.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([10., 13.])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# axis=1: sum across the columns (over the features) -> one total per class.\n",
    "fc.sum(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[10.],\n",
       "       [13.]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# np.newaxis inserts a new axis of length 1; it is an alias for None.\n",
    "fc.sum(axis=1)[:, np.newaxis]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[10., 10., 10., 10.],\n",
       "       [13., 13., 13., 13.]])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Repeat the per-class totals along axis=1, once per feature,\n",
    "# so that the denominator has the same shape as fc.\n",
    "denominator = np.repeat(fc.sum(axis=1)[:, np.newaxis], fc.shape[1], axis=1)\n",
    "denominator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.2       , 0.4       , 0.3       , 0.1       ],\n",
       "       [0.15384615, 0.23076923, 0.38461538, 0.23076923]])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fc / denominator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Laplace (additive) smoothing parameter.\n",
    "clf.alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.54044504, -1.02961942, -1.25276297, -1.94591015],\n",
       "       [-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.log((fc + clf.alpha) / (denominator + clf.alpha * X.shape[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.54044504, -1.02961942, -1.25276297, -1.94591015],\n",
       "       [-1.73460106, -1.44691898, -1.04145387, -1.44691898]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.feature_log_prob_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.55131629, 0.44868371]])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_new = np.array([1,1,0,0])\n",
    "clf.predict_proba([x_new])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-3.48635519, -3.69234566])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "log_p = clf.class_log_prior_ + np.sum(clf.feature_log_prob_ * x_new, axis=1)\n",
    "log_p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.55131629, 0.44868371])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.exp(log_p) / np.sum(np.exp(log_p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 베르누이 나이브 베이즈"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf_bern = BernoulliNB().fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -0.18232156, -0.40546511, -1.09861229],\n",
       "       [-0.98082925, -0.69314718, -0.28768207, -0.69314718]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_bern.feature_log_prob_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -0.18232156, -0.40546511, -1.09861229],\n",
       "       [-0.98082925, -0.69314718, -0.28768207, -0.69314718]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recompute the Bernoulli `feature_log_prob_` by hand. Each feature has two\n",
    "# outcomes (0 or 1), so the smoothed denominator adds alpha once per outcome.\n",
    "np.log(\n",
    "    (clf_bern.feature_count_ + clf_bern.alpha)\n",
    "    / (clf_bern.class_count_.reshape(-1, 1) + 2 * clf_bern.alpha)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.72480181, 0.27519819]])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_bern.predict_proba([x_new])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -1.79175947, -1.09861229, -0.40546511],\n",
       "       [-0.47000363, -0.69314718, -1.38629436, -0.69314718]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Log-probability that each feature is 0, i.e. log(1 - p).\n",
    "neg_prob = np.log(1 - np.exp(clf_bern.feature_log_prob_))\n",
    "neg_prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.69314718, -0.18232156, -0.        , -0.        ],\n",
       "       [-0.98082925, -0.69314718, -0.        , -0.        ]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_bern.feature_log_prob_ * x_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.        , -0.        , -1.09861229, -0.40546511],\n",
       "       [-0.        , -0.        , -1.38629436, -0.69314718]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neg_prob * (1 - x_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-3.29583687, -4.2642436 ])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Unnormalised log posterior: log prior + log-likelihood of the features that\n",
    "# are 1 + log-likelihood of the features that are 0.\n",
    "log_p_bern = clf_bern.class_log_prior_ + np.sum(\n",
    "    clf_bern.feature_log_prob_ * x_new + neg_prob * (1 - x_new), axis=1\n",
    ")\n",
    "log_p_bern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.72480181, 0.27519819])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.exp(log_p_bern) / np.sum(np.exp(log_p_bern))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}