{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For Blog: [自然语言计算机形式分析的理论与方法笔记(Ch13) | Yam](https://yam.gift/2019/03/15/NLPFA/2019-03-15-Ch13-Ngram-and-Smoothing/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "wc = {\n",
    "    \"I\": 3437,\n",
    "    \"want\": 1215,\n",
    "    \"to\": 3256,\n",
    "    \"eat\": 938,\n",
    "    \"Chinese\": 213,\n",
    "    \"food\": 1506,\n",
    "    \"lunch\": 459,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "rawbigramc = [\n",
    "    [8    , 1087 , 0    , 12   , 0       , 0    , 0]     ,\n",
    "    [3    , 0    , 786  , 0    , 6       , 8    , 6]     ,\n",
    "    [3    , 0    , 10   , 860  , 3       , 0    , 12]    ,\n",
    "    [0    , 0    , 2    , 0    , 19      , 2    , 52]    ,\n",
    "    [2    , 0    , 0    , 0    , 0       , 120  , 1]     ,\n",
    "    [19   , 0    , 17   , 0    , 0       , 0    , 0]     ,\n",
    "    [4    , 0    , 0    , 0    , 0       , 1    , 0]     ,\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): despite the \"token\" name, this value is added to wc[w] in the\n",
    "# add-one denominator and has wtype_bic[...] subtracted from it in the\n",
    "# Witten-Bell cell, so it presumably holds the vocabulary size V (number of\n",
    "# word types) rather than a token count -- confirm against the blog post.\n",
    "tokenc = 1616"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "wtype_bic = {\n",
    "    \"I x\": 95,\n",
    "    \"want x\": 76,\n",
    "    \"to x\": 130,\n",
    "    \"eat x\": 124,\n",
    "    \"Chinese x\": 20,\n",
    "    \"food x\": 82,\n",
    "    \"lunch x\": 45\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add-One"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "# Add one to every raw bigram count; rawbigramc itself is left untouched.\n",
    "bigramc = [[count + 1 for count in row] for row in rawbigramc]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add-one estimate per row: (count + 1) / (wc[word] + tokenc).\n",
    "# bigramc already holds the incremented counts.\n",
    "res = []\n",
    "for idx, word in enumerate(\"I want to eat Chinese food lunch\".split()):\n",
    "    denom = wc[word] + tokenc\n",
    "    res.append([count / denom for count in bigramc[idx]])\n",
    "prob = np.array(res).reshape(7, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[6.122, 740.047, 0.68, 8.842, 0.68, 0.68, 0.68]\n",
      "[1.717, 0.429, 337.762, 0.429, 3.004, 3.863, 3.004]\n",
      "[2.673, 0.668, 7.351, 575.414, 2.673, 0.668, 8.688]\n",
      "[0.367, 0.367, 1.102, 0.367, 7.345, 1.102, 19.465]\n",
      "[0.349, 0.116, 0.116, 0.116, 0.116, 14.091, 0.233]\n",
      "[9.648, 0.482, 8.683, 0.482, 0.482, 0.482, 0.482]\n",
      "[1.106, 0.221, 0.221, 0.221, 0.221, 0.442, 0.221]\n"
     ]
    }
   ],
   "source": [
    "# Multiply each probability row by its word's count and print it rounded to 3 places.\n",
    "for idx, word in enumerate(wc):\n",
    "    scaled = prob[idx] * wc[word]\n",
    "    print([round(value, 3) for value in scaled])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Witten-Bell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Witten-Bell estimate per row, with n = wc[word], t = wtype_bic[word + \" x\"],\n",
    "# z = tokenc - t:\n",
    "#   count > 0  ->  count / (n + t)\n",
    "#   count == 0 ->  t / ((n + t) * z)\n",
    "res = []\n",
    "for idx, word in enumerate(\"I want to eat Chinese food lunch\".split()):\n",
    "    n = wc[word]\n",
    "    t = wtype_bic[word + \" x\"]\n",
    "    z = tokenc - t\n",
    "    denom = n + t\n",
    "    row = []\n",
    "    for count in rawbigramc[idx]:\n",
    "        if count > 0:\n",
    "            row.append(count / denom)\n",
    "        else:\n",
    "            row.append(t / (denom * z))\n",
    "    res.append(row)\n",
    "prob = np.array(res).reshape(7, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[7.785, 1057.763, 0.061, 11.677, 0.061, 0.061, 0.061]\n",
      "[2.823, 0.046, 739.729, 0.046, 5.647, 7.529, 5.647]\n",
      "[2.885, 0.084, 9.616, 826.982, 2.885, 0.084, 11.539]\n",
      "[0.073, 0.073, 1.766, 0.073, 16.782, 1.766, 45.928]\n",
      "[1.828, 0.011, 0.011, 0.011, 0.011, 109.7, 0.914]\n",
      "[18.019, 0.051, 16.122, 0.051, 0.051, 0.051, 0.051]\n",
      "[3.643, 0.026, 0.026, 0.026, 0.026, 0.911, 0.026]\n"
     ]
    }
   ],
   "source": [
    "# Multiply each probability row by its word's count and print it rounded to 3 places.\n",
    "for idx, (word, total) in enumerate(wc.items()):\n",
    "    print([round(value, 3) for value in prob[idx] * total])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}