{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For Blog: [自然语言计算机形式分析的理论与方法笔记(Ch13) | Yam](https://yam.gift/2019/03/15/NLPFA/2019-03-15-Ch13-Ngram-and-Smoothing/)\n",
    "\n",
    "Each section below turns a small table of raw bigram counts into smoothed probabilities (Add-One, then Witten-Bell) and prints every row rescaled by the row word's unigram count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count of each word on its own; both formulas below use it as the row word's total.\n",
    "wc = {\n",
    "    \"I\": 3437,\n",
    "    \"want\": 1215,\n",
    "    \"to\": 3256,\n",
    "    \"eat\": 938,\n",
    "    \"Chinese\": 213,\n",
    "    \"food\": 1506,\n",
    "    \"lunch\": 459,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw bigram counts, one row per word in `wc` order (the row word's count is the\n",
    "# denominator term below). Columns presumably follow the same word order -- confirm\n",
    "# against the source table.\n",
    "rawbigramc = [\n",
    "    [8, 1087, 0, 12, 0, 0, 0],\n",
    "    [3, 0, 786, 0, 6, 8, 6],\n",
    "    [3, 0, 10, 860, 3, 0, 12],\n",
    "    [0, 0, 2, 0, 19, 2, 52],\n",
    "    [2, 0, 0, 0, 0, 120, 1],\n",
    "    [19, 0, 17, 0, 0, 0, 0],\n",
    "    [4, 0, 0, 0, 0, 1, 0],\n",
    "]\n",
    "\n",
    "# The table must be square and line up with the words in `wc`.\n",
    "assert len(rawbigramc) == len(wc)\n",
    "assert all(len(row) == len(wc) for row in rawbigramc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Used as V in both formulas below (`count + V` for Add-One, `V - T` for Witten-Bell).\n",
    "# NOTE(review): the name suggests a token count, but the usage matches a vocabulary\n",
    "# (word type) count -- confirm against the source text.\n",
    "tokenc = 1616"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-word value used as T in the Witten-Bell formula, keyed as \"<word> x\".\n",
    "# Presumably the number of distinct word types seen after <word> -- confirm.\n",
    "wtype_bic = {\n",
    "    \"I x\": 95,\n",
    "    \"want x\": 76,\n",
    "    \"to x\": 130,\n",
    "    \"eat x\": 124,\n",
    "    \"Chinese x\": 20,\n",
    "    \"food x\": 82,\n",
    "    \"lunch x\": 45\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "words = list(wc)  # single source for the row order shared by every table below\n",
    "\n",
    "\n",
    "def print_rescaled(prob):\n",
    "    \"\"\"Print each row of `prob` multiplied by its row word's `wc` count, rounded to 3 places.\"\"\"\n",
    "    for word, row in zip(words, prob):\n",
    "        # .tolist() yields plain Python floats, so the printed list is the same\n",
    "        # under NumPy 1.x and 2.x (2.x scalars would print as np.float64(...)).\n",
    "        print([round(value, 3) for value in (row * wc[word]).tolist()])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add-One"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "# Add one to every bigram count, including the zeros.\n",
    "bigramc = [[count + 1 for count in row] for row in rawbigramc]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# p = (count + 1) / (wc[word] + tokenc), row by row.\n",
    "prob_addone = np.array([\n",
    "    [count / (wc[word] + tokenc) for count in row]\n",
    "    for word, row in zip(words, bigramc)\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[6.122, 740.047, 0.68, 8.842, 0.68, 0.68, 0.68]\n",
      "[1.717, 0.429, 337.762, 0.429, 3.004, 3.863, 3.004]\n",
      "[2.673, 0.668, 7.351, 575.414, 2.673, 0.668, 8.688]\n",
      "[0.367, 0.367, 1.102, 0.367, 7.345, 1.102, 19.465]\n",
      "[0.349, 0.116, 0.116, 0.116, 0.116, 14.091, 0.233]\n",
      "[9.648, 0.482, 8.683, 0.482, 0.482, 0.482, 0.482]\n",
      "[1.106, 0.221, 0.221, 0.221, 0.221, 0.442, 0.221]\n"
     ]
    }
   ],
   "source": [
    "print_rescaled(prob_addone)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Witten-Bell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "wb_rows = []\n",
    "for word, row in zip(words, rawbigramc):\n",
    "    n = wc[word]\n",
    "    t = wtype_bic[word + \" x\"]\n",
    "    z = tokenc - t\n",
    "    # Seen bigrams get count / (n + t); each unseen one gets t / ((n + t) * z).\n",
    "    wb_rows.append([\n",
    "        count / (n + t) if count > 0 else t / ((n + t) * z)\n",
    "        for count in row\n",
    "    ])\n",
    "prob_wb = np.array(wb_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[7.785, 1057.763, 0.061, 11.677, 0.061, 0.061, 0.061]\n",
      "[2.823, 0.046, 739.729, 0.046, 5.647, 7.529, 5.647]\n",
      "[2.885, 0.084, 9.616, 826.982, 2.885, 0.084, 11.539]\n",
      "[0.073, 0.073, 1.766, 0.073, 16.782, 1.766, 45.928]\n",
      "[1.828, 0.011, 0.011, 0.011, 0.011, 109.7, 0.914]\n",
      "[18.019, 0.051, 16.122, 0.051, 0.051, 0.051, 0.051]\n",
      "[3.643, 0.026, 0.026, 0.026, 0.026, 0.911, 0.026]\n"
     ]
    }
   ],
   "source": [
    "print_rescaled(prob_wb)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}