{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "dictionary = {\n", " \"北京烤鸭\",\n", " \"特别\",\n", " \"喜欢\"\n", "}\n", "sentence = \"他特别喜欢北京烤鸭\"" ] }, { "cell_type": "code", "execution_count": 248, "metadata": {}, "outputs": [], "source": [ "# 最大后向匹配（递归）\n", "def seg(sentence):\n", " res = []\n", " def maxmatch(res, sentence, dictionary):\n", " if len(sentence) == 0:\n", " return list()\n", " for i in reversed(range(1, len(sentence)+1)):\n", " fw = sentence[:i]\n", " rw = sentence[i:]\n", " if fw in dictionary:\n", " res.append(fw)\n", " return maxmatch(res, rw, dictionary)\n", " res.append(fw)\n", " return maxmatch(res, rw, dictionary)\n", " maxmatch(res, sentence, dictionary)\n", " return res" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['他', '特别', '喜欢', '北京烤鸭']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seg(sentence)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 字典分词" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "参考资料：\n", "\n", "- [ysc/word: Java 分布式中文分词组件 - word 分词](https://github.com/ysc/word)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 词典" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pnlp" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "415870\n" ] }, { "data": { "text/plain": [ "16" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# word\n", "dictionary = pnlp.read_lines(\"./dic.txt\")\n", "# 字典中不能包括一个字的词，否则都被切成一个字了\n", "dictionary = [w for w in dictionary if len(w) > 1]\n", "dictionary = set(dictionary)\n", "print(len(dictionary))\n", "\n", "MAX_LEN = max(len(w) for w in dictionary)\n", "MAX_LEN" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "349045\n" ] }, { "data": { "text/plain": [ "16" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 结巴\n", "\n", "dict_path = \"/Users/HaoShaochun/Documents/Study/NLP/jieba/jieba/dict.txt\"\n", "\n", "dictionary = pnlp.read_lines(dict_path)\n", "# 字典中不能包括一个字的词，否则都被切成一个字了\n", "dictionary = [line.split(\" \")[0] for line in dictionary]\n", "dictionary = set(dictionary)\n", "print(len(dictionary))\n", "\n", "MAX_LEN = max(len(w) for w in dictionary)\n", "MAX_LEN" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 最大正向匹配" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [], "source": [ "def forward_max_match2(sent):\n", " res = []\n", " while len(sent) > 0:\n", " length = min(MAX_LEN, len(sent))\n", " try_word = sent[:length]\n", " while try_word not in dictionary:\n", " if len(try_word) == 1:\n", " break\n", " try_word = try_word[:-1]\n", " res.append(try_word)\n", " sent = sent[len(try_word):]\n", " return res" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [], "source": [ "def forward_max_match(sent):\n", " res = []\n", " length = MAX_LEN\n", " text_len = len(sent)\n", " start = 0\n", " while start < text_len:\n", " length = min(length, text_len - start)\n", " while sent[start:start+length] not in dictionary:\n", " if length == 1:\n", " break\n", " length -= 1\n", " res.append(sent[start:start+length])\n", " start += length\n", " length = MAX_LEN\n", " return res" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "sen = \"美国加州大学的科学家发现\"" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['美国加州大学', '的', '科学家', '发现']" ] }, "execution_count": 165, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forward_max_match(sen)" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['美国加州大学', '的', '科学家', '发现']" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forward_max_match2(sen)" ] }, { "cell_type": "code", "execution_count": 171, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8.67 µs ± 397 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" ] } ], "source": [ "%timeit forward_max_match(sen)" ] }, { "cell_type": "code", "execution_count": 172, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8.51 µs ± 188 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" ] } ], "source": [ "%timeit forward_max_match2(sen)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 正向最小匹配" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [], "source": [ "def forward_min_match(sent):\n", " res = []\n", " length = 1\n", " text_len = len(sent)\n", " start = 0\n", " while start < text_len:\n", " while sent[start:start+length] not in dictionary:\n", " if length == MAX_LEN or length == text_len - start:\n", " length = 1\n", " break\n", " length += 1\n", " res.append(sent[start:start+length])\n", " start += length\n", " length = 1\n", " return res" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [], "source": [ "sen = \"美国加州大学的科学家发现\"" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['美国', '加州', '大学', '的', '科学', '家', '发现']" ] }, "execution_count": 159, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forward_min_match(sen)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 最大反向匹配" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "def backward_max_match(sent):\n", " res = []\n", " length = MAX_LEN\n", " text_len = len(sent)\n", " start = max(0, text_len - length)\n", " length = min(length, text_len - start)\n", " while start >= 0 and length > 0:\n", " while sent[start: start+length] not in dictionary:\n", " if length == 1:\n", " break\n", " length -= 1\n", " start += 1\n", " res.append(sent[start: start+length])\n", " length = MAX_LEN\n", " if length > start:\n", " length = start\n", " start -= length\n", " return list(reversed(res))" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [], "source": [ "MAX_LEN = max(len(w) for w in dictionary)" ] }, { "cell_type": "code", "execution_count": 227, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7, 16)" ] }, "execution_count": 227, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sen = \"研究生命的起源\"\n", "len(sen), MAX_LEN" ] }, { "cell_type": "code", "execution_count": 228, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['研究', '生命', '的', '起源']" ] }, "execution_count": 228, "metadata": {}, "output_type": "execute_result" } ], "source": [ "backward_max_match(sen)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 最小反向匹配" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [], "source": [ "def backward_min_match(sent):\n", " res = []\n", " length = 1\n", " text_len = len(sent)\n", " start = text_len - length\n", " while start >= 0:\n", " while sent[start: start+length] not in dictionary:\n", " length += 1\n", " start -= 1\n", " if length == MAX_LEN or start < 0:\n", " start += length - 1\n", " length = 1\n", " break\n", " res.append(sent[start: start+length])\n", " start -= 1\n", " length = 1\n", " return list(reversed(res))" ] }, { "cell_type": "code", "execution_count": 243, "metadata": {}, "outputs": [], "source": [ "sen = \"美国加州大学的科学家发现\"" ] }, { "cell_type": "code", "execution_count": 244, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['美国', '加州', '大学', '的', '科', '学家', '发现']" ] }, "execution_count": 244, "metadata": {}, "output_type": "execute_result" } ], "source": [ "backward_min_match(sen)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 数据对比" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [], "source": [ "sens = [\n", " \"中华人民共和国万岁万岁万万岁\",\n", " \"乔布斯是 Apple 产品的设计者\",\n", " \"美国加州大学的科学家发现\",\n", " \"研究生命的起源\",\n", " \"长春市长春节致辞\",\n", " \"他从马上下来\",\n", " \"有意见分歧\",\n", " \"确实现在物价很高\",\n", " \"答辩结束的和尚未答辩的同学都请留在教室\"\n", "]" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "最大前向匹配：中华人民共和国/万岁/万岁/万万岁\n", "最大后向匹配：中华人民共和国/万岁/万岁/万万岁\n", "最小前向匹配：中/华/人/民/共/和/国/万/岁/万/岁/万/万/岁\n", "最小后向匹配：中/华/人/民/共/和/国/万/岁/万/岁/万/万/岁\n", "\n", "最大前向匹配：乔布斯/是/ /A/p/p/l/e/ /产品/的/设计者\n", "最大后向匹配：乔布斯/是/ /A/p/p/l/e/ /产品/的/设计者\n", "最小前向匹配：乔/布/斯/是/ /A/p/p/l/e/ /产/品/的/设/计/者\n", "最小后向匹配：乔/布/斯/是/ /A/p/p/l/e/ /产/品/的/设/计/者\n", "\n", "最大前向匹配：美国加州大学/的/科学家/发现\n", "最大后向匹配：美国加州大学/的/科学家/发现\n", "最小前向匹配：美/国/加/州/大/学/的/科/学/家/发/现\n", "最小后向匹配：美/国/加/州/大/学/的/科/学/家/发/现\n", "\n", "最大前向匹配：研究生/命/的/起源\n", "最大后向匹配：研究/生命/的/起源\n", "最小前向匹配：研/究/生/命/的/起/源\n", "最小后向匹配：研/究/生/命/的/起/源\n", "\n", "最大前向匹配：长春市/长春/节/致辞\n", "最大后向匹配：长春/市长/春节/致辞\n", "最小前向匹配：长/春/市/长/春/节/致/辞\n", "最小后向匹配：长/春/市/长/春/节/致/辞\n", "\n", "最大前向匹配：他/从/马上/下来\n", "最大后向匹配：他/从/马上/下来\n", "最小前向匹配：他/从/马/上/下/来\n", "最小后向匹配：他/从/马/上/下/来\n", "\n", "最大前向匹配：有意/见/分歧\n", "最大后向匹配：有/意见分歧\n", "最小前向匹配：有/意/见/分/歧\n", "最小后向匹配：有/意/见/分/歧\n", "\n", "最大前向匹配：确实/现在/物价/很/高\n", "最大后向匹配：确实/现在/物价/很/高\n", "最小前向匹配：确/实/现/在/物/价/很/高\n", "最小后向匹配：确/实/现/在/物/价/很/高\n", "\n", "最大前向匹配：答辩/结束/的/和尚/未/答辩/的/同学/都/请留/在/教室\n", "最大后向匹配：答辩/结束/的/和/尚未/答辩/的/同学/都/请/留在/教室\n", "最小前向匹配：答/辩/结/束/的/和/尚/未/答/辩/的/同/学/都/请/留/在/教/室\n", "最小后向匹配：答/辩/结/束/的/和/尚/未/答/辩/的/同/学/都/请/留/在/教/室\n", "\n" ] } ], "source": [ "for sen in sens:\n", " \n", " res = forward_max_match(sen)\n", " print(\"最大前向匹配：\", \"/\".join(res))\n", " \n", " res = backward_max_match(sen)\n", " print(\"最大后向匹配：\", \"/\".join(res))\n", " \n", " res = forward_min_match(sen)\n", " print(\"最小前向匹配：\", \"/\".join(res))\n", " \n", " res = backward_min_match(sen)\n", " print(\"最小后向匹配：\", \"/\".join(res))\n", " \n", " print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 统计分词" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DAG" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "import re\n", "from math import log" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "re_han = re.compile(\"([\\u4E00-\\u9FD5]+)\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['', '地方收到分手的饭', '880890df&*())(^', '范德萨发', '']" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res = re_han.split(\"地方收到分手的饭880890df&*())(^范德萨发\")\n", "res" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "re_skip = re.compile(\"([a-zA-Z0-9]+(?:\\.\\d+)?%?)\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)\n", "\n", "re_eng = re.compile('[a-zA-Z0-9]', re.U)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def gen_pfdict():\n", " lfreq = {}\n", " ltotal = 0\n", " lines = pnlp.read_lines(dict_path)\n", " for lineno, line in enumerate(lines, 1):\n", " try:\n", " word, freq = line.split(' ')[:2]\n", " freq = int(freq)\n", " lfreq[word] = freq\n", " ltotal += freq\n", " for ch in range(len(word)):\n", " wfrag = word[:ch + 1]\n", " if wfrag not in lfreq:\n", " lfreq[wfrag] = 0\n", " except ValueError:\n", " raise ValueError(\n", " 'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))\n", " return lfreq, ltotal" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "FREQ, total = gen_pfdict()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def get_DAG(sentence):\n", " DAG = {}\n", " N = len(sentence)\n", " for k in range(N):\n", " tmplist = []\n", " i = k\n", " frag = sentence[k]\n", " while i < N and frag in FREQ:\n", " if FREQ[frag]:\n", " tmplist.append(i)\n", " i += 1\n", " frag = sentence[k:i + 1]\n", " if not tmplist:\n", " tmplist.append(k)\n", " DAG[k] = tmplist\n", " return DAG" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "text = \"他来到了网易杭研大厦\"\n", "# text = \"确实现在物价很高\"\n", "text = \"答辩结束的和尚未答辩的同学都请留在教室\"\n", "# text = \"中华人民\"\n", "# text = \"懂法萨芬dsfsdfsf^&*()7989 love\"" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"辩\" in FREQ" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: [0, 1],\n", " 1: [1],\n", " 2: [2, 3],\n", " 3: [3],\n", " 4: [4],\n", " 5: [5, 6],\n", " 6: [6, 7],\n", " 7: [7],\n", " 8: [8, 9],\n", " 9: [9],\n", " 10: [10],\n", " 11: [11, 12],\n", " 12: [12],\n", " 13: [13],\n", " 14: [14, 15],\n", " 15: [15, 16],\n", " 16: [16],\n", " 17: [17, 18],\n", " 18: [18]}" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DAG = get_DAG(text)\n", "DAG" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "17.91155312775522" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "log(total)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def calc(sentence, DAG):\n", " route = {}\n", " N = len(sentence)\n", " route[N] = (0, 0)\n", " logtotal = log(total)\n", " for idx in range(N - 1, -1, -1):\n", " tmp = []\n", " for x in DAG[idx]:\n", " val = log(FREQ.get(sentence[idx: x+1], 1)) - logtotal + route[x+1][0]\n", " tmp.append((val, x))\n", " route[idx] = max(tmp)\n", " return route" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{19: (0, 0),\n", " 18: (-9.486036724910885, 18),\n", " 17: (-11.413270978278785, 18),\n", " 16: (-15.826884543913714, 16),\n", " 15: (-21.514471422309715, 16),\n", " 14: (-29.3602906051659, 14),\n", " 13: (-35.05196680641455, 13),\n", " 12: (-43.194592875029336, 12),\n", " 11: (-44.57506961864625, 12),\n", " 10: (-49.81422510450637, 10),\n", " 9: (-60.645751732339, 9),\n", " 8: (-62.60778441984483, 9),\n", " 7: (-70.79002482672212, 7),\n", " 6: (-72.27710765622783, 7),\n", " 5: (-76.96046999993145, 5),\n", " 4: (-82.19962548579157, 4),\n", " 3: (-93.23079453136079, 3),\n", " 2: (-91.01244041815191, 3),\n", " 1: (-101.84396704598454, 1),\n", " 0: (-103.80599973349038, 1)}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "calc(text, DAG)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "def __cut_DAG_NO_HMM(sentence):\n", " DAG = get_DAG(sentence)\n", " route = calc(sentence, DAG)\n", " x = 0\n", " N = len(sentence)\n", " buf = ''\n", " while x < N:\n", " y = route[x][1] + 1\n", " l_word = sentence[x:y]\n", " print(l_word)\n", " if re_eng.match(l_word) and len(l_word) == 1:\n", " buf += l_word\n", " x = y\n", " else:\n", " if buf:\n", " yield buf\n", " buf = ''\n", " yield l_word\n", " x = y\n", " if buf:\n", " yield buf\n", " buf = ''" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "答辩\n", "结束\n", "的\n", "和\n", "尚未\n", "答辩\n", "的\n", "同学\n", "都\n", "请\n", "留在\n", "教室\n" ] }, { "data": { "text/plain": [ "['答辩', '结束', '的', '和', '尚未', '答辩', '的', '同学', '都', '请', '留在', '教室']" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(__cut_DAG_NO_HMM(text))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Viterbi" ] }, { "cell_type": "code", "execution_count": 315, "metadata": {}, "outputs": [], "source": [ "import jieba\n", "\n", "MIN_FLOAT = -3.14e100\n", "PrevStatus = {\n", " 'B': 'ES',\n", " 'M': 'MB',\n", " 'S': 'SE',\n", " 'E': 'BM'\n", "}" ] }, { "cell_type": "code", "execution_count": 316, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'B': -0.26268660809250016,\n", " 'E': -3.14e+100,\n", " 'M': -3.14e+100,\n", " 'S': -1.4652633398537678}" ] }, "execution_count": 316, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jieba.finalseg.start_P" ] }, { "cell_type": "code", "execution_count": 317, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'B': {'E': -0.51082562376599, 'M': -0.916290731874155},\n", " 'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},\n", " 'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},\n", " 'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}" ] }, "execution_count": 317, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jieba.finalseg.trans_P" ] }, { "cell_type": "code", "execution_count": 318, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'B': 'ES', 'M': 'MB', 'S': 'SE', 'E': 'BM'}" ] }, "execution_count": 318, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PrevStatus" ] }, { "cell_type": "code", "execution_count": 319, "metadata": {}, "outputs": [], "source": [ "def viterbi(sent, states, start_p, trans_p, emit_p):\n", " V = [{}] # tabular\n", " path = {}\n", " for y in states: # init\n", " V[0][y] = start_p[y] + emit_p[y].get(sent[0], MIN_FLOAT)\n", " path[y] = [y]\n", " for t in range(1, len(sent)):\n", " V.append({})\n", " newpath = {}\n", " for y in states:\n", " # 发射概率，state -> 观测值（token）\n", " em_p = emit_p[y].get(sent[t], MIN_FLOAT)\n", " # PrevStatus 是当前状态的前一个状态，来自转移概率\n", " # 判断前一个状态到现在状态的概率最大情况\n", " # 从现在状态到现在观测值的发射概率 * 前一个状态到现在状态的转移概率 * 上一步的概率\n", " (prob, state) = max(\n", " [(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])\n", " V[t][y] = prob\n", " # 记录每个 state 的路径\n", " newpath[y] = path[state] + [y]\n", " path = newpath\n", " \n", " # 最后一个状态只能是 E 或 S\n", " print(path)\n", " (prob, state) = max((V[len(sent) - 1][y], y) for y in 'ES')\n", " return (prob, path[state])" ] }, { "cell_type": "code", "execution_count": 320, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'B': ['S', 'B', 'E', 'B', 'E', 'S', 'S', 'B', 'M', 'E', 'S', 'S', 'B', 'E', 'S', 'S', 'S', 'S', 'S', 'B'], 'M': ['S', 'B', 'E', 'B', 'E', 'S', 'S', 'B', 'M', 'E', 'S', 'S', 'B', 'E', 'S', 'S', 'S', 'S', 'B', 'M'], 'S': ['S', 'B', 'E', 'B', 'E', 'S', 'S', 'B', 'M', 'E', 'S', 'S', 'B', 'E', 'S', 'S', 'S', 'S', 'S', 'S'], 'E': ['S', 'B', 'E', 'B', 'E', 'S', 'S', 'B', 'M', 'E', 'S', 'S', 'B', 'E', 'S', 'S', 'S', 'S', 'B', 'E']}\n" ] }, { "data": { "text/plain": [ "(-142.70062479570734,\n", " ['S',\n", " 'B',\n", " 'E',\n", " 'B',\n", " 'E',\n", " 'S',\n", " 'S',\n", " 'B',\n", " 'M',\n", " 'E',\n", " 'S',\n", " 'S',\n", " 'B',\n", " 'E',\n", " 'S',\n", " 'S',\n", " 'S',\n", " 'S',\n", " 'B',\n", " 'E'])" ] }, "execution_count": 320, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = \"我答辩结束的和尚未答辩的同学都请留在教室\"\n", "viterbi(text, \"BMSE\", jieba.finalseg.start_P, jieba.finalseg.trans_P, jieba.finalseg.emit_P)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 监督模型" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "165px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }