{ "cells": [ { "cell_type": "markdown", "id": "719e3403", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "# Converting Raw Text into Sequence Data\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "117b083a", "metadata": { "attributes": { "classes": [], "id": "", "n": "3" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:23.122018Z", "iopub.status.busy": "2023-08-18T19:29:23.121504Z", "iopub.status.idle": "2023-08-18T19:29:25.901709Z", "shell.execute_reply": "2023-08-18T19:29:25.900650Z" }, "origin_pos": 3, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "import collections\n", "import random\n", "import re\n", "import torch\n", "from d2l import torch as d2l" ] }, { "cell_type": "markdown", "id": "87cfae33", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "Reads the raw text into a string" ] }, { "cell_type": "code", "execution_count": 2, "id": "dd560200", "metadata": { "attributes": { "classes": [], "id": "", "n": "5" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:25.906885Z", "iopub.status.busy": "2023-08-18T19:29:25.905685Z", "iopub.status.idle": "2023-08-18T19:29:25.919860Z", "shell.execute_reply": "2023-08-18T19:29:25.918500Z" }, "origin_pos": 7, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "'The Time Machine, by H. G. Wells [1898]\\n\\n\\n\\n\\nI\\n\\n\\nThe Time Tra'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "class TimeMachine(d2l.DataModule): \n", " \"\"\"The Time Machine dataset.\"\"\"\n", " def _download(self):\n", " fname = d2l.download(d2l.DATA_URL + 'timemachine.txt', self.root,\n", " '090b5e7e70c295757f55df93cb0a180b9691891a')\n", " with open(fname) as f:\n", " return f.read()\n", "\n", "data = TimeMachine()\n", "raw_text = data._download()\n", "raw_text[:60]" ] }, { "cell_type": "code", "execution_count": 3, "id": "c4d26c1b", "metadata": { "attributes": { "classes": [], "id": "", "n": "6" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:25.925597Z", "iopub.status.busy": "2023-08-18T19:29:25.924754Z", "iopub.status.idle": "2023-08-18T19:29:25.959402Z", "shell.execute_reply": "2023-08-18T19:29:25.958183Z" }, "origin_pos": 9, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "'the time machine by h g wells i the time traveller for so it'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@d2l.add_to_class(TimeMachine) \n", "def _preprocess(self, text):\n", " return re.sub('[^A-Za-z]+', ' ', text).lower()\n", "\n", "text = data._preprocess(raw_text)\n", "text[:60]" ] }, { "cell_type": "code", "execution_count": 4, "id": "dc35eee3", "metadata": { "attributes": { "classes": [], "id": "", "n": "7" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:25.963957Z", "iopub.status.busy": "2023-08-18T19:29:25.962999Z", "iopub.status.idle": "2023-08-18T19:29:25.975230Z", "shell.execute_reply": "2023-08-18T19:29:25.974102Z" }, "origin_pos": 11, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "'t,h,e, ,t,i,m,e, ,m,a,c,h,i,n,e, ,b,y, ,h, ,g, ,w,e,l,l,s, '" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@d2l.add_to_class(TimeMachine) \n", "def _tokenize(self, text):\n", " return list(text)\n", "\n", "tokens = data._tokenize(text)\n", "','.join(tokens[:30])" ] }, { "cell_type": "markdown", "id": "f24ef727", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Next, we introduce a class\n", "for constructing *vocabularies*,\n", "i.e., objects that associate\n", "each distinct token value\n", "with a unique index" ] }, { "cell_type": "code", "execution_count": 5, "id": "820d7cff", "metadata": { "attributes": { "classes": [], "id": "", "n": "8" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:25.979690Z", "iopub.status.busy": "2023-08-18T19:29:25.978764Z", "iopub.status.idle": "2023-08-18T19:29:25.995117Z", "shell.execute_reply": "2023-08-18T19:29:25.994009Z" }, "origin_pos": 13, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "class Vocab: \n", " \"\"\"Vocabulary for text.\"\"\"\n", " def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):\n", " if tokens and isinstance(tokens[0], list):\n", " tokens = [token for line in tokens for token in line]\n", " counter = collections.Counter(tokens)\n", " self.token_freqs = sorted(counter.items(), key=lambda x: x[1],\n", " reverse=True)\n", " self.idx_to_token = list(sorted(set([''] + reserved_tokens + [\n", " token for token, freq in self.token_freqs if freq >= min_freq])))\n", " self.token_to_idx = {token: idx\n", " for idx, token in enumerate(self.idx_to_token)}\n", "\n", " def __len__(self):\n", " return len(self.idx_to_token)\n", "\n", " def __getitem__(self, tokens):\n", " if not isinstance(tokens, (list, tuple)):\n", " return self.token_to_idx.get(tokens, self.unk)\n", " return [self.__getitem__(token) for token in tokens]\n", "\n", " def to_tokens(self, indices):\n", " if hasattr(indices, '__len__') and len(indices) > 1:\n", " return [self.idx_to_token[int(index)] for index in indices]\n", " return self.idx_to_token[indices]\n", "\n", " @property\n", " def unk(self):\n", " return self.token_to_idx['']" ] }, { "cell_type": "markdown", "id": "483f34b2", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Construct a vocabulary" ] }, { "cell_type": "code", "execution_count": 6, "id": "da288532", "metadata": { "attributes": { "classes": [], "id": "", "n": "9" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:25.999437Z", "iopub.status.busy": "2023-08-18T19:29:25.998566Z", "iopub.status.idle": "2023-08-18T19:29:26.019939Z", "shell.execute_reply": "2023-08-18T19:29:26.019020Z" }, "origin_pos": 15, "tab": [ "pytorch" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "indices: [21, 9, 6, 0, 21, 10, 14, 6, 0, 14]\n", "words: ['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm']\n" ] } ], "source": [ "vocab = Vocab(tokens)\n", "indices = vocab[tokens[:10]]\n", "print('indices:', indices)\n", "print('words:', vocab.to_tokens(indices))" ] }, { "cell_type": "markdown", "id": "7d6e6f00", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Package everything into the following\n", "`build` method of the `TimeMachine` class" ] }, { "cell_type": "code", "execution_count": 7, "id": "b19fc5b8", "metadata": { "attributes": { "classes": [], "id": "", "n": "10" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:26.023503Z", "iopub.status.busy": "2023-08-18T19:29:26.022928Z", "iopub.status.idle": "2023-08-18T19:29:26.127518Z", "shell.execute_reply": "2023-08-18T19:29:26.126623Z" }, "origin_pos": 17, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "(173428, 28)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "@d2l.add_to_class(TimeMachine) \n", "def build(self, raw_text, vocab=None):\n", " tokens = self._tokenize(self._preprocess(raw_text))\n", " if vocab is None: vocab = Vocab(tokens)\n", " corpus = [vocab[token] for token in tokens]\n", " return corpus, vocab\n", "\n", "corpus, vocab = data.build(raw_text)\n", "len(corpus), len(vocab)" ] }, { "cell_type": "code", "execution_count": 8, "id": "b5985d60", "metadata": { "attributes": { "classes": [], "id": "", "n": "11" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:26.131046Z", "iopub.status.busy": "2023-08-18T19:29:26.130467Z", "iopub.status.idle": "2023-08-18T19:29:26.147044Z", "shell.execute_reply": "2023-08-18T19:29:26.146169Z" }, "origin_pos": 19, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "[('the', 2261),\n", " ('i', 1267),\n", " ('and', 1245),\n", " ('of', 1155),\n", " ('a', 816),\n", " ('to', 695),\n", " ('was', 552),\n", " ('in', 541),\n", " ('that', 443),\n", " ('my', 440)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words = text.split()\n", "vocab = Vocab(words)\n", "vocab.token_freqs[:10]" ] }, { "cell_type": "markdown", "id": "abdc3328", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "The ten most frequent words\n", "*stop words*\n", "plot the figure of the word frequency" ] }, { "cell_type": "code", "execution_count": 9, "id": "1017fe5b", "metadata": { "attributes": { "classes": [], "id": "", "n": "12" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:26.150706Z", "iopub.status.busy": "2023-08-18T19:29:26.149958Z", "iopub.status.idle": "2023-08-18T19:29:27.117808Z", "shell.execute_reply": "2023-08-18T19:29:27.116986Z" }, "origin_pos": 21, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", " \n", " 2023-08-18T19:29:26.990809\n", " image/svg+xml\n", " \n", " \n", " Matplotlib v3.7.2, https://matplotlib.org/\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "freqs = [freq for token, freq in vocab.token_freqs]\n", "d2l.plot(freqs, xlabel='token: x', ylabel='frequency: n(x)',\n", " xscale='log', yscale='log')" ] }, { "cell_type": "markdown", "id": "22167230", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "What about the other word combinations, such as two consecutive words (bigrams), three consecutive words (trigrams)" ] }, { "cell_type": "code", "execution_count": 10, "id": "eb992e88", "metadata": { "attributes": { "classes": [], "id": "", "n": "13" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:27.121634Z", "iopub.status.busy": "2023-08-18T19:29:27.121053Z", "iopub.status.idle": "2023-08-18T19:29:27.157009Z", "shell.execute_reply": "2023-08-18T19:29:27.156222Z" }, "origin_pos": 23, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "[('of--the', 309),\n", " ('in--the', 169),\n", " ('i--had', 130),\n", " ('i--was', 112),\n", " ('and--the', 109),\n", " ('the--time', 102),\n", " ('it--was', 99),\n", " ('to--the', 85),\n", " ('as--i', 78),\n", " ('of--a', 73)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bigram_tokens = ['--'.join(pair) for pair in zip(words[:-1], words[1:])]\n", "bigram_vocab = Vocab(bigram_tokens)\n", "bigram_vocab.token_freqs[:10]" ] }, { "cell_type": "code", "execution_count": 11, "id": "8a284366", "metadata": { "attributes": { "classes": [], "id": "", "n": "14" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:27.160412Z", "iopub.status.busy": "2023-08-18T19:29:27.159846Z", "iopub.status.idle": "2023-08-18T19:29:27.203866Z", "shell.execute_reply": "2023-08-18T19:29:27.203004Z" }, "origin_pos": 25, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "[('the--time--traveller', 59),\n", " ('the--time--machine', 30),\n", " ('the--medical--man', 24),\n", " ('it--seemed--to', 16),\n", " ('it--was--a', 15),\n", " ('here--and--there', 15),\n", " ('seemed--to--me', 14),\n", " ('i--did--not', 14),\n", " ('i--saw--the', 13),\n", " ('i--began--to', 13)]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trigram_tokens = ['--'.join(triple) for triple in zip(\n", " words[:-2], words[1:-1], words[2:])]\n", "trigram_vocab = Vocab(trigram_tokens)\n", "trigram_vocab.token_freqs[:10]" ] }, { "cell_type": "markdown", "id": "7587b155", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Visualize the token frequency" ] }, { "cell_type": "code", "execution_count": 12, "id": "ad96dfc1", "metadata": { "attributes": { "classes": [], "id": "", "n": "15" }, "execution": { "iopub.execute_input": "2023-08-18T19:29:27.207781Z", "iopub.status.busy": "2023-08-18T19:29:27.206873Z", "iopub.status.idle": "2023-08-18T19:29:28.094411Z", "shell.execute_reply": "2023-08-18T19:29:28.093574Z" }, "origin_pos": 27, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", " \n", " 2023-08-18T19:29:27.923417\n", " image/svg+xml\n", " \n", " \n", " Matplotlib v3.7.2, https://matplotlib.org/\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bigram_freqs = [freq for token, freq in bigram_vocab.token_freqs]\n", "trigram_freqs = [freq for token, freq in trigram_vocab.token_freqs]\n", "d2l.plot([freqs, bigram_freqs, trigram_freqs], xlabel='token: x',\n", " ylabel='frequency: n(x)', xscale='log', yscale='log',\n", " legend=['unigram', 'bigram', 'trigram'])" ] } ], "metadata": { "celltoolbar": "Slideshow", "language_info": { "name": "python" }, "required_libs": [], "rise": { "autolaunch": true, "enable_chalkboard": true, "overlay": "
", "scroll": true } }, "nbformat": 4, "nbformat_minor": 5 }