{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Text Preprocessing" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:57:47.080061Z", "start_time": "2019-07-03T22:57:46.025675Z" } }, "outputs": [], "source": [ "import collections\n", "import re\n", "import random\n", "from mxnet import np, npx\n", "npx.set_np()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Read \"Time Machine\" by H. G. Wells as our training dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:57:47.109854Z", "start_time": "2019-07-03T22:57:47.081916Z" } }, "outputs": [ { "data": { "text/plain": [ "'# sentences 3221'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def read_time_machine():\n", " with open('../data/timemachine.txt', 'r') as f:\n", " lines = f.readlines()\n", " return [re.sub('[^A-Za-z]+', ' ', line.strip().lower()) \n", " for line in lines]\n", "\n", "lines = read_time_machine()\n", "'# sentences %d' % len(lines)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Split each sentence into a list of tokens" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:57:47.119856Z", "start_time": "2019-07-03T22:57:47.111528Z" } }, "outputs": [ { "data": { "text/plain": [ "[['the', 'time', 'machine', 'by', 'h', 'g', 'wells', ''], ['']]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def tokenize(lines, token='word'):\n", " if token == 'word':\n", " return [line.split(' ') for line in lines]\n", " elif token == 'char':\n", " return [list(line) for line in lines]\n", " else:\n", " print('ERROR: unkown token type '+token)\n", "\n", "tokens = tokenize(lines)\n", "tokens[0:2]" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Build a vocabulary to map string tokens into numerical indices" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:57:47.130515Z", "start_time": "2019-07-03T22:57:47.121839Z" }, "attributes": { "classes": [], "id": "", "n": "9" } }, "outputs": [], "source": [ "class Vocab(object):\n", " def __init__(self, tokens, min_freq=0):\n", " # Sort according to frequencies\n", " counter = collections.Counter([tk for line in tokens for tk in line])\n", " self.token_freqs = sorted(counter.items(), key=lambda x: x[0])\n", " self.token_freqs.sort(key=lambda x: x[1], reverse=True)\n", " self.unk, uniq_tokens = 0, ['']\n", " uniq_tokens += [token for token, freq in self.token_freqs \n", " if freq >= min_freq and token not in uniq_tokens]\n", " self.idx_to_token, self.token_to_idx = [], dict()\n", " for token in uniq_tokens:\n", " self.idx_to_token.append(token)\n", " self.token_to_idx[token] = len(self.idx_to_token) - 1\n", " def __len__(self):\n", " return len(self.idx_to_token)\n", " def __getitem__(self, tokens):\n", " if not isinstance(tokens, (list, tuple)):\n", " return self.token_to_idx.get(tokens, self.unk)\n", " return [self.__getitem__(token) for token in tokens]\n", " def to_tokens(self, indices):\n", " if not isinstance(indices, (list, tuple)):\n", " return self.idx_to_token[indices]\n", " return [self.idx_to_token[index] for index in indices]" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Print the map between a few tokens to indices" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:57:47.147807Z", "start_time": "2019-07-03T22:57:47.131982Z" }, "attributes": { "classes": [], "id": "", "n": "23" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('', 0), ('the', 1), ('', 2), ('i', 3), ('and', 4), ('of', 5), ('a', 6), ('to', 7), ('was', 8), ('in', 9)]\n" ] } ], "source": [ "vocab = Vocab(tokens)\n", "print(list(vocab.token_to_idx.items())[0:10])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Now we can convert each sentence into a list of numerical indices" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:57:47.152854Z", "start_time": "2019-07-03T22:57:47.149161Z" }, "attributes": { "classes": [], "id": "", "n": "25" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "words: ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him', '']\n", "indices: [1, 20, 72, 17, 38, 12, 120, 43, 706, 7, 660, 5, 112, 2]\n", "words: ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']\n", "indices: [8, 1654, 6, 3864, 634, 7, 131, 26, 344, 127, 484, 4]\n" ] } ], "source": [ "for i in range(8, 10):\n", " print('words:', tokens[i]) \n", " print('indices:', vocab[tokens[i]])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Next load data into mini-batches" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:57:47.159683Z", "start_time": "2019-07-03T22:57:47.154168Z" } }, "outputs": [], "source": [ "def seq_data_iter_consecutive(corpus, batch_size, num_steps):\n", " # Offset for the iterator over the data for uniform starts\n", " offset = random.randint(0, num_steps)\n", " # Slice out data - 