{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Text Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.080061Z",
     "start_time": "2019-07-03T22:57:46.025675Z"
    }
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import re\n",
    "import random\n",
    "from mxnet import np, npx\n",
    "npx.set_np()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Read \"Time Machine\" by H. G. Wells as our training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.109854Z",
     "start_time": "2019-07-03T22:57:47.081916Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'# sentences 3221'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def read_time_machine():\n",
    "    with open('../data/timemachine.txt', 'r') as f:\n",
    "        lines = f.readlines()\n",
    "    return [re.sub('[^A-Za-z]+', ' ', line.strip().lower()) \n",
    "            for line in lines]\n",
    "\n",
    "lines = read_time_machine()\n",
    "'# sentences %d' % len(lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Split each sentence into a list of tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.119856Z",
     "start_time": "2019-07-03T22:57:47.111528Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['the', 'time', 'machine', 'by', 'h', 'g', 'wells', ''], ['']]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def tokenize(lines, token='word'):\n",
    "    if token == 'word':\n",
    "        return [line.split(' ') for line in lines]\n",
    "    elif token == 'char':\n",
    "        return [list(line) for line in lines]\n",
    "    else:\n",
    "        print('ERROR: unkown token type '+token)\n",
    "\n",
    "tokens = tokenize(lines)\n",
    "tokens[0:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Build a vocabulary to map string tokens into numerical indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.130515Z",
     "start_time": "2019-07-03T22:57:47.121839Z"
    },
    "attributes": {
     "classes": [],
     "id": "",
     "n": "9"
    }
   },
   "outputs": [],
   "source": [
    "class Vocab(object):\n",
    "    def __init__(self, tokens, min_freq=0):\n",
    "        # Sort according to frequencies\n",
    "        counter = collections.Counter([tk for line in tokens for tk in line])\n",
    "        self.token_freqs = sorted(counter.items(), key=lambda x: x[0])\n",
    "        self.token_freqs.sort(key=lambda x: x[1], reverse=True)\n",
    "        self.unk, uniq_tokens = 0, ['<unk>']\n",
    "        uniq_tokens +=  [token for token, freq in self.token_freqs \n",
    "                         if freq >= min_freq and token not in uniq_tokens]\n",
    "        self.idx_to_token, self.token_to_idx = [], dict()\n",
    "        for token in uniq_tokens:\n",
    "            self.idx_to_token.append(token)\n",
    "            self.token_to_idx[token] = len(self.idx_to_token) - 1\n",
    "    def __len__(self):\n",
    "        return len(self.idx_to_token)\n",
    "    def __getitem__(self, tokens):\n",
    "        if not isinstance(tokens, (list, tuple)):\n",
    "            return self.token_to_idx.get(tokens, self.unk)\n",
    "        return [self.__getitem__(token) for token in tokens]\n",
    "    def to_tokens(self, indices):\n",
    "        if not isinstance(indices, (list, tuple)):\n",
    "            return self.idx_to_token[indices]\n",
    "        return [self.idx_to_token[index] for index in indices]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Print the map between a few tokens to indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.147807Z",
     "start_time": "2019-07-03T22:57:47.131982Z"
    },
    "attributes": {
     "classes": [],
     "id": "",
     "n": "23"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('<unk>', 0), ('the', 1), ('', 2), ('i', 3), ('and', 4), ('of', 5), ('a', 6), ('to', 7), ('was', 8), ('in', 9)]\n"
     ]
    }
   ],
   "source": [
    "vocab = Vocab(tokens)\n",
    "print(list(vocab.token_to_idx.items())[0:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Now we can convert each sentence into a list of numerical indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.152854Z",
     "start_time": "2019-07-03T22:57:47.149161Z"
    },
    "attributes": {
     "classes": [],
     "id": "",
     "n": "25"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "words: ['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him', '']\n",
      "indices: [1, 20, 72, 17, 38, 12, 120, 43, 706, 7, 660, 5, 112, 2]\n",
      "words: ['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']\n",
      "indices: [8, 1654, 6, 3864, 634, 7, 131, 26, 344, 127, 484, 4]\n"
     ]
    }
   ],
   "source": [
    "for i in range(8, 10):\n",
    "    print('words:', tokens[i]) \n",
    "    print('indices:', vocab[tokens[i]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Next load data into mini-batches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.159683Z",
     "start_time": "2019-07-03T22:57:47.154168Z"
    }
   },
   "outputs": [],
   "source": [
    "def seq_data_iter_consecutive(corpus, batch_size, num_steps):\n",
    "    # Offset for the iterator over the data for uniform starts\n",
    "    offset = random.randint(0, num_steps)\n",
    "    # Slice out data - ignore num_steps and just wrap around\n",
    "    num_indices = ((len(corpus) - offset - 1) // batch_size) * batch_size\n",
    "    Xs = np.array(corpus[offset:offset+num_indices])\n",
    "    Ys = np.array(corpus[offset+1:offset+1+num_indices])\n",
    "    Xs, Ys = Xs.reshape((batch_size, -1)), Ys.reshape((batch_size, -1))\n",
    "    num_batches = Xs.shape[1] // num_steps\n",
    "    for i in range(0, num_batches * num_steps, num_steps):\n",
    "        X = Xs[:,i:(i+num_steps)]\n",
    "        Y = Ys[:,i:(i+num_steps)]\n",
    "        yield X, Y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "Test on a toy example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-03T22:57:47.176345Z",
     "start_time": "2019-07-03T22:57:47.161464Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X =\n",
      "[[ 6.  7.  8.  9. 10. 11.]\n",
      " [17. 18. 19. 20. 21. 22.]]\n",
      "Y =\n",
      "[[ 7.  8.  9. 10. 11. 12.]\n",
      " [18. 19. 20. 21. 22. 23.]]\n"
     ]
    }
   ],
   "source": [
    "my_seq = list(range(30))\n",
    "for X, Y in seq_data_iter_consecutive(my_seq, batch_size=2, num_steps=6):\n",
    "    print('X =\\n%s\\nY =\\n%s' %(X, Y))"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}