{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.text import * \n",
    "from fastai import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = Path('data/wikitext-103')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check if a line is a title of a wikipedia article or not."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def istitle(line):\n",
    "    return len(re.findall(r'^ = [^=]* = $', line)) != 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Replace `<unk>` by UNK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_unk(s):\n",
    "    return UNK if s == '<unk>' else s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the WT103 tokens file while separating each article from the next."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_file(filename):\n",
    "    articles = []\n",
    "    with open(filename, encoding='utf8') as f:\n",
    "        lines = f.readlines()\n",
    "    current_article = ''\n",
    "    for i,line in enumerate(lines):\n",
    "        current_article += line\n",
    "        if i < len(lines)-2 and lines[i+1] == ' \\n' and istitle(lines[i+2]):\n",
    "            articles.append(current_article)\n",
    "            current_article = ''\n",
    "    articles.append(current_article)\n",
    "    return np.array(articles)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the token files (download from [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip))."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = read_file(path/'wiki.train.tokens')\n",
    "valid = read_file(path/'wiki.valid.tokens')\n",
    "test =  read_file(path/'wiki.test.tokens')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(train), len(valid), len(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We don't care about a separate test set, so let's join it with the training set. We put valid at the beginning because we'll use `valid_idx` to split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_texts = np.concatenate([valid, train,test])\n",
    "df = pd.DataFrame({'texts':all_texts})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To release this from the RAM otherwise I can't launch the next."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del train\n",
    "del valid\n",
    "del test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (TextList.from_df(df, path, col='texts')\n",
    "                .split_by_idx(range(0,60))\n",
    "                .label_for_lm()\n",
    "                .databunch())\n",
    "data.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = TextLMDataBunch.load(path, bs=80, max_len=15)\n",
    "data.show_batch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn = language_model_learner(data, drop_mult=0., emb_sz=400, nh=1550, nl=4, qrnn=True, clip=0.12)\n",
    "learn.fit_one_cycle(10,5e-3, moms=(0.8,0.7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.save('qrnn_maj')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn = language_model_learner(data, drop_mult=0.1, clip=0.12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.load('lstm_maj');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.callbacks.tracker import SaveModelCallback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cb = SaveModelCallback(learn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit_one_cycle(5,1e-3, moms=(0.8,0.7), callbacks=[cb], pct_start=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.save('qrnn_maj1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.validate(learn.data.valid_dl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}