{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Language modelling on WikiText-103\n",
    "\n",
    "Split the WikiText-103 token files into articles, build a fastai language-model `DataBunch` from them and train language models on it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit imports for the names used directly below (they are otherwise only\n",
    "# reachable through the star imports).\n",
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from fastai.text import *\n",
    "from fastai import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = Path('data/wikitext-103')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check whether a line is the title of a Wikipedia article."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def istitle(line):\n",
    "    \"\"\"Return True if `line` has the form ' = Some title = ' with no '=' inside the title.\"\"\"\n",
    "    return re.search(r'^ = [^=]* = $', line) is not None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Replace `<unk>` by UNK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_unk(s):\n",
    "    \"\"\"Return `UNK` (from the fastai star import) for the raw '<unk>' token, any other token unchanged.\"\"\"\n",
    "    return UNK if s == '<unk>' else s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the WT103 tokens file while separating each article from the next."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_file(filename):\n",
    "    \"\"\"Read a WikiText tokens file and return its articles as a numpy array of strings.\n",
    "\n",
    "    An article ends when the next two lines are a blank line (a single space) and a\n",
    "    title line (see `istitle`), so every article after the first starts with that blank line.\n",
    "    An empty file yields an empty array.\n",
    "    \"\"\"\n",
    "    with open(filename, encoding='utf8') as f:\n",
    "        lines = f.readlines()\n",
    "\n",
    "    articles, current = [], []\n",
    "    last_lookahead = len(lines) - 2  # beyond this index there are no two lines left to inspect\n",
    "    for i, line in enumerate(lines):\n",
    "        current.append(line)\n",
    "        if i < last_lookahead and lines[i+1] == ' \\n' and istitle(lines[i+2]):\n",
    "            articles.append(''.join(current))\n",
    "            current = []\n",
    "    # Flush the last article; `current` is only empty here when the file itself was empty.\n",
    "    if current:\n",
    "        articles.append(''.join(current))\n",
    "    # dtype=object keeps references to the Python strings. The default fixed-width\n",
    "    # unicode dtype would pad every article to the length of the longest one.\n",
    "    return np.array(articles, dtype=object)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the token files (download from [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip))."
]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = read_file(path/'wiki.train.tokens')\n",
    "valid = read_file(path/'wiki.valid.tokens')\n",
    "test = read_file(path/'wiki.test.tokens')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(train), len(valid), len(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We don't care about a separate test set, so let's join it with the training set. We put valid at the beginning because we'll use `valid_idx` to split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order matters: the validation articles must be the first rows of `df`.\n",
    "all_texts = np.concatenate([valid, train, test])\n",
    "df = pd.DataFrame({'texts': all_texts})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Delete the intermediate arrays to release RAM; otherwise there is not enough memory left to run the next cell."
]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remember how many validation articles sit at the top of `df` before dropping the arrays.\n",
    "n_valid = len(valid)\n",
    "# `all_texts` is the concatenated copy of the three splits, so it is dropped as well.\n",
    "del train, valid, test, all_texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = (TextList.from_df(df, path, col='texts')\n",
    "        .split_by_idx(range(n_valid))  # the validation articles are the first rows of `df`\n",
    "        .label_for_lm()\n",
    "        .databunch())\n",
    "data.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = TextLMDataBunch.load(path, bs=80, max_len=15)\n",
    "data.show_batch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn = language_model_learner(data, drop_mult=0., emb_sz=400, nh=1550, nl=4, qrnn=True, clip=0.12)\n",
    "learn.fit_one_cycle(10, 5e-3, moms=(0.8,0.7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.save('qrnn_maj')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this learner is created without `qrnn=True`, yet its weights are saved\n",
    "# as 'qrnn_maj1' further down - confirm the intended architecture and checkpoint names.\n",
    "learn = language_model_learner(data, drop_mult=0.1, clip=0.12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): 'lstm_maj' is never saved in this notebook; presumably it comes from\n",
    "# an earlier run - confirm it exists before running this cell on a fresh setup.\n",
    "learn.load('lstm_maj');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.callbacks.tracker import SaveModelCallback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cb = SaveModelCallback(learn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.fit_one_cycle(5, 1e-3, moms=(0.8,0.7), callbacks=[cb], pct_start=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.save('qrnn_maj1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learn.validate(learn.data.valid_dl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
"source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }