In [None]:
from fastai.text import * 
from fastai import *

In [None]:
# Folder holding the WikiText-103 token files read below (wiki.train/valid/test.tokens).
path = Path('data/wikitext-103')

Check whether a line is the title of a Wikipedia article.

In [None]:
def istitle(line):
    "Return `True` if `line` has the form ' = Title = ' (no `=` inside the title), else `False`."
    heading = re.search(r'^ = [^=]* = $', line)
    return heading is not None

Replace `<unk>` with the `UNK` token.

In [None]:
def process_unk(s):
    "Return `UNK` when `s` is exactly '<unk>'; every other value is returned unchanged."
    if s != '<unk>':
        return s
    # `UNK` is not defined in this notebook; presumably it comes from the fastai star import.
    return UNK

Read the WT103 tokens file while separating each article from the next.

In [None]:
def read_file(filename):
    """Read a WikiText tokens file and split it into one string per article.

    An article ends at any line that is followed by a blank line (`' \\n'`) and
    then by a line that `istitle` accepts as an article title.

    Args:
        filename: path of the tokens file; it is opened as UTF-8 text.

    Returns:
        A 1-D numpy array of dtype `object` with one `str` per article, in file
        order. Joining the elements reproduces the file content. An empty file
        yields a single empty string.
    """
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    articles, current = [], []
    # The boundary test looks two lines ahead, so it is skipped for the last two lines.
    last_lookahead = len(lines) - 2
    for i, line in enumerate(lines):
        current.append(line)
        if i < last_lookahead and lines[i+1] == ' \n' and istitle(lines[i+2]):
            articles.append(''.join(current))
            current = []
    articles.append(''.join(current))
    # dtype=object stores references to the Python strings. Without it numpy builds a
    # fixed-width '<U' array that pads every article to the length of the longest one
    # (4 bytes per character), which multiplies the memory needed for a large corpus.
    return np.array(articles, dtype=object)

Read the token files (download from [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip)).

In [None]:
# Parse the three splits in the same order as before: train, then valid, then test.
train, valid, test = (
    read_file(path/'wiki.train.tokens'),
    read_file(path/'wiki.valid.tokens'),
    read_file(path/'wiki.test.tokens'),
)

In [None]:
# Number of articles in each split, shown as (train, valid, test).
tuple(map(len, (train, valid, test)))

We don't care about a separate test set, so let's join it with the training set. We put valid at the beginning because we'll use `valid_idx` to split.

In [None]:
# Validation articles go first so they can later be selected by their leading row indices.
all_texts = np.concatenate((valid, train, test))
df = pd.DataFrame({'texts': all_texts})
df.head()

In [None]:
# Same preview as the last line of the previous cell.
df.head()

Delete the per-split arrays to free RAM; otherwise there is not enough memory to run the next cell.

In [None]:
# The per-split arrays are no longer needed once `all_texts` / `df` exist.
del train, valid, test

In [None]:
# Rows 0-59 are passed as the validation indices; the markdown above says valid was placed
# first in `df` for this purpose. Assumes the validation file yielded 60 articles — TODO confirm.
data = (
    TextList.from_df(df, path, col='texts')
    .split_by_idx(range(60))
    .label_for_lm()
    .databunch()
)
data.save()

In [None]:
# Load a language-model databunch from `path` (presumably the one written by `data.save()`
# above) with bs=80, then display a sample batch.
# NOTE(review): confirm what max_len=15 controls in the installed fastai version.
data = TextLMDataBunch.load(path, bs=80, max_len=15)
data.show_batch()

In [None]:
# Learner created with qrnn=True, emb_sz=400, nh=1550, nl=4, drop_mult=0. and clip=0.12,
# then fitted with fit_one_cycle(10, 5e-3) and moms=(0.8, 0.7).
learn = language_model_learner(data, drop_mult=0., emb_sz=400, nh=1550, nl=4, qrnn=True, clip=0.12)
learn.fit_one_cycle(10,5e-3, moms=(0.8,0.7))

In [None]:
# Checkpoint the learner under the name 'qrnn_maj'.
learn.save('qrnn_maj')

In [None]:
# NOTE(review): this rebinds `learn` without qrnn=True / emb_sz / nh / nl, so the learner uses
# the library defaults for those arguments rather than the QRNN configuration used earlier —
# confirm the switch is intended.
learn = language_model_learner(data, drop_mult=0.1, clip=0.12)

In [None]:
# NOTE(review): no visible cell saves a checkpoint named 'lstm_maj' (only 'qrnn_maj' and
# 'qrnn_maj1' are written), so this load depends on a file produced elsewhere — confirm it
# exists before running on a fresh kernel.
learn.load('lstm_maj');

In [None]:
from fastai.callbacks.tracker import SaveModelCallback

In [None]:
# SaveModelCallback built with its default settings; it is handed to fit_one_cycle below.
cb = SaveModelCallback(learn)

In [None]:
# Continue training with fit_one_cycle(5, 1e-3), pct_start=0.1 and the callback `cb` attached.
learn.fit_one_cycle(5,1e-3, moms=(0.8,0.7), callbacks=[cb], pct_start=0.1)

In [None]:
# NOTE(review): the checkpoint is named 'qrnn_maj1', but the current `learn` was created
# without qrnn=True and had 'lstm_maj' loaded into it — confirm the checkpoint name.
learn.save('qrnn_maj1')

In [None]:
# Evaluate the learner on its own validation dataloader.
learn.validate(learn.data.valid_dl)