{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "from utils import *\n", "from IPython.display import display,HTML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# NLP deep dive: RNNs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Language model fine tuning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing text with fastai" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Putting our texts into batches for a language model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tokenization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tokenization with fastai" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fastai2.text.all import *\n", "path = untar_data(URLs.IMDB)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "files = get_text_files(path, folders = ['train', 'test', 'unsup'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'This movie, which I just discovered at the video store, has apparently sit '" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "txt = files[0].open().read(); txt[:75]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(#201) ['This','movie',',','which','I','just','discovered','at','the','video','store',',','has','apparently','sit','around','for','a','couple','of','years','without','a','distributor','.','It',\"'s\",'easy','to','see'...]\n" ] } ], "source": [ "spacy = SpacyTokenizer()\n", "toks = first(spacy([txt]))\n", "print(coll_repr(toks, 30))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first(spacy(['The U.S. 
dollar $1 is $1.00.']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(#228) ['xxbos','xxmaj','this','movie',',','which','i','just','discovered','at','the','video','store',',','has','apparently','sit','around','for','a','couple','of','years','without','a','distributor','.','xxmaj','it',\"'s\",'easy'...]\n" ] } ], "source": [ "tkn = Tokenizer(spacy)\n", "print(coll_repr(tkn(txt), 31))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<function fastai2.text.core.fix_html(x)>,\n", " <function fastai2.text.core.replace_rep(t)>,\n", " <function fastai2.text.core.replace_wrep(t)>,\n", " <function fastai2.text.core.spec_add_spaces(t)>,\n", " <function fastai2.text.core.rm_useless_spaces(t)>,\n", " <function fastai2.text.core.replace_all_caps(t)>,\n", " <function fastai2.text.core.replace_maj(t)>,\n", " <function fastai2.text.core.lowercase(t, add_bos=True, add_eos=False)>]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "defaults.text_proc_rules" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup','index'...]\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coll_repr(tkn('© Fast.ai www.fast.ai/INDEX'), 31)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Subword tokenization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "txts = [o.open().read() for o in files[:2000]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sp = SentencePieceTokenizer(vocab_sz=1000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sp.setup(txts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'▁This ▁movie , ▁which ▁I ▁just ▁dis c over ed ▁at ▁the ▁video ▁st or e , ▁has ▁a p par ent ly ▁s it ▁around ▁for ▁a ▁couple ▁of ▁years ▁without ▁a ▁dis t ri but or . ▁It'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "' '.join(first(sp([txt]))[:40])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Numericalization with fastai" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([ 2, 19, 87, 236, 100, 910, 18, 170, 20, 91])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfm = Tokenizer.from_folder(path)\n", "files = get_text_files(path, folders = ['train', 'test', 'unsup'])\n", "tls = TfmdLists(files, [tfm, Numericalize])\n", "tls[0][:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['xxmaj', 'the', '.', ',', 'and', 'a', 'of']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tls.vocab[8:15]" ] },
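{ "cell_type": "markdown", "metadata": {}, "source": [ "The ids above can be mapped back to tokens by indexing into `tls.vocab`, since `Numericalize` is just a lookup table in each direction. A minimal sanity check, using only the `tls` object built above:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Decode the first ten ids by hand to recover the tokens they stand for\n", "' '.join(tls.vocab[int(i)] for i in tls[0][:10])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "stream = \"In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. 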
By doing this, we'll have another example of the PreProcessor used in the data block API.\\nThen we will study how we build a language model and train it for a while.\"\n", "tokens = tfm(stream)\n", "bs,seq_len = 6,15\n", "d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hide_input": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
xxbosxxmajinthischapter
moviereviewswestudiedin
firstwewilllookat
howtocustomizeit.
ofthepreprocessorusedin
willstudyhowwebuild
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bs,seq_len = 6,5\n", "d_tokens = np.array([tokens[i*15:i*15+seq_len] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hide_input": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
,wewillgoback
chapter1anddigdeeper
theprocessingstepsnecessaryto
xxmajbydoingthis,
thedatablockxxupapi
alanguagemodelandtrain
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bs,seq_len = 6,5\n", "d_tokens = np.array([tokens[i*15+seq_len:i*15+2*seq_len] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hide_input": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
overtheexampleofclassifying
underthesurface.xxmaj
converttextintonumbersand
we'llhaveanotherexample
.\\nxxmajthenwe
itforawhile.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "bs,seq_len = 6,5\n", "d_tokens = np.array([tokens[i*15+10:i*15+15] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dl = LMDataLoader(tls)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tfms = [Tokenizer.from_folder(path), Numericalize]\n", "files = get_text_files(path, folders = ['train', 'test', 'unsup'])\n", "splits = RandomSplitter(valid_pct=0.1, seed=42)(files)\n", "tls = TfmdLists(files, tfms, splits=splits)\n", "dls = tls.dataloaders(dl_type=LMDataLoader)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Putting our texts into batch for a classifier" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tfms = [[Tokenizer.from_folder(path), Numericalize], [parent_label, Categorize]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "files = get_text_files(path, folders = ['train', 'test'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "splits = GrandparentSplitter(valid_name='test')(files)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dsets = Datasets(files, tfms, splits=splits)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dls = dsets.dataloaders(dl_type=SortedDL, before_batch=pad_input_chunk)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A layered API" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dls = TextDataLoaders.from_folder(path, valid='test')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "imdb = DataBlock(blocks=(TextBlock.from_folder(path),CategoryBlock),\n", " get_y = parent_label,\n", " get_items=partial(get_text_files, folders=['train', 'test']),\n", " splitter=GrandparentSplitter(valid_name='test'))\n", "dls = imdb.dataloaders(path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tfms = [[Tokenizer.from_folder(path), Numericalize], [parent_label, Categorize]]\n", "files = get_text_files(path, folders = ['train', 'test'])\n", "splits = GrandparentSplitter(valid_name='test')(files)\n", "dsets = Datasets(files, tfms, splits=splits)\n", "dls = dsets.dataloaders(dl_type=SortedDL, before_batch=pad_input_chunk)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training a text classifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fine tuning the language model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "imdb_lm = DataBlock(blocks=(TextBlock.from_folder(path, is_lm=True),),\n", " get_items=partial(get_text_files, folders=['train', 'test', 'unsup']),\n", " splitter=RandomSplitter(0.1))\n", "\n", "dls_lm = imdb_lm.dataloaders(path, path=path, bs=128, seq_len=80)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
texttext_
0xxbos xxmaj keep away from this one . xxmaj the worst thing is the appalling story . xxmaj there seems to be an intent to convey some subtle spiritual / love / friendship message but it is so pathetically devoid of any substance you ca n't help but cringe . xxmaj in addition , the majority of screen time is a far below standard story of thieves , criminals and our xxunk ) dealing with some alien time travel artifactsxxmaj keep away from this one . xxmaj the worst thing is the appalling story . xxmaj there seems to be an intent to convey some subtle spiritual / love / friendship message but it is so pathetically devoid of any substance you ca n't help but cringe . xxmaj in addition , the majority of screen time is a far below standard story of thieves , criminals and our xxunk ) dealing with some alien time travel artifacts .
1- plots . \\n\\n xxmaj lacks the ambition to be a great film , but remains one of the best of its kind and watchable and re - watchable for its comedic value alone . xxmaj deserves more attention than it seems to have received and well worth the cost of the xxup dvd or video cassette . xxbos xxmaj christy is one of my favorite books , and i absolutely loved xxmaj kellie xxmaj martin and xxmaj tyne xxmajplots . \\n\\n xxmaj lacks the ambition to be a great film , but remains one of the best of its kind and watchable and re - watchable for its comedic value alone . xxmaj deserves more attention than it seems to have received and well worth the cost of the xxup dvd or video cassette . xxbos xxmaj christy is one of my favorite books , and i absolutely loved xxmaj kellie xxmaj martin and xxmaj tyne xxmaj daly
2in this movie began acting careers as children . i recommend this movie as well as others like xxmaj sexo xxmaj xxunk y xxmaj xxunk , xxmaj amores xxmaj perros and xxmaj ladies xxmaj night although this last one is more like a \" chick flick \" because there are male strippers , a bachelorette party , many guys and lots of fun ! xxmaj the others are great though ! \\n\\n xxup nona xxbos xxmaj the lurid title maythis movie began acting careers as children . i recommend this movie as well as others like xxmaj sexo xxmaj xxunk y xxmaj xxunk , xxmaj amores xxmaj perros and xxmaj ladies xxmaj night although this last one is more like a \" chick flick \" because there are male strippers , a bachelorette party , many guys and lots of fun ! xxmaj the others are great though ! \\n\\n xxup nona xxbos xxmaj the lurid title may suggest
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dls_lm.show_batch(max_n=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn = language_model_learner(\n", " dls_lm, AWD_LSTM, drop_mult=0.3, \n", " metrics=[accuracy, Perplexity()]).to_fp16()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracyperplexitytime
04.1200483.9127880.29956550.03824611:39
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "learn.fit_one_cycle(1, 2e-2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Saving and loading models" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.save('1epoch')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn = learn.load('1epoch')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracyperplexitytime
03.8934863.7728200.31710443.50254812:37
13.8204793.7171970.32379041.14888012:30
23.7356223.6597600.33032138.85199712:09
33.6770863.6247940.33396037.51698712:12
43.6366463.6013000.33701736.64585912:05
53.5536363.5842410.33935536.02600112:04
63.5076343.5718920.34135335.58386212:08
73.4441013.5659880.34219435.37437112:08
83.3985973.5662830.34264735.38481512:11
93.3755633.5681660.34252835.45150012:05
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "learn.unfreeze()\n", "learn.fit_one_cycle(10, 2e-3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn.save_encoder('finetuned')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "TEXT = \"I liked this movie because\"\n", "N_WORDS = 40\n", "N_SENTENCES = 2\n", "preds = [learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "i liked this movie because of its story and characters . The story line was very strong , very good for a sci - fi film . The main character , Alucard , was very well developed and brought the whole story\n", "i liked this movie because i like the idea of the premise of the movie , the ( very ) convenient virus ( which , when you have to kill a few people , the \" evil \" machine has to be used to protect\n" ] } ], "source": [ "print(\"\\n\".join(preds))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fine tuning the classifier" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "imdb_clas = DataBlock(blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab),CategoryBlock),\n", " get_y = parent_label,\n", " get_items=partial(get_text_files, folders=['train', 'test']),\n", " splitter=GrandparentSplitter(valid_name='test'))\n", "dls_clas = imdb_clas.dataloaders(path, path=path, bs=128, seq_len=80)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcategory
0xxbos i rate this movie with 3 skulls , only coz the girls knew how to scream , this could 've been a better movie , if actors were better , the twins were xxup ok , i believed they were evil , but the eldest and youngest brother , they sucked really bad , it seemed like they were reading the scripts instead of acting them … . spoiler : if they 're vampire 's why do they freeze the blood ? vampires ca n't drink frozen blood , the sister in the movie says let 's drink her while she is alive … .but then when they 're moving to another house , they take on a cooler they 're frozen blood . end of spoiler \\n\\n it was a huge waste of time , and that made me mad coz i read all the reviews of howneg
1xxbos i have read all of the xxmaj love xxmaj come xxmaj softly books . xxmaj knowing full well that movies can not use all aspects of the book , but generally they at least have the main point of the book . i was highly disappointed in this movie . xxmaj the only thing that they have in this movie that is in the book is that xxmaj missy 's father comes to xxunk in the book both parents come ) . xxmaj that is all . xxmaj the story line was so twisted and far fetch and yes , sad , from the book , that i just could n't enjoy it . xxmaj even if i did n't read the book it was too sad . i do know that xxmaj pioneer life was rough , but the whole movie was a downer . xxmaj the ratingneg
2xxbos xxmaj this , for lack of a better term , movie is lousy . xxmaj where do i start … … \\n\\n xxmaj cinemaphotography - xxmaj this was , perhaps , the worst xxmaj i 've seen this year . xxmaj it looked like the camera was being tossed from camera man to camera man . xxmaj maybe they only had one camera . xxmaj it gives you the sensation of being a volleyball . \\n\\n xxmaj there are a bunch of scenes , haphazardly , thrown in with no continuity at all . xxmaj when they did the ' split screen ' , it was absurd . xxmaj everything was squished flat , it looked ridiculous . \\n\\n xxmaj the color tones were way off . xxmaj these people need to learn how to balance a camera . xxmaj this ' movie ' is poorly made , andneg
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dls_clas.show_batch(max_n=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, metrics=accuracy).to_fp16()\n", "learn = learn.load_encoder('finetuned')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracytime
00.3474270.1844800.92932000:33
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "learn.fit_one_cycle(1, 2e-2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracytime
00.2477630.1716830.93464000:37
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "learn.freeze_to(-2)\n", "learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracytime
00.1933770.1566960.94120000:45
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "learn.freeze_to(-3)\n", "learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracytime
00.1728880.1537700.94312001:01
10.1614920.1555670.94264000:57
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "learn.unfreeze()\n", "learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Disinformation and language models" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Questionnaire" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Further research" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Becoming a deep learning practitioner" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "jupytext": { "split_at_heading": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }