{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "70HL69ki43xO"
      },
      "source": [
        "Link to the notebook: https://tinyurl.com/y2zz2qu8\n",
        "**Copy the notebook to your GDrive to edit.**"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RF-oSG9m5Zr5"
      },
      "source": [
        "# Setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "vKjNPo57FTJ8"
      },
      "outputs": [],
      "source": [
        "# magic commands to make sure changes to external packages are automatically loaded and plots are displayed in the notebook\n",
        "%reload_ext autoreload\n",
        "%autoreload 2\n",
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "UkOnlv2EC0PR",
        "outputId": "750084a5-da1c-49b4-f2d4-0c8da3d3ea51"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Requirement already satisfied: datasets in /usr/local/lib/python3.7/dist-packages (2.5.0)\n",
            "Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.22.1)\n",
            "Requirement already satisfied: bpemb in /usr/local/lib/python3.7/dist-packages (0.3.3)\n",
            "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (2022.8.2)\n",
            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.12.0)\n",
            "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.13)\n",
            "Requirement already satisfied: xxhash in /usr/local/lib/python3.7/dist-packages (from datasets) (3.0.0)\n",
            "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets) (3.8.1)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n",
            "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.9.1)\n",
            "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.18.0)\n",
            "Requirement already satisfied: dill<0.3.6 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.5.1)\n",
            "Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n",
            "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n",
            "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n",
            "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.1)\n",
            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.2.0)\n",
            "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.8.1)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (4.1.1)\n",
            "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (4.0.2)\n",
            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (6.0.2)\n",
            "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (0.13.0)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (22.1.0)\n",
            "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.1.1)\n",
            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.3.1)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.8.0)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n",
            "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.9)\n",
            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2022.6.15)\n",
            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n",
            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.25.11)\n",
            "Requirement already satisfied: tokenizers!=0.11.3,<0.13,>=0.11.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.12.1)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2022.6.2)\n",
            "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (from bpemb) (0.1.97)\n",
            "Requirement already satisfied: gensim in /usr/local/lib/python3.7/dist-packages (from bpemb) (3.6.0)\n",
            "Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.7/dist-packages (from gensim->bpemb) (5.2.1)\n",
            "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim->bpemb) (1.15.0)\n",
            "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim->bpemb) (1.7.3)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.8.1)\n",
            "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n",
            "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.2.1)\n"
          ]
        }
      ],
      "source": [
        "!pip3 install datasets transformers bpemb"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "y7aI_mECQqhS"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import random\n",
        "import numpy as np\n",
        "from typing import List, Tuple\n",
        "from torch.utils.data import Dataset, DataLoader\n",
        "from torch import nn\n",
        "import matplotlib.pyplot as plt\n",
        "import re\n",
        "from tqdm.notebook import tqdm\n",
        "from torch.optim import Adam, RMSprop\n",
        "import nltk\n",
        "from datasets import load_dataset\n",
        "from bpemb import BPEmb"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "id": "JkmVtpi_MxEc"
      },
      "outputs": [],
      "source": [
        "def enforce_reproducibility(seed=42):\n",
        "    # Sets seed manually for both CPU and CUDA\n",
        "    torch.manual_seed(seed)\n",
        "    torch.cuda.manual_seed_all(seed)\n",
        "    # For atomic operations there is currently \n",
        "    # no simple way to enforce determinism, as\n",
        "    # the order of parallel operations is not known.\n",
        "    # CUDNN\n",
        "    torch.backends.cudnn.deterministic = True\n",
        "    torch.backends.cudnn.benchmark = False\n",
        "    # System based\n",
        "    random.seed(seed)\n",
        "    np.random.seed(seed)\n",
        "enforce_reproducibility()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-LxQjKyq5X32",
        "outputId": "4ac79ed9-1db7-4929-e747-4fa761a9284c"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "device(type='cuda')"
            ]
          },
          "execution_count": 5,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "device = torch.device(\"cpu\")\n",
        "if torch.cuda.is_available():\n",
        "  device = torch.device(\"cuda\")\n",
        "device"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "o9i3qSCg48BU"
      },
      "source": [
        "# Language Models"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Zwi0svDYkgCO"
      },
      "source": [
        "![LM example](https://lh5.googleusercontent.com/K6L6kme_QmdPic_mP5QusvB0sq_Cy8fxfzIJCqoATS_li6ct5VyZ6E7AZT_UINact-6tbUavVtLdvaAufzVMxureqfX9GNpXofEOwasd5giVOLLiPcX9nZ8YUq6mQ0gb6Y8_9lSPtQGwm0ZnEQ)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NnZAvZYG5ArA"
      },
      "source": [
        "* Trained to predict probabilities of the **next word, given some context**. \n",
        "* Architectures with **large numbers** of parameters trained on large corpora of text.\n",
        "* Can be used to generate text, to verify how likely is a particular sequence of text, e.g. if it sounds grammatical and complies to the rules of a language, or we can use the contextual representations of the words learned by the model. \n",
        "* Evaluated with **perplexity**, which is the inverse probability of the test set, normalized by the number of words:\n",
        "\n",
        "\\begin{equation}\n",
        "\\mathrm{PP}(W)=\\sqrt[N]{\\prod_{i=1}^{N} \\frac{1}{P\\left(w_{i} \\mid w_{1} \\ldots w_{i-1}\\right)}}\n",
        "\\end{equation}\n",
        "* Equivalent to the **exponential of the cross-entropy loss** [(detailed explanation)](https://towardsdatascience.com/perplexity-in-language-models-87a196019a94)\n",
        "* Minimizing perplexity is the same as maximising probability of the correct prediction. \n",
        "* Low perplexity is better!\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BIGas9GNJBM6"
      },
      "source": [
        "# RNNs"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_E_9DF6z5doT"
      },
      "source": [
        "## Recap"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mDrTKqi08j4m"
      },
      "source": [
        "![Karpathy blog rnn example](https://karpathy.github.io/assets/rnn/diags.jpeg)\n",
        "\n",
        "Source: https://karpathy.github.io/2015/05/21/rnn-effectiveness/\n",
        "\n",
        "* Widely used for working with sequence data. \n",
        "* Can be used for a variety of problems which work with sequences.\n",
        "* Consider both a given input and a **state**, which is updated at every step on the input sequence. \n",
        "* How this state is kept and the internal calculations of an RNN can differ.   \n",
        "  * Three main types of RNN cells: vanilla RNNs, GRU, and LSTM. \n",
        "  * GRUs and LSTMs differ from vanilla RNNs in that they use **gating mechanisms** to mitigate vanishing gradients, and are better at capturing long-term dependencies."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rPhxrOAOLu1l"
      },
      "source": [
        "## Data Preparation for RNNs\n",
        "*PyTorch-specific."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8WueZdv25lnn"
      },
      "source": [
        "### Packing"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7rOPCP4rEpX1",
        "outputId": "477557fe-bee8-45c3-9c00-4204cd8b5ee8"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:20: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "tensor([[ 4, 19, 13, 17,  3,  8, 18,  5, 16, 13,  2, 17, 15],\n",
              "        [10,  9, 11,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0],\n",
              "        [ 1,  9, 12, 14,  6,  7,  0,  0,  0,  0,  0,  0,  0]])"
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# We want to run LSTM on a batch with 3 sentences\n",
        "sents = ['The word of the Lord came to Zechariah son of Iddo the prophet.',  # len = 13\n",
        "        'fruit flies like a banana',      # len = 5\n",
        "        'Fruit flies live on a banana']    # len = 6\n",
        "\n",
        "# Step 1: Construct Vocabulary\n",
        "vocab = ['<pad>'] + sorted(set([token for sent in sents for token in sent.split()]))\n",
        "\n",
        "#Step 2: Load indexed data (list of instances, where each instance is list of character indices)\n",
        "vectorized_seqs = [[vocab.index(tok) for tok in sent.split()]for sent in sents]\n",
        "\n",
        "#Step 3: Make Model\n",
        "embed = torch.nn.Embedding(len(vocab), 4) # embedding_dim = 4\n",
        "lstm = torch.nn.LSTM(input_size=4, hidden_size=5, batch_first=True) # input_dim = 4, hidden_dim = 5\n",
        "\n",
        "#Step 4: Pad instances with 0s till max length sequence\n",
        "# get the length of each seq in your batch\n",
        "seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))\n",
        "\n",
        "seq_tensor = torch.tensor(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()\n",
        "\n",
        "for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):\n",
        "    seq_tensor[idx, :seqlen] = torch.LongTensor(seq)\n",
        "seq_tensor"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "AwacR6fdG5Rm",
        "outputId": "4802ce14-9e0a-4694-92f6-b93e6bbbb416"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "tensor([[ 4, 19, 13, 17,  3,  8, 18,  5, 16, 13,  2, 17, 15],\n",
              "        [ 1,  9, 12, 14,  6,  7,  0,  0,  0,  0,  0,  0,  0],\n",
              "        [10,  9, 11,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0]])"
            ]
          },
          "execution_count": 7,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Step 5: Sort instances by sequence length in descending order \n",
        "# this step is not compulsory, where the packing functions have a parameter enforce_sorted, which can be set to False\n",
        "seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)\n",
        "seq_tensor = seq_tensor[perm_idx]\n",
        "seq_tensor"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9SKbaeryHC9H",
        "outputId": "b185024a-8211-4847-f4ac-add945b9d088"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "PackedSequence(data=tensor([ 4,  1, 10, 19,  9,  9, 13, 12, 11, 17, 14,  6,  3,  6,  7,  8,  7, 18,\n",
              "         5, 16, 13,  2, 17, 15]), batch_sizes=tensor([3, 3, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1]), sorted_indices=None, unsorted_indices=None)"
            ]
          },
          "execution_count": 8,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Calling pack_padded_sequence with instances and sequence lengths\n",
        "# Packing is important for training RNN on text with variable length to save compute\n",
        "packed_input = torch.nn.utils.rnn.pack_padded_sequence(seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)\n",
        "# packed_input (PackedSequence is NamedTuple with 2 attributes: data and batch_sizes\n",
        "packed_input"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YORCY4nzHgYO",
        "outputId": "8d55e7dc-a8fc-4f86-d5e8-494ea9f8d8a2"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "tensor([[[ 1.6423, -0.1596, -0.4974,  0.4396],\n",
              "         [ 0.5750, -0.6417, -2.2064, -0.7508],\n",
              "         [-0.4880,  1.1914, -0.8140, -0.7360],\n",
              "         [ 0.3466, -0.1973, -1.0546,  1.2780],\n",
              "         [-0.7279, -0.5594, -0.7688,  0.7624],\n",
              "         [-1.3847, -0.8712, -0.2234,  1.7174],\n",
              "         [-0.1722,  0.5238,  0.0566,  0.4263],\n",
              "         [-0.7581,  1.0783,  0.8008,  1.6806],\n",
              "         [ 1.4451,  0.8564,  2.2181,  0.5232],\n",
              "         [-0.4880,  1.1914, -0.8140, -0.7360],\n",
              "         [-0.7521,  1.6487, -0.3925, -1.4036],\n",
              "         [ 0.3466, -0.1973, -1.0546,  1.2780],\n",
              "         [-0.0978,  1.8446, -1.1845,  1.3835]],\n",
              "\n",
              "        [[ 0.6784, -1.2345, -0.0431, -1.6047],\n",
              "         [ 0.3189, -0.4245,  0.3057, -0.7746],\n",
              "         [-0.9138, -0.6581,  0.0780,  0.5258],\n",
              "         [-1.4032,  0.0360, -0.0635,  0.6756],\n",
              "         [ 1.2791,  1.2964,  0.6105,  1.3347],\n",
              "         [-0.2316,  0.0418, -0.2516,  0.8599],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055]],\n",
              "\n",
              "        [[-1.5576,  0.9956, -0.8798, -0.6011],\n",
              "         [ 0.3189, -0.4245,  0.3057, -0.7746],\n",
              "         [-1.2742,  2.1228, -1.2347, -0.4879],\n",
              "         [ 1.2791,  1.2964,  0.6105,  1.3347],\n",
              "         [-0.2316,  0.0418, -0.2516,  0.8599],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055],\n",
              "         [ 1.9269,  1.4873,  0.9007, -2.1055]]], grad_fn=<EmbeddingBackward0>)"
            ]
          },
          "execution_count": 9,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Step 6: Let's now proceed with the network transformations and embed the instances\n",
        "embedded_seq_tensor = embed(seq_tensor)\n",
        "embedded_seq_tensor"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "eeVeo5UQHajy",
        "outputId": "c6a74980-53a5-41be-c7fa-eebd1209f694"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "PackedSequence(data=tensor([[ 1.6423, -0.1596, -0.4974,  0.4396],\n",
              "        [ 0.6784, -1.2345, -0.0431, -1.6047],\n",
              "        [-1.5576,  0.9956, -0.8798, -0.6011],\n",
              "        [ 0.5750, -0.6417, -2.2064, -0.7508],\n",
              "        [ 0.3189, -0.4245,  0.3057, -0.7746],\n",
              "        [ 0.3189, -0.4245,  0.3057, -0.7746],\n",
              "        [-0.4880,  1.1914, -0.8140, -0.7360],\n",
              "        [-0.9138, -0.6581,  0.0780,  0.5258],\n",
              "        [-1.2742,  2.1228, -1.2347, -0.4879],\n",
              "        [ 0.3466, -0.1973, -1.0546,  1.2780],\n",
              "        [-1.4032,  0.0360, -0.0635,  0.6756],\n",
              "        [ 1.2791,  1.2964,  0.6105,  1.3347],\n",
              "        [-0.7279, -0.5594, -0.7688,  0.7624],\n",
              "        [ 1.2791,  1.2964,  0.6105,  1.3347],\n",
              "        [-0.2316,  0.0418, -0.2516,  0.8599],\n",
              "        [-1.3847, -0.8712, -0.2234,  1.7174],\n",
              "        [-0.2316,  0.0418, -0.2516,  0.8599],\n",
              "        [-0.1722,  0.5238,  0.0566,  0.4263],\n",
              "        [-0.7581,  1.0783,  0.8008,  1.6806],\n",
              "        [ 1.4451,  0.8564,  2.2181,  0.5232],\n",
              "        [-0.4880,  1.1914, -0.8140, -0.7360],\n",
              "        [-0.7521,  1.6487, -0.3925, -1.4036],\n",
              "        [ 0.3466, -0.1973, -1.0546,  1.2780],\n",
              "        [-0.0978,  1.8446, -1.1845,  1.3835]],\n",
              "       grad_fn=<PackPaddedSequenceBackward0>), batch_sizes=tensor([3, 3, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1]), sorted_indices=None, unsorted_indices=None)"
            ]
          },
          "execution_count": 10,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Step 7: Call pack_padded_sequence with embeded instances and sequence lengths\n",
        "packed_input = torch.nn.utils.rnn.pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)\n",
        "# packed_input (PackedSequence is NamedTuple with 2 attributes: data and batch_sizes\n",
        "packed_input"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "id": "PQyp6kKgIXvT"
      },
      "outputs": [],
      "source": [
        "# Step 8: Forward with LSTM, get packed_output, hidden state and cell state\n",
        "packed_output, (ht, ct) = lstm(packed_input)\n",
        "\n",
        "# Step 9: Call unpack_padded_sequences if required / or just pick last hidden vector\n",
        "output, input_sizes = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kj2BQBKgIo3h"
      },
      "source": [
        "Summary of Shape Transformations\n",
        "\n",
        "(batch_size X max_seq_len X embedding_dim) --> Sort by seqlen ---> (batch_size X max_seq_len X embedding_dim)\n",
        "\n",
        "(batch_size X max_seq_len X embedding_dim) --->      Pack     ---> (batch_sum_seq_len X embedding_dim)\n",
        "\n",
        "(batch_sum_seq_len X embedding_dim)        --->      LSTM     ---> (batch_sum_seq_len X hidden_dim)\n",
        "\n",
        "(batch_sum_seq_len X hidden_dim)           --->    UnPack     ---> (batch_size X max_seq_len X hidden_dim)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wfu0XTsvI1-1"
      },
      "source": [
        "### Bucketing\n",
        "* Group instances of *similar* lenghts in the same batch which leads to minimal padding. \n",
        "* E.g. a batch with instances of lenght 5, 5, 6 will be padded to lenght 6, while a batch with instances of lenght 5, 5, 13 will be padded to lenght 13 and will also take more time for processing.\n",
        "* To still have some randomness in the batches, we usually have buckets with N instances with similar lenght and shuffle them into K batches.\n",
        "* Example implementation -- Bucket Iterator (https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Xy7v99c4QVbH"
      },
      "source": [
        "## Dataset Loading\n",
        "\n",
        "For training the Language Model, we'll be using the [WikiText-2 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)\n",
        "\n",
        "HuggingFace have a [repository](https://huggingface.co/datasets) of many datasets, where you can easily find datasets of interests (e.g. the TyDi QA dataset).\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 104,
          "referenced_widgets": [
            "7428f7aed4fe489da08f9e8f225ce45e",
            "4b72f20b29554a5385cfc56f6be67e60",
            "dd18c2e69c554cb9b3d8976d0939eff9",
            "e5557fe6de544eef8ca7285e21eedd57",
            "c3e73da0b0e940c6acb3e1f7144033ef",
            "bb6145b148494ba2b3a06793f12b2e95",
            "ecbd58ec665c457dbd6f8969e687f9ca",
            "4cd519d0e15148789c7cb728c049f494",
            "d70ffccbe196487b927ecfeab3862ff9",
            "841246c9f816433f836017c25cfd23cc",
            "e530dbc1f0f8495c8331471678c10785"
          ]
        },
        "id": "3GUAREbrCjQR",
        "outputId": "7eed9456-5217-4b5c-f717-efc9ff341835"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.builder:Found cached dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "7428f7aed4fe489da08f9e8f225ce45e",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/3 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "dict_keys(['test', 'train', 'validation'])"
            ]
          },
          "execution_count": 12,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')\n",
        "datasets.keys()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_Yi1VEBpDUDf",
        "outputId": "3a06cc4b-f182-494d-c001-b3005721a2e6"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "{'text': ' = Valkyria Chronicles III = \\n'}\n",
            "{'text': ''}\n",
            "{'text': ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the \" Nameless \" , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit \" Calamaty Raven \" . \\n'}\n"
          ]
        }
      ],
      "source": [
        "print(datasets['train'][1])\n",
        "print(datasets['train'][2])\n",
        "print(datasets['train'][3])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "id": "WGs3Z2qMHx0u"
      },
      "outputs": [],
      "source": [
        "# We'll use again the pretrained BP Embeddings and the corresponding tokenizer.\n",
        "\n",
        "bpemb_en = BPEmb(lang='en', dim=100, vs=25000)\n",
        "# Extract the embeddings and add a randomly initialized embedding for our extra [PAD] token\n",
        "pretrained_embeddings = np.concatenate([bpemb_en.emb.vectors, np.zeros(shape=(1,100))], axis=0)\n",
        "# Extract the vocab and add an extra [PAD] token\n",
        "vocabulary = bpemb_en.emb.index2word + ['[PAD]']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5lRVqf23fmzB",
        "outputId": "9792f300-2398-4ad6-c34d-dd32e07002d5"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "embeddings' shape: (25001, 100) vocabulary size: 25001\n"
          ]
        }
      ],
      "source": [
        "print(\"embeddings' shape:\", pretrained_embeddings.shape, \"vocabulary size:\", len(vocabulary))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GhI7MyBbIlSz",
        "outputId": "3f712c31-cbf5-4a8b-dca2-bd212ebfd066"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "      "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-8c6fa1ee7badaacd.arrow\n",
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-77acfeb86a33ecd2.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-54389ded37e8dd5e.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-5b81ec00cae1d713.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "     "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-eef702d036cb08e7.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-dd74ef948d72de48.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-7aa3254c800caf5a.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-996aeb3b127df775.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "     "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-18c523d5ea07181c.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-6aed561681a698aa.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-2dc2f04c3268970e.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-a143ecc30448f0c9.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "[1282, 20611, 467, 852, 2358, 2538, 121, 944, 118, 14849]\n",
            "     "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-0cebd0dd47e9dca3.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-ba2baed289148080.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-9436b3aebfce6e6d.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-e5908a6cfef7bf69.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "     "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-a8b410318c1a7367.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-cf6c4434cd4b23a9.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-fd1f4825f23219d7.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-4502a7f4eeb26f66.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "     "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-d6c4e34424567d33.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-eba367bb853c786b.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-6906813da7cf25e5.arrow\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            " "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-dd273094f4e07438.arrow\n"
          ]
        }
      ],
      "source": [
        "def tokenizer(text):\n",
        "  return {'input_ids': bpemb_en.encode_ids_with_eos(text)}\n",
        "\n",
        "def tokenize_function(examples):\n",
        "    return tokenizer(examples['text'])\n",
        "\n",
        "def group_texts(examples):\n",
        "    # Concatenate all texts.\n",
        "    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
        "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
        "    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
        "    # customize this part to your needs.\n",
        "    total_length = (total_length // block_size) * block_size\n",
        "    # Split by chunks of size block_size .\n",
        "    result = {\n",
        "        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
        "        for k, t in concatenated_examples.items()\n",
        "    }\n",
        "    return result\n",
        "\n",
        "block_size = 128\n",
        "\n",
        "tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=[\"text\"])\n",
        "print(tokenized_datasets['train'][3]['input_ids'][:10])\n",
        "\n",
        "lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4,)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "id": "LJfzyoT_FJ_V"
      },
      "outputs": [],
      "source": [
        "def collate_batch_bilstm(dataset) -> Tuple[torch.Tensor, torch.Tensor]:\n",
        "    \"\"\"\n",
        "    Combines multiple data samples into a single batch\n",
        "    :param input_ids: The token input ids\n",
        "    :return: A tuple of tensors (input_ids, targets)\n",
        "    \"\"\"\n",
        "    input_ids = [i['input_ids'] for i in dataset]\n",
        "\n",
        "    input_lengths, padded_input = [], []\n",
        "    for sentence in input_ids:\n",
        "      sentence = sentence[:seq_len]\n",
        "      input_lengths.append(len(sentence) - 1)\n",
        "      sentence = sentence + [0] * (seq_len - len(sentence))\n",
        "      padded_input.append(sentence)\n",
        "\n",
        "    input_data = torch.tensor(padded_input)\n",
        "\n",
        "    # we don't use the last position as there isn't anything left for generation\n",
        "    input_ids = input_data[:, :-1]\n",
        "\n",
        "    # the target at each step is to generate the next word from the sequence\n",
        "    # so we shift the token ids with 1 position\n",
        "    targets = input_data[:, 1:].reshape(-1)\n",
        "\n",
        "    return input_ids, torch.tensor(input_lengths), targets"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "id": "bWoG8AE1Pv6T"
      },
      "outputs": [],
      "source": [
        "test_dl = torch.utils.data.DataLoader(lm_datasets['test'], batch_size=32, collate_fn=collate_batch_bilstm)\n",
        "train_dl = torch.utils.data.DataLoader(lm_datasets['train'], batch_size=32, collate_fn=collate_batch_bilstm)\n",
        "valid_dl = torch.utils.data.DataLoader(lm_datasets['validation'], batch_size=32, collate_fn=collate_batch_bilstm)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "F0cjbDzfJ3pE"
      },
      "source": [
        "## Model Implementation\n",
        "\n",
        "Next we will create an LSTM model. We again extend the PyTorch class `torch.nn.Module` and implement the `__init__` function, and define how tensors are processed in the `__forward__` function."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "id": "qRuI--wZOsCY"
      },
      "outputs": [],
      "source": [
        "class LSTM_LM(nn.Module):\n",
        "    \"\"\"\n",
        "    LSTM Language Model\n",
        "    \"\"\"\n",
        "    def __init__(\n",
        "            self,            \n",
        "            pretrained_embeddings: torch.tensor,\n",
        "            lstm_dim: int,       \n",
        "            dropout_prob: float = 0.0,\n",
        "            lstm_layers: int = 1,\n",
        "    ):\n",
        "        \"\"\"\n",
        "        Initializer for LSTM Language Model\n",
        "        :param pretrained_embeddings: A tensor containing the pretrained BPE embeddings\n",
        "        :param lstm_dim: The dimensionality of the BiLSTM network\n",
        "        :param dropout_prob: Dropout probability\n",
        "        :param lstm_layers: The number of stacked LSTM layers\n",
        "        \"\"\"\n",
        "\n",
        "        # First thing is to call the superclass initializer\n",
        "        super(LSTM_LM, self).__init__()\n",
        "\n",
        "        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer\n",
        "        # The components are an embedding layer, an LSTM layer, a dropout layer, and a feed-forward output layer\n",
        "        self.vocab_size = pretrained_embeddings.shape[0]\n",
        "        self.model = nn.ModuleDict({\n",
        "            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1),\n",
        "            'lstm': nn.LSTM( \n",
        "                pretrained_embeddings.shape[1],\n",
        "                lstm_dim,\n",
        "                num_layers=lstm_layers,\n",
        "                batch_first=True,\n",
        "                dropout=dropout_prob),\n",
        "            'ff': nn.Linear(lstm_dim, pretrained_embeddings.shape[0]),\n",
        "            'drop': nn.Dropout(dropout_prob)\n",
        "        })\n",
        "\n",
        "        # Initialize the weights of the model\n",
        "        self._init_weights()\n",
        "\n",
        "    def _init_weights(self):\n",
        "        all_params = list(self.model['lstm'].named_parameters()) + \\\n",
        "                     list(self.model['ff'].named_parameters())\n",
        "        for n, p in all_params:\n",
        "            if 'weight' in n:\n",
        "                nn.init.xavier_normal_(p)\n",
        "            elif 'bias' in n:\n",
        "                nn.init.zeros_(p)\n",
        "\n",
        "    def forward(self, input_ids, input_lens, hidden_states):\n",
        "        \"\"\"\n",
        "        Defines how tensors flow through the model\n",
        "        :param input_ids: (b x sl) The IDs into the vocabulary of the input samples\n",
        "        :param input_lens: (b x 1) The length of each instance's text\n",
        "        :param hidden_states: (b x sl) x 2 Hidden states for the LSTM model\n",
        "        :return: (lstm output, updated hidden stated)\n",
        "        \"\"\"\n",
        "\n",
        "        # Get embeddings (b x sl x edim)\n",
        "        embeds = self.model['drop'](self.model['embeddings'](input_ids))\n",
        "\n",
        "        lstm_in = nn.utils.rnn.pack_padded_sequence(\n",
        "            embeds,\n",
        "            input_lens.to('cpu'),\n",
        "            batch_first=True,\n",
        "            enforce_sorted=False\n",
        "        )\n",
        "\n",
        "        # Pass the packed sequence through the BiLSTM\n",
        "        lstm_out, hidden = self.model['lstm'](lstm_in)\n",
        "        # Unpack the packed sequence --> (b x sl x 2*lstm_dim)\n",
        "        lstm_out, hidden_states = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)\n",
        "        lstm_out = self.model['drop'](lstm_out)\n",
        "        # generate the prediction of each word in the vocabulary being the next\n",
        "        lstm_out = self.model['ff'](lstm_out)\n",
        "        lstm_out = lstm_out.reshape(-1, self.vocab_size)\n",
        "\n",
        "        return lstm_out, hidden_states"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "h0Rc2oXI6E3_"
      },
      "source": [
        "## Utility Functions"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fDdHYcJ-QRn9"
      },
      "source": [
        "This is a utility function which will take a model and a validation dataloader and return the current perplexity of the model against that dataset. We can use this to know when to save the model and to perform early stopping if desired."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "metadata": {
        "id": "z2r3DzpSOq_l"
      },
      "outputs": [],
      "source": [
        "def evaluate(model: nn.Module, valid_dl: DataLoader):\n",
        "  \"\"\"\n",
        "  Evaluates the model on the given dataset\n",
        "  :param model: The model under evaluation\n",
        "  :param valid_dl: A `DataLoader` reading validation data\n",
        "  :return: The accuracy of the model on the dataset\n",
        "  \"\"\"\n",
        "  model.eval()\n",
        "  loss_all = []\n",
        "  states = (torch.zeros(lstm_layers, batch_size, lstm_dim).to(device),\n",
        "              torch.zeros(lstm_layers, batch_size, lstm_dim).to(device))\n",
        "  loss_fn = nn.CrossEntropyLoss()\n",
        "        \n",
        "  with torch.no_grad():\n",
        "    for batch in tqdm(valid_dl, desc='Evaluation'):\n",
        "      batch = tuple(t.to(device) for t in batch)\n",
        "      input_ids = batch[0]\n",
        "      input_lens = batch[1]\n",
        "      targets = batch[2]\n",
        "      states = detach(states)\n",
        "      logits, states = model(input_ids, input_lens, states)\n",
        "      loss = loss_fn(logits, targets.reshape(-1))\n",
        "\n",
        "      loss_all.append(loss.detach().cpu().numpy())\n",
        "\n",
        "  perplexity = np.exp(sum(loss_all) / (len(loss_all)))\n",
        "  return perplexity"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ykX39dqiQ3qI"
      },
      "source": [
        "Here we define the main training loop."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 21,
      "metadata": {
        "id": "7aKieRBZkghS"
      },
      "outputs": [],
      "source": [
        "# Truncated backpropagation\n",
        "def detach(states):\n",
        "    return [state.detach() for state in states]\n",
        "\n",
        "def train(\n",
        "    model: nn.Module, \n",
        "    train_dl: DataLoader, \n",
        "    valid_dl: DataLoader, \n",
        "    optimizer: torch.optim.Optimizer, \n",
        "    n_epochs: int, \n",
        "    device: torch.device\n",
        "):\n",
        "  \"\"\"\n",
        "  The main training loop which will optimize a given model on a given dataset\n",
        "  :param model: The model being optimized\n",
        "  :param train_dl: The training dataset\n",
        "  :param valid_dl: A validation dataset\n",
        "  :param optimizer: The optimizer used to update the model parameters\n",
        "  :param n_epochs: Number of epochs to train for\n",
        "  :param device: The device to train on\n",
        "  :return: (model, losses) The best model and the losses per iteration\n",
        "  \"\"\"\n",
        "\n",
        "  # Keep track of the loss and best accuracy\n",
        "  losses = []\n",
        "  best_perplexity = 300.0\n",
        "  # Set initial hidden and cell states\n",
        "  loss_fn = nn.CrossEntropyLoss()\n",
        "  # Iterate through epochs\n",
        "  for ep in range(n_epochs):\n",
        "    states = (torch.zeros(lstm_layers, batch_size, lstm_dim).to(device),\n",
        "              torch.zeros(lstm_layers, batch_size, lstm_dim).to(device))\n",
        " \n",
        "    loss_epoch = []\n",
        "\n",
        "    #Iterate through each batch in the dataloader\n",
        "    for batch in tqdm(train_dl):\n",
        "      # VERY IMPORTANT: Make sure the model is in training mode, which turns on \n",
        "      # things like dropout and layer normalization\n",
        "      model.train()\n",
        "\n",
        "      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch\n",
        "      # keeps track of these dynamically in its computation graph so you need to explicitly\n",
        "      # zero them out\n",
        "      optimizer.zero_grad()\n",
        "\n",
        "      # Place each tensor on the GPU\n",
        "      batch = tuple(t.to(device) for t in batch)\n",
        "      input_ids = batch[0]\n",
        "      input_lens = batch[1]\n",
        "      targets = batch[2]\n",
        "      # Pass the inputs through the model, get the current loss and logits\n",
        "      states = detach(states)\n",
        "      logits, states = model(input_ids, input_lens, states)\n",
        "      loss = loss_fn(logits, targets.reshape(-1))\n",
        "\n",
        "      losses.append(loss.item())\n",
        "      loss_epoch.append(loss.item())\n",
        "      \n",
        "      # Calculate all of the gradients and weight updates for the model\n",
        "      loss.backward()\n",
        "\n",
        "      # Optional: clip gradients, helps with exploding gradients\n",
        "      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
        "\n",
        "      # Finally, update the weights of the model\n",
        "      optimizer.step()\n",
        "      #gc.collect()\n",
        "\n",
        "    # Perform inline evaluation at the end of the epoch\n",
        "    perplexity = evaluate(model, valid_dl)\n",
        "    print(f'Validation perplexity: {perplexity}, train loss: {sum(loss_epoch) / len(loss_epoch)}')\n",
        "\n",
        "    # Keep track of the best model based on the accuracy\n",
        "    best_model = model.state_dict()\n",
        "    if perplexity < best_perplexity:\n",
        "      best_model = model.state_dict()\n",
        "      best_perplexity = perplexity\n",
        "\n",
        "  model.load_state_dict(best_model)\n",
        "  return model, losses"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ANhsatD4S4C5"
      },
      "source": [
        "Now that we have the basic training and evaluation loops defined, we can create the datasets and optimizer and run it!"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1XSbtdGX5VaJ"
      },
      "source": [
        "## Training"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 40,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 505,
          "referenced_widgets": [
            "ad20aebdaca0451f9c271d68ee1b903c",
            "337b9136c6c64b909d9c5d55ecaa9325",
            "3473d675137c4ee3b5638dd92633c46a",
            "8c1d408aa98140a3a5a6b13b5840e96a",
            "6d8d92055bcc4b4c8b8b3d901d30e078",
            "23bcae4e5dd64792bdc38d7d4516291c",
            "5db51cc47028454a8f2cc472327d8daf",
            "424f69cde0324e65919978883837ee25",
            "ab6bb8c43ca747c499b4b2b16bbe757a",
            "12012ea0aacb46b596d133e649974ec9",
            "2d717e69e9b2416eb4426b7e87ae0d56",
            "5857362d2eb9405e8fb72f1eca4fea77",
            "f5c773915f704d0eab76272ec235a0a6",
            "2ea20d2188af4389a0bac652d92a0f98",
            "c6d798f212e44644b668cb5a457ba6a2",
            "a81a518578ac4f488e81df3fb44ab212",
            "64b437fe17ca4a4bb27b8f9e06884968",
            "8ba41cd959364bd88851387a827c2ad5",
            "0afdfaaf3e4446499bdcda5fb4c24336",
            "ffd6ce5b834b4832bfab39b2355bc7e6",
            "ad388f5f78144f019d47e6b4c851963b",
            "57061d86231d4c52a07a49bfa930d580",
            "7070f5c9cd474026ad21cd09e8252f4b",
            "85f21169852c4ccd8a639150c6f767c5",
            "532fea234c744d9caa4844679a17e493",
            "1f841d13615e49059e02636b230081aa",
            "55af260b5a1d4f0db376bdde62a5fdc5",
            "2ff90189cc654a4aa1a39907f6409688",
            "b1a4820057eb432fb3b9cee6279d19bb",
            "4d82e6a9d3fd499ea6c386671b011344",
            "348b12dd99d747a3921f9b7d04abbf88",
            "42e610721fa44d3cb87354ad5fef409f",
            "fc809722ea6a48c68fd27b6734d6013e",
            "1ec310b23bd7427c803f439dd417de83",
            "016f09363dab401eb96a4ceb8922c433",
            "4b0607403a7a4993a558e9608655f320",
            "c7b91ebc0b7444598c6d7423408da043",
            "e4782bb59b0f4a0fa415146987490561",
            "6885af8154c746b5aceecd30db0bf207",
            "46bc7dda533d4c4c84910a8fe2d2df7d",
            "ed037450149c4f5c81075db72a943df9",
            "f1784e67ab4a4cfe8b80bf2c619ea686",
            "d01bd6479cb04afbb928254a23f1c73f",
            "947c403702c6478687466a743118b740",
            "089d2a995deb4ac0823bef706721a070",
            "ed53fe50fee14bb29b82fd217f06e44e",
            "c4b99f51c0f842cebd5ac8d9c79636b5",
            "fb5fec4aa5844e4c825ea4be90827e6d",
            "37117ee6a7ea41618bd0660fa903f83e",
            "c0d5cbfd3aae40f29b6dc85642871a7c",
            "c5597d8af958412ebf6aa4a2827d1022",
            "7a67f6ab52d447438db31370338d1731",
            "39cce72d57764bc8a51174902b1cb3b2",
            "f0219ec3626c4c9c9716d65d84dd7519",
            "71acfe661ae8481497d7ecd5b028007d",
            "3f954c5a1f314c079ea1d124a6758e56",
            "a36f90a83762430a8e4245fefc535e3b",
            "eb01a870225e47abb3a39c3954d48e00",
            "1bd9417becc74bdc86941cff592d216a",
            "8752220cdb5e4295b1704ba2ad46d0f2",
            "f647ff17b05e4946bb9b07d7ba803c70",
            "3096be10c22243f2ada247a65192fd6e",
            "6f31f0db87e745d18c42020be9884ed3",
            "c9f402c4684240aebfb2bdd39e735c82",
            "13747b82dc534114b40c3492b1a3c6e5",
            "61a4229783e040e18549880a8fc00f88",
            "0aac33b8241d40ad81d62fd96049d7e4",
            "17ce8d0a15ed4ca680694b91cd7f5537",
            "241acac065a84b2e8d3bd52a61fa02d1",
            "640632ed580e451fa8770a85060193fe",
            "cbdc63392dd64fc1b0a2a3ff4d1db344",
            "2d8859fa6787401e8962894f7ee8d0dd",
            "28023c08d79c4a18a296c25cbea29f18",
            "57bb766ed2184c68aee5a72cee2f47dd",
            "5badbd4c47c740c4b857810e156590fb",
            "d1c7fb56de0a44fe9dee27485841e23a",
            "c3128da4d4374433b986e739270825fe",
            "910331156b8d470baeae0034f87d1910",
            "dfa63e4ebb434f2e95dd8e37b7ca8bbd",
            "9fbeafe236f64136aab2d56c0f2e9400",
            "38aeaae4a7144355b4abb74ee850b7ef",
            "7d3f54c221e549f0b0e1f56a2ea58ea1",
            "46cf67e4d4f44845bbf3d03bb4e1edf0",
            "8a723642a6c549f5a88e4d42cc1638ad",
            "a7df374fbc0e47d39bdfb1a36cfe47fd",
            "33fb5c6e323a43d68e950d22c5c68008",
            "2a83cd82a0b7431cae1abde86faec336",
            "169db5da61fd4fc6b1ea7c392071cda7",
            "25b128a71ab54b73901b1de9720c222b",
            "04252c969b234ba3aead9fad983a9bab",
            "721456adde0a4c9db7d34cc3b1eaa8a0",
            "2e6c0f7423894f64b590b202aa5b4836",
            "c7577c1a0ebb4094b5d5d74c2822868e",
            "8518a350023f44a18aa82cc4c859394e",
            "bde068ee3b9541389d53ad356991200f",
            "6724e86a2aca4cfb99efea271f9bf59c",
            "f6d7858bb0b7477b8f492442931eb622",
            "47723d1ecda641d2997188be8d5270bf",
            "97d1de5ffaf04fed95be1327f1729eda",
            "ad5e1c973fa9438ea3147ee07427e6ca",
            "f3984cae86fe4b028c711860ed09aac3",
            "66d4ce1578b2480e8f0cef7f40c66ef9",
            "e43ac60695304e4f9c45ceba979bcb4e",
            "a32e898322894de78459a1434796c13c",
            "0b5a6ac48cf148bfbc3dfd6957b076ff",
            "6e15e56debd24c90811e1bd4f11dd13a",
            "16a0a57eca304a78804fb8789a43d719",
            "3b290b262a7047388808ac7d5f2f386f",
            "8cf0c6f9b0f14f20a638e913e02d17bc",
            "f7736506493349e09f74e36b0c3a0452",
            "ef34ab5b1f674aa7a14f953943961a79",
            "4432e62880524e489ada450358b90ac1",
            "f9a859780df048e3a7b752f54439f3e4",
            "82d8cb63961a415eb2b724167d1a58d3",
            "1cea82aac8cb44cfa5b6834bca40f7f0",
            "28560301892748caaa2aff3730d9ade0",
            "fdf994fb12694da581286cfcd5430fa4",
            "8a8ee3f8748c4cd19706c98bcff4b041",
            "1cbe5e922a7a423d948f1da8397267d6",
            "ccf3db0be70642acbf0c1fdaa342d2cc",
            "6025bf95308b417187b2f2ba42643a09",
            "0c14e3f5a97046228ec5e25811821e2b",
            "c950674e4a0945ddb0efe173cfa229e3",
            "8451f55c34ef4251b1b30910e00a579f",
            "78ef66ec410444b18cd586d4f628e457",
            "938ede49d9f54753a5028f66f541ec14",
            "ae7d9fa2da1a4968b64d21b52478391c",
            "10bd4dfa38374d8d8137fbec623aa2cd",
            "9eaaf4fa2d7c49ebaf2a590290fd5679",
            "02cbf0c7da284b098ff5060eb4feec80",
            "67e2468e7bb3494d9fcf8ebe3bb29a6d",
            "9ad899a60d064f6b9f7131caaf7563fa"
          ]
        },
        "id": "FJCaeC67kaWS",
        "outputId": "2b7df5c4-3d4e-4907-889d-7e95dd6714ec"
      },
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "ad20aebdaca0451f9c271d68ee1b903c",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/599 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "5857362d2eb9405e8fb72f1eca4fea77",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Evaluation:   0%|          | 0/62 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Validation perplexity: 1818.0729549106857, train loss: 7.606159637686805\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "7070f5c9cd474026ad21cd09e8252f4b",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/599 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "1ec310b23bd7427c803f439dd417de83",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Evaluation:   0%|          | 0/62 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Validation perplexity: 1179.8671494429902, train loss: 7.158430816733181\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "089d2a995deb4ac0823bef706721a070",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/599 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "3f954c5a1f314c079ea1d124a6758e56",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Evaluation:   0%|          | 0/62 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Validation perplexity: 1226.9833916912687, train loss: 7.133625640295981\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "0aac33b8241d40ad81d62fd96049d7e4",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/599 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "910331156b8d470baeae0034f87d1910",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Evaluation:   0%|          | 0/62 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Validation perplexity: 1239.3600432924457, train loss: 7.105498870346502\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "25b128a71ab54b73901b1de9720c222b",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/599 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "ad5e1c973fa9438ea3147ee07427e6ca",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Evaluation:   0%|          | 0/62 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Validation perplexity: 1334.6796099809183, train loss: 7.01695058580631\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "ef34ab5b1f674aa7a14f953943961a79",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/599 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "0c14e3f5a97046228ec5e25811821e2b",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Evaluation:   0%|          | 0/62 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Validation perplexity: 483.7851902849598, train loss: 6.430242812295191\n"
          ]
        }
      ],
      "source": [
        "# Define some hyperparameters\n",
        "lr = 0.001\n",
        "n_epochs = 6\n",
        "lstm_dim = 1024\n",
        "lstm_layers = 4\n",
        "batch_size = 128\n",
        "seq_len = 128\n",
        "\n",
        "model = LSTM_LM(\n",
        "    torch.FloatTensor(pretrained_embeddings),\n",
        "    lstm_dim=lstm_dim, \n",
        "    dropout_prob=0.1, \n",
        "    lstm_layers=lstm_layers\n",
        "  ).to(device)\n",
        "\n",
        "# Create the optimizer\n",
        "optimizer = Adam(model.parameters(), lr=lr)\n",
        "\n",
        "# Train\n",
        "model, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 41,
      "metadata": {
        "id": "RTg5zq-wsD8v"
      },
      "outputs": [],
      "source": [
        "torch.save(model.state_dict(), 'best_model_wiki')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "s2Jiw-ssYc-u"
      },
      "source": [
        "Next we can plot the loss curve"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 42,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 282
        },
        "id": "B-zuTelyYZgb",
        "outputId": "30dba5d1-36be-4e9d-b9a8-588b3931b51b"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "[<matplotlib.lines.Line2D at 0x7fb3544b0390>]"
            ]
          },
          "execution_count": 42,
          "metadata": {},
          "output_type": "execute_result"
        },
        {
          "data": {
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXwU9f0/8Ndnc0IIgUC4wXCDcsklKCIIyGWFIlq0Vat+pSq26q+tYtGKZ1Er9aiVUlGsWkXxFkUQUG4k3DfhCPcRCEcC5P78/tiZ3dnZmd3J7uwx4fV8PHiwOzu7884m+97PvOdzCCkliIjIeVyxDoCIiELDBE5E5FBM4EREDsUETkTkUEzgREQOlRjNg9WvX19mZ2dH85BERI63Zs2aE1LKLP32qCbw7Oxs5OTkRPOQRESOJ4TYZ7SdJRQiIodiAicicigmcCIih2ICJyJyKCZwIiKHYgInInIoJnAiIodyRAL/ZuNhfLDKsBskEdFFyxEJ/It1hzB98Z5Yh0FEFFcckcAz05JRWl4Z6zCIiOJK0AQuhHhbCHFcCLFZsy1TCDFfCJGr/F83okEKgUquHERE5MNKC3wmgGG6bRMBLJBStgWwQLkfMUIIVDJ/ExH5CJrApZSLARToNo8C8K5y+10Ao22Oy4dLAFy7k4jIV6g18IZSyiPK7aMAGprtKIQYL4TIEULk5Ofnh3QwF1vgRER+wr6IKd1NY9P0KqWcLqXsKaXsmZXlN52tJS4B1sCJiHRCTeDHhBCNAUD5/7h9IfkTQqCSTXAiIh+hJvCvANyh3L4DwJf2hGPMJQTYACci8mWlG+GHAFYAaC+EOCiEuBvAFABDhBC5AAYr9yMXJEsoRER+gi6pJqW8xeShQTbHYsrlEqhgAici8uGIkZgi1gEQEcUhRyRwAKyBExHpOCOBiwD9FImILlKOSOCCRRQiIj+OSOAA2AQnItJxRAIXApDM4EREPpyRwGMdABFRHHJEAgfYC4WISM8RCVywFwoRkR9nJHAWUYiI/DgigQNc0IGISM8RCZwlFCIif85I4LEOgIgoDjkigQPshUJEpOeMBC7YBici0nNGAiciIj+OSOBq+5s9UYiIvJyRwFlBISLy44gErmIDnIjIyxEJXB2JyfxNROTljATOEgoRkR9HJHAVL2ISEXk5IoF7eqHENAoiovjijATOEgoRkR9HJHAVKyhERF6OSOBCqL1QmMGJiFSOSOBEROTPUQmcJRQiIi9HJHBexCQi8ueMBM4lHYiI/DgigatYQiEi8gorgQshHhRCbBZCbBFCPGRXUP7Hcf/PXihERF4hJ3AhRCcA9wDoDaArgOuFEG3sCoyIiAILpwXeEcAqKeV5KWU5gJ8AjLEnLF/eBR0i8epERM4UTgLfDOBqIUQ9IURNACMANNfvJIQYL4TIEULk5Ofnh3Qg9kIhIvIXcgKXUm4D8AKAeQDmAlgPoMJgv+lSyp5Syp5ZWVkhBwpwMisiIq2wLmJKKWdIKXtIKfsDOAVgpz1h+fIs6MAaChGRR2I4TxZCNJBSHhdCtIC7/t3HnrD0x4nEqxIROVtYCRzAp0KIegDKAEyQUp62ISZTbH8TEXmFlcCllFfbFYi140XzaERE8c0RIzEFayhERH4ckcA92AInIvJwRAL3ronJDE5EpHJGAmcFhYjIjyMSuIoXMYmIvByRwL0lFCIiUjkjgbOGQkTkxxEJXMWh9EREXo5I4N4FHYiISOWMBK65ffWLC/FxzoGgzzly5gJy8goiFxQRUYw5IoGrpAQOFFzAI7M3Bt134N9/xNhpK6IQFRFRbDgjgSs1lN7P/2D5KcVllZGKhogoLjgjgSt4DZOIyMsRCTxWnQillNhwIKIz5BIRhcwZCTxGGfzd5XkY9cYyLN4Z2lqeRESR5IgEHis7jhUBAA6cOh/jSIiI/DkigYuYFVGIiOKXMxI48zcRkR9HJHAiIvLniAQe6wY4uy8SUTxyRgKPUQZn6YaI4pkjEnikrdl3CjuPFcY6DCKiKkmMdQBWRLoXyo1vLgcA5E0ZGdHjEBHZyRkt8DDzd1lFJV6Yux1nLpTZEw8RURxwRgI3cMXzP2DNPmvTxc7ZeARv/rgbL8zdHrF4jp0txm0zVvFLgoiixhEJ3KgBfuxsCabO32np+WUV7pkJSyI4Q+G/Fu3CktwT+HztwYgdg4hIyxkJ3KQ7SKXFfGxXq3h3fhH+s3hPwH3Y45CIosURCdxMpcUO2s/O2QYAyC8qCbjfwVPn8dYS8wQ99s3leO7bbSguq7AeJBFRhDgigZtdw1y1t8ByHRwAiksDJ95+LyzCs3O24eiZYsPHzwV5PhFRNDkjgQfohTLlO+sXJsst1lzUmnlVmJV5iIgiJawELoR4WAixRQixWQjxoRAi1a7ArFqddwrnS8st7VtRaa3kMvqNZT739c8a+spiS69zobQCp86VWtqXiKiqQk7gQoimAP4AoKeUshOABADj7ArM91iBHz913tpFygpdzfzgqfOYs/GI334nTZKuGsa+k+bzg287chYFyvN/NX0FLn9mvqXYiIiqKtwSSiKAGkKIRAA1ARwOP6Sqs1q8KK/wTeCj31iOCf9ba2ssH+ccxIhXlwAANh48o2w7gJJy1s+JyF4hJ3Ap5SEAfwewH8ARAGeklPPsCkwr2FD6T9ccRJ/nF6AySIlEX9s+EaRXivf4yv8WvymOni3GyNeWeO4/MnsjXp5nrc86EZFV4ZRQ6gIYBaAlgCYA0oQQvzHYb7wQIkcIkZOfH9raksES58vzd+Lo2WK/EgkA7Dpe5LldprTAl+TmB+3PbaS4CgOBthw+63M/v9Dal4WRz9cdxLJdJ0J+fryQUuL1BbmmvXycpKJSYsvhM7EOgy5y4ZRQBgPYK6XMl1KWAfgMwJX6naSU06WUPaWUPbOyssI4XIhBTv3Jc7tXdiaKyypw24yf8dy328J63clfbcFXGw5j7f5T+H7LURw5cyHg/lb7rBt5eNYG/PqtVSE/P15sP1qIl+fvtL1sFQuvLsjFyNeWYvMhJnGKnXBmI9wPoI8QoiaACwAGAcixJaoQBcuRlzapjdtm2JMIZy7Pw8zleZb3t9gBplpTewGdrwb96TcdPA0AOF5YDCAjtsHQRSucGvgqALMBrAWwSXmt6TbFFVpMQQaySymxOu9UlKLxPzYRkZ3C6oUipXxSStlBStlJSnmblDL0Qm8AVssPs1YfUOMyfLwqObSw2N5ZBZm/vTjkicgejhiJaXXSqr9+uQWAebIM1kLX6jx5Hv63ar/n/unz4Q3IqZQS50qsDTgi5+AXM8WSMxJ4FT8lZnuH82H7/YfrQn8ygO82H8VlT35vuesiEVEw1S6B78kvQrenjLuj/60K86ZoPf7FZuQeKwq+owVqF7p9J8+h61PzsOt4IVbsPmnLa1P0cQociiWHJHDr+/71yy0ojECp4uhZe/ouT/piMwBg9pqDOHOhDIOnLsYt/1mJ77ccteX1yd/p86VYsO1YrMOIW5yzx7kckcCtTkIFAEvjfMDLhgPu7mf6htvv3luDPfn2tPLJq7JS4nfvrcHd7+bgJMtXhka/sYxz9jiUIxJ4OINg4tH+k+dx7Kx/Mnlv5b4YRGPs6JliZE+cg/XKF45TdXhiLlbtdc8ZX1ZRvf6O7LLjWGGsQ6AQOSOBV7NRMP1fWoRZOQf8tr+zLC/6wZhYnOue9uD9CHyphPvb7PXcD7hl+kpL+5aGMLd7tJRVVOLmaSvw817ri5IQaTkigbPhFL4NB05jxtK9MY3Brgt++YUlWLEn8hd+R/1zKf5hceHsUBw8dQE/5xXgkdkbInYMqt4ckcCrWws8Fka9sQzPfLPV0r4zlu7FI7M3Rjii+Lfh4Bm8uiA34D7VrLpHDuOMBM5PSVRZTfRG1h84jW83+S+S4RQni0os9cioyhJ6O44WYu5m9jIi+zkigRtNE0vxafQby3D/B9GdbXDu5qPInjgHuy304pFS4umvt+Lj1Qdw+LT/DJI9nv3B9h4ZQ19ZjHvfX2PLa83fegxfrDtky2uR8zkigasllPsHtI5xJJE36fNNmPT5JtPHDxSc95njPBzvrcjDMYv920vLK3Gg4DxmLvOvo0spUW7zxcKTRSV49putll5XbfFvOhh8atfDZ4rx9rK9eOTTjbhyykL84vWlmPbT7oDPiZeJyJbk5uOe/+bgoVnrYx0KxQlHJHD1M5zgqv7D3j5YtR8faOZg0bv6xUU+c5zrzd96DGctTMR16PQFPPHlFtzzX2szALd7/Dtc/eIiTP56q9/c5098uRltJn1n6XWsmvz1Vry1dC9+2HY86L5fbXCv5Kef68Zo/IA+GW86dAZTgozQ/XTtIb/VnLSvU15RiX/9uAvFZZGbJvdEUQlum/FzxF6fnMkRCVytgbs4bjmgg6fO457/5uBBC/O2qGc1J4uqPgLvmw1HfFY0en+l/xfO3hPngr5O9sQ5aPXYHM/9hduPedYOLSt3J8y/fbcN2RPn4MY3lxu+hjaRaic9KywuwxNfbvbdN8QOjNMX70bbSd9hzT7/7n5CuEfVvjh3B15fGPiCZ8G5Uox6YxkOGZRugikpD/0M58iZC8ieOCeidfiiknJMX7w7Yh0OjMpdxATuWAXKhbZ/zN+JecowfLUFuL/gPMorKgMupKxOzhWsPGBUwnju222GKxppP7xD/7EYs1bvx4GC8wFfX33Kmn0FuGtmjqc1rP6q9508rzx+CmfO+59ZbD7kXbpO/Ts5WVTiN5uk9lhGyisqcc1Liwwf26nMg7PKpL+22jo/bRCfVvdn5mPDgdN4Y9GugPvZTX2PZq85GLFjPPvNVjz/7XYs3O49Y7pp2nI8qfsSDcUX6w7hyikLsTIKXUedxhEJXD0VTnBEtPYwSlZaarJ+dUEuxr/ne4Fsd/45tJn0Hdo/Ptf0+eoIy2ANpqpMTaC92FxaUYlHP92EMSYtZ71T59w/r5qwjfzZoL/0L/651HNbPfwek9b/nz/ZgH4vGCfpvJPnAh5b+/p6icofpr7MYkb/xaLKyStA9sQ5nukWtMJpungbQGG8SBBq2U57prA67xTeXRH+QLC1+92LsGw/cjbInhcfR6RENcm4XALXd2kc22CipOvT80I4bfT/hO46HniYtLassCQ339OaV50oKkX2xDn6pwEAjp0t9mm1GtWcjeYfMWr1qy3uQGcEJ4N07ytSJjG7adoKw8eXB5j1cfJXvl0njVrJy3ad8FsDU0ogUcmMH+e4W7il5ZXYFiTZaLsqqj/xoh3u1uuS3NAW/zajvqd2nMBWVMqoj8vgebc5RyRwqSmhVKX/rdMFWole/xEqr6g0/IAOnrrY93lSYuF278x82s/ibTN+9mvNB9L/xUU+rVaj/vqVEvjXj+5kKAJ8FNXkdU5ZL9PoZwlW7nn6m62eY1VVkW4Gy5e+3+G3z/LdJ3H960uV+LwB6v8mn/p6C4a/uiRgrXv7Uf8vVrVEaJQfzf7srUz0pr5tdpQgW//lW9w4zdpZVTDvr9yH7Uett6rjoy9QfHFEAveUUESgFFD9BOp1I6X0qS9bXXCiUgLr9ntP0aWUOHWuFE99vaXK8ekvrBlN0AUAL851J0O1tb/9aKHfzIvqhdBAFz+D1Zi1x6qqcC4S6n9La/a5T/mDreLkaRkr99V5W7TfU8VlFfh20xH8sNV/Otwv1h1C6798i8OnL0BKibmbjxqWcSqrkMDPFpd5ShZmtH8/eurv2EqPnMe/2IxhrywJuM/0xbs9ZZg46c0ZV5yRwNUWeICEVh27GLqEwA5NS22/prUrJXwmQfrOYg+DSil9Wm0nikpx17urbZlIa+Dffwz4uPYDePO/jcsc+YUlft0UVXtOnENpGIk2kEAXfANZvDPfr3Wstq4DJRxp0Cfm3z+5e/as2ust9Uz5bjvu/2AtnvjS/wv207Xuks3OY4VYuacA976/xvDMQT0z0r6umQkfrMWYfy3H+dKqzamvb1oFGri098Q5jNJcuwjk+W+rtgjL8cJi29ezjWeOSOD1a6UAADLTkjCoYwPDfd75ba9ohhQVLpfA10ofZ8A9i6Eq1NaIlP4jW/V13UjRHvZEgO6L037cjW1HjGv34QzzDyTUBQ30F+m0ZZ7XF+aafuGcvVBmOiBLrdXvO3kO32w0npbg6hcXYkmuu8T2war9+Gi1+wwm91gh9p88jxv+uRRzNx/FmQtlngR+oqgUF0oDf1Gprzntpz0B9wOA5btO4MOfjS/I/rjDvI7/2oJcbLAw6ErPyp987+cWYIiubFidJcY6ACvG92+FRrVTMbpbUwgh8OBHviPRNk6+DrVTk2IUXeT84cN1phdtK6S0fFFK26uhUkqU66Z3rMqCGeGw2spdtbfAtJSinTN9lY3dyk5ZKM9oactX2guWM5fneW5/v+UYHpplXNq6933vdANm7/41L/0Y4Pjes5T5mvLKoh35qPn9dmw8eAb3vr8GvVtm4tdXtPA8bnVeoZ3KWcSafQVoUqcGGmfU8Hn8ZFEJbn1rFQDglt4t/J6vVVhchvTUJGw/ehb7T563VAb9f7PW4zPdlAGHTlm7qG/X6llO4IgWeFKCCzf2aGZ6AdMseXdtlmF7LC3rpwV8fO0TQ2w9nlkLbODff/SccqsGvWw8QnPUG8s8t0+dL/Wrk0arU8F9JnOk6Hu5GF3gM/Iri3OC223H0ULkalrP/1ninV7gqa99zxB+2Bp8JKndjp3xJrBth89aTtraL031LO3GN1cY/l31ePYHz+1dxwsxJ8AEZuqgr2GvLMH499bgyBnfBFtZKbEnvwg/7XS32s+VlPslbwB422Aah4udI1rgoYpEYgpWa09MiF4tPpSVVO58Z7XlBGm3/MLqsaTZ0Fesn6JbWVDCqP95qDV5wLeVXVhSjodnmc83rq11a69haM/Kzgcpu2h7Ohl1MzheWOLzJa2fy/3Gacs9F0av7dAAdWpUv7PpSHFEC7yqvvl9PwBASqI9P95rt1zuuZ0YJIEnueL7LY1V8n7oI2u9ZC5WLR/zPQsJNAgrmLPF5hcgD55yD6vPnjgHBedKUVZu3Mop17V+7nzH2jwsRtMVbA3SJ17bq2Xh9uOGrW8yFt/ZJkSXNamNPw9tj9dvvTz4zhbc0LUJZt7ZC9/8vl/QFnY0W+BO8sX6w8F3uojZ2UUu0GyV2rOHBz9aZzpKpqLS98xh0Y58vG1xRSf9hduNIVywNHPtyz/a9lrVgeMT+If39PHcnvabHph6c1cIITBhYBu/Cy9GVk8abOk4A9o3QKemGUhL9lad/n1bD7/9grXQieLFiaJS0wvhyQkuHDzlW9p52mIPoGA9XcKxJz/4JGkXE8cn8L6t63luD+vUCGO6NzPd99P7rvTblpWeUqXjvTrO26ofelkjXNEyE12b1/FsE0Lgpz8PqNJrEsWCSwB7TRLioh35pvPGBHKyqDTiK2hZmS75YlGtL2LqdalCr5ThnRphUMeGqFcr2Wd7o4xUrJ402NPSnvW7vgB8e1K0yKxpQ7REkbXl8FmfHkp2ePKrLRGfr6jL5HmYfW9f9MzOjOhxnMDRLfBNk6+r0v5mQ4nbNKjlt61r8zoY26MZBrb3HziUlZ6CumnJfttVF9N8LUR62i6GkTLWZMKyi42jW+DpVRi888m9fX2m03xseAeM7eEut3x635UoOFfq6Ub15YSr0Lmp/X3IiYjsFHILXAjRXgixXvPvrBDiITuDMzNxeAc8NLhtlZ7TKzvTp2Vcv1YK6ilD9DNqJPkM0OnavE7AeVeqqnsLb4382dGdMPvevra9NtHFquBcadB586u7kBO4lHKHlLKblLIbgB4AzgP43LbIArj3mtZ4aHA721/3D4Pa4pVfdbPltabe3BXJCS6kpybinqtbAQCGXtYQv+lziemXQ6/surYcm+hi0P2Z+ej69LxYhxFTdpVQBgHYLaUMf/mNCGueWQMHCi4Yzj/x/4bY96UwpnszT48YddV0dZRa/bSq9XwhIjJiVwIfB+BDoweEEOMBjAeAFi0CT3oTDb2z6+FAwUHbuzrNe7g/1u4LPI+yWsFpUY+9VIjsUlRSjlopjr6cF7Kwe6EIIZIB3ADgE6PHpZTTpZQ9pZQ9s7Kywj1c2K7t4O5VclmT2ra+bruG6RhnMiub0XdFW4OeLwICk0Z0tDWui8GLY7vEOgSKoU5Pfh908Yzqyo5uhMMBrJVS+i8ZEodGdmmMrU8PxWVNotfLRJ0fQtu78KPxffC//7vCb997+rdC3pSRePmmrrbPbFhdNasbfMRtOF6/xZ4pGShyVucFPvutruxI4LfApHwSr2omR/d0S22Ba2dqq1crxWcUqbKDx409miEzLRk9LonMhc0rWlajQRBhVsOGXdYo4OPqIiJCAEsfHRjewYKYMqZzRF+/ukpwASsCLFpdXYWVwIUQaQCGAPjMnnCqOV3nE/2An0Ed/AcNzb63b9CWeCg9HsOddGvGHT3Den60vRSgzJKeGvgLXR0AlpzgQrO6kb1+YcfCwxcjlxD4ZuPFN2FaWAlcSnlOSllPShmdNbkcql+b+shKT8F917QOuN/4/q38tgkhkJmWjNQk81/V8omD8MKNndHbYGjxyM6NUTM5wW+7flUeI7PG9zF9LCXR/zW1/nePtzy05amhyHl8MKaM6Yy7rmoZ9Lhm8qaMNNyu/UleGtsFTTJSfR5PS07ATT2bG74PAJAS4L01Ok7uc8ODdjetr5mC4YnrL/V57OWbuuKrB64yfB7zd2h++85qn/uPzDafAx1wz3f+xBebfVZWciJHD6V3irppyVg9aTA6GYzunHmney3P9NTEgEPwtz8z3PSxRhmp+FWvFpj1uz5+5YA3ft0dyQbzomt74dRLS8bd/fwT6xWt6mFkZ/Ml3QJJ0PwsaSmJqF8rBeN6t0CdmtZGz259eqil/QDfi8QSwOcTrsKMO3riiwm+SdKs59EjwzrggYFtTF9fnfdmdLcmANwrRI2+vGnAmBplpCJvykjkTRnp995KAJc2Nr6IHqwF/vjI8C5y68/W/vXr7mG9XjxRF3kGgI9zDnpW+NFatOM4jp8txvoDp/Deyn14aNZ6v32cpNon8MEmiyDHC3XIflWnof30Pv/RnEIIjFKSjNZpg9Fq2gn7f9G1iafWft8A37OEbpqZFrWClR3Mvoz0OTRDt/pKoktgZJfGqJmciNZZgZevU2m/TDo0SkfD2qkY1LEhmusubpp959ROTcKfhrY3ff3EBBc2PHkdnv9l4Pr0kEsbem4brUyjZfb+aNcDef9u/4vcqUkJePeu3gFfO5Df9LnE536/tvWRlpwQ9DoA4H8mEUywv5EkC2W8yb+wfsziMt95yO94270Ixdr9p7Dl8BlkT5yDO99ZjRunLffsIyM8c2KkVfsE/tYdvUxPveNJkzpV60nRrbnxxU2rE2lVahL4E9dfihGdG2PhH6/Bo8M6+OxntMIKANRKScT2Z4ahX5v6AIB//Kqrz0AoIYDMtGS/6XrV12ugbL+sSW38po+3++Wu50fgjVvdrUJta7RhbePBT3VrJqFbszpY+uhALJt4Lbo0853aF4Bn5Kv6k4zu1gTLJ15r+HoAsOf5EX7bMmokITHB9+PSQPez3RukRKaSUppet9D+zP3a1se8h/tjwR+vwa3KwsQSwDXtQu+Oq2/h105Nwpanh2FCgDMQ1V1XZRtuN5oMDkDQvtnqXESB3HFlNi4JY9zEhgOnMeZfyzHytaWebdoFodfuP+2zKLXTVPsEHu/q1UrB32/qineUUooVSx4ZGCABWHuNtg3TMfXmrnhmdCfPOp+tsvw/iIEaKKlJCZ6abWZaik+bs0eLusiZNBirHhvk85yhSktPW+/PNBmZGmz9UQBY99frkFEzCc3q1kRTky9B9XXU1tbfxnQJ+IWpnepg1V8Gme732f1X4rVbLsdXD1yF7i3q+IwtMPvicz/m/0XbtVkG5j3c35NgRypTsrZrmI7WWbW8722YLUb19f+vX0ufLzH99/6Y7t4SUZ9WmXhxbBcIIfx+J2ufGIJWJgt9P3Ct90vh4cHtsPKxQT5dPrs2Mz6703LPrz8Q254eFnRfI+bT5Xp/jhvfXO7zyMo9J/HWkj36J8QlJvA4MLZHMzRITw2+I9ytmuaZNU1b2lZa4B+N74NnR3fCmO7NcJvulFpPTRcD22dh0+TrPK0t9SjqB1J7ujxhYGu4XMLzT6tj49rImzISgzu6yw1XtakfpNigxKHLW/VrpXhapWbUmreatNSTDrO3yOiUv4bJhU8AaFa3Jm7o2gRdmtXBZ/dfhdSkBNze9xLDeLXJ3ajFmpmWjHYN072x6taktJq2p97cFcM7mZdD1F9Hw9qpPl9i+vdk6s3ei7T3D2iDm3s2BwDs1p2dZKYlo6TceOHmWzUD2x4c3BaNMlLx4o3e3kBjujfDJyYTu6WnJOKt2709nSJ5cVe//ue46Svx7Jxtnvsl5RUYN30FNhw4rX9qzF2c408dauPk63wuDhppnBH8i6BPq3pB91Gpiahdo3TD6Xv/ev1l6NcmC91b1MVPO9wXjYLFCADZ9dOw8rFBaJCegnOl5Xh1Qa7fPle3re+3CHN2vZrIO3keOY8HXwrPm8DVn8V936xl/9OfB+LMBd/rBTWTAve20RvXqwX+u2Kfz6ruAPDVA/0gpcSxwhLDMwX1i7dXS3dp7LdXZvs8rsYebJZMdQ4edYGRJ39xKZ762rsU2pBLG+KtpXv9xiBk1fKeBem/GINNO6H/WVVGjYkrlZIb4P69mI2IHtqpEQZrrinEcqnCnUeLsHJPAf7y+SbM+cPVMYvDCFvgDlI7NQlpQeqKnZpm4Jvf9/PZFs5FL30rVk0k6mezRnKC53S/UFkN3eo87Y0yUuFyCaSnJqFpnRqeVrnq0WEdPFPvqiniywf64cc/DbD0+mre8cQO3/t6mWnJPtMKA/Crewej1uqHd/LtvZPgEkhMcPkl7/YN03F730s8F0gbpLt7r1yh+5JV1xjWfzkueSTwwKJMzcIj4/u3whWt6iFvyki/HlENanu/+NVYBrR319qDVW1u6mley540oiNeHWfc5dIlhOnFXv3WxFeOwAoAAA9gSURBVASX6euEQls2KS2vxPHCYtN9hacBYNvhbcMWuIN9/1B/wwswnZpmoGZyAhoprfFr2mUhb8pI5BeW+LUwg/EkbPW+5xH/D179dHeyyDapiQayzOCiYmKCC5e3qItBHRrgd8oFwowaSX49V8wkKcm3faN0AEDfVvWwfPdJSyWbKWM6Y44yi2RV1KuVgo2Tr0MtC6N9c58bDpdBXdlIpe6LU2XURdTMX6o4z4639BQ4c7Wqb3wRE3BPDWH6+i5hWhox2m52sdQOvZ9bYNrZQY0l0mt9hoIJ3MHaN0r3JCe9LU/596POSk+xtIjz2ieGeJLKgPYN8Pd5O31OZ82Mv7oV2jdM90wYZocEl8CM31q/wKuVmZaM9+++Al2au1ub/7m9Jw6dvmBpsY5xvVuYTk4WTG2LZyBJVWjdq8k0weX7HLPSQkqiCyXllWEt7+fyJK7A+7VtWAuts9KwO4wV4xNcAmMub4pP1rj7chu1zCM9SnXF7pP+01tE4bjhYAKvpsL54GpPuzs1zfBtmQT4MCcmuDCoY/BEH0392nprrmkpiWjX0PgLL949OrwDkhNd+EVXd2nmrdt7wuVyt/hfHdcND37kOyBlROfG+HzdIdQO0hc7kAcHtcPWw2f9Rvi2a1gLO48Vee6nJiVgwR8HYMjUn5B7vAiPDe+AAQZryRpR/0z1f61Gf75WzlTCcct/VhqOeA5UQlm+6wTW7DuF3w+q2gphdmECp5DEcaOkWspMS8Yzozt57mvPiEZ1a+qXwB8f2RFtG9ZC/7ah9xnv3CwDyx/z70Y57+FrUFxWgXMl5T7b1QTbv12W6ZmhGX0r1+jvKxotYf1gIAC47/21AIy7ht761ioAwObDZ/D6LcajniOJCZyqJP6qgGSkXq0U3D/A3Q87PTXRdOh+qFKTEpCq66EjLNbMP7m3r9/iJ438ek8ZlVCqHqcd9p445/O/ke+3HMOWw2dweYvoLovIBE5Vor+o6WQ3dG0S8EPpND1Nph7eNNnavDJfP9APBWEsjJDoGTAVeL9e2ZnopZRlUhIT8MqvuqF3y0y88sPOgM+LdAklmLIKidxjhciun4akBBfKKnxb6+oZwp78IqSlJKJhbWtjO8LBBE5VcmWb+sg7uR+1LfYEiWevVaOFGlb9ZZDli6dmOjcLb5GTf956Od5Zllfl1r46MVjvlvXwcY5yETNGJZRghvxjMXpnZ+Lje/ti5rI8n8dKlYR+7cs/ATCfPdNO7AdOVfLUDZfhxz8NQP1aXJg5njSsnRpw1Gg0XFIvDZNvuMxSLx8jY3s0w3+VMQtDDSbXinYvviNnLmBp7gm/7T/nFQAAzhb7dsl988fdhq9TUl5hOtgpXEzgVCVJCa6Q+nkTWdFfGbNgNGFXeaW3ZDHEQrfWcPX920J8t9l4LMCu40V4feEun20XSisM923/+FxM+GCt7fEBTOBE5BDaVqxLwHABE8A9345dLpQZJ+XBU3+q0uvM3XLUjnD8MIETkSO0zqrlWW1JSmDmXb0889//cUg7zwIa9TTjGMJVZmHlKtX6A6dx7KzvkPxSk4m+7MIETkSO4HIJTFWWsmtWtyZqJid6erM01HRDtPNa55wqrLN5oawCVzy/wGfbzf9eYV8wBtgLhYgco0+repj2mx4Y2MFdI7+1dws0SE/xXQ3JxgwezrXHkvIKrI/wFLRM4ETkKMM08527XALX6XqsxL6zoVv7x+dG/BgsoRBRtRLj8T5RxQRORNWKnSWUeMcETkTVSqwHNEUTEzgRVSvPjurkc79rmFMExDMmcCKqVurq+oF/+YB3icFbr2iB7Ho1ox1SxDCBE9FF4/lfdsZ3D/aPdRi2YQInootKjeQEw7lWnIgJnIguOgPbRz+B7zxWaPtrMoETUbXz1u09Az4e6pS34TBbgDocTOBEVO0M1k03O1qZ9EoVi77ijTNq2P6aYSVwIUQdIcRsIcR2IcQ2IURfuwIjIrLLK+Mu91khJxajNSOxJFy4LfBXAcyVUnYA0BXAtvBDIiIK3z9vvRzfPXi14WMtMt1dCX99RQvPtqZ17G8ha8VVAhdCZADoD2AGAEgpS6WUkZ16i4jIouu7NDFd3OHqtln47P4rcXe/lp5tH43vE9F4ItHqD6cF3hJAPoB3hBDrhBBvCSH81toSQowXQuQIIXLy8/PDOBwRkX26t6jrUwtvnlkzoq3wSNTdw0ngiQC6A3hTSnk5gHMAJup3klJOl1L2lFL2zMqqHn0viah6qNStlCyjvXJymMJJ4AcBHJRSrlLuz4Y7oRMROZKV9H1j92YRj8OqkBO4lPIogANCiPbKpkEAttoSFRFRFOgb3PoWuV7HxrXjar7xcFfk+T2AD4QQyQD2ALgz/JCIiKJFX0IJsreUtq65Ga6wEriUcj2AwEOeiIgcwsoamCJuFm3jSEwiuog1UkZHPjvaPYf4+dJyv32a1qmBt3/rbaeqLfDnf9k58gEGwUWNieiiVSsl0WeEZnFZhd8+HRun+wyDVxO4tHTJ061lfb8e1rZgAiciUhiVUPR1cbU/t9Uehy+N7YLrLmsUZmTGWEIhIgpAQtPqlvBUwK32Ge+VnYmMGkkRiY0JnIgoAG3XQgmJ1CT3oskJLmvp0xXBbitM4EREOrf0bu657W51e5Pww0Pa4Xf9W2FsD++Anu8f6o9LTNbajGS3QyZwIiKdp0d1wsw7ewHwH9xTKyURj43oiOREb/ps3ygdN/dsjmhjAici0nEJ4Sl9WL1YqdbEU5N802okW+DshUJEpOMSQJdmGXAJ4L4BrVEz2V33bl7Xt0zyzp29kFkzGYA30f9fv1b456Jdnn0iufoPEzgRkY4QAnVqJmPP37x9xP99Ww/0bV3PZ7+B7Rt4bps11CM5bpMJnIjIgqFB+nKrLfBozpXCGjgRkaJR7dSQn6uOzBQA3vy1d2Zt1sCJiKLgiwlXYduRsyE913OxUwgM79wYDdJTcLywJKKTXzGBExEpGmWkolFGaK1wT/7WbWc/cCIih2ANnIjIaaRaA1f6j0fhkEzgREQ20JTAfUSyQc4ETkRkA083Qt39SGICJyKygacbod9VzMgdkwmciMgG/i1u35p4JDCBExHZwFsD950Ei90IiYjinL4Fzl4oREQOYVYDZy8UIiKH8PQDj0I3FCZwIiI7mOTrSM4HzgRORGQD/UAe1sCJiBxCSu90sgAw445euKFrE9SpkRSxY3I2QiIiGyQluHz+73FJXfS4pG5Ej8kETkRkgwkD26CiUuLWK1pE7ZhM4ERENkhLScRjIzpG9ZisgRMROVRYLXAhRB6AQgAVAMqllD3tCIqIiIKzo4QyUEp5wobXISKiKmAJhYjIocJN4BLAPCHEGiHEeKMdhBDjhRA5Qoic/Pz8MA9HRESqcBN4PylldwDDAUwQQvTX7yClnC6l7Cml7JmVlRXm4YiISBVWApdSHlL+Pw7gcwC97QiKiIiCCzmBCyHShBDp6m0A1wHYbFdgREQUmAh1ykMhRCu4W92AuzfL/6SUzwV5Tj6AfSEdEKgPwAm9XZwSJ+CcWBmn/ZwSq1PiBCIb6yVSSr8adMgJPNqEEDlO6GfulDgB58TKOO3nlFidEicQm1jZjZCIyKGYwImIHMpJCXx6rAOwyClxAs6JlXHazymxOiVOIAaxOqYGTkREvpzUAiciIg0mcCIih3JEAhdCDBNC7BBC7BJCTIyDePKEEJuEEOuFEDnKtkwhxHwhRK7yf11luxBCvKbEvlEI0T2Ccb0thDguhNis2VbluIQQdyj75woh7ohirJOFEIeU93W9EGKE5rHHlFh3CCGGarZH9G9DCNFcCLFICLFVCLFFCPGgsj2u3tcAccbVeyqESBVC/CyE2KDE+ZSyvaUQYpVyzFlCiGRle4pyf5fyeHaw+KMQ60whxF7Ne9pN2R79372UMq7/AUgAsBtAKwDJADYAuDTGMeUBqK/b9iKAicrtiQBeUG6PAPAd3Gud9gGwKoJx9QfQHcDmUOMCkAlgj/J/XeV23SjFOhnAnwz2vVT5vacAaKn8PSRE428DQGMA3ZXb6QB2KvHE1fsaIM64ek+V96WWcjsJwCrlffoYwDhl+zQA9ym37wcwTbk9DsCsQPHb/Ls3i3UmgLEG+0f9d++EFnhvALuklHuklKUAPgIwKsYxGRkF4F3l9rsARmu2/1e6rQRQRwjROBIBSCkXAygIM66hAOZLKQuklKcAzAcwLEqxmhkF4CMpZYmUci+AXXD/XUT8b0NKeURKuVa5XQhgG4CmiLP3NUCcZmLynirvS5FyN0n5JwFcC2C2sl3/fqrv82wAg4QQIkD8tgkQq5mo/+6dkMCbAjiguX8Qgf8wo8FoGt2GUsojyu2jABoqt2Mdf1XjinW8Dyinn2+rZYkAMUU1VuX0/XK4W2Jx+77q4gTi7D0VQiQIIdYDOA53MtsN4LSUstzgmJ54lMfPAKgXjTiNYpVSqu/pc8p7+g8hRIo+Vl1MEYvVCQk8HgWcRle6z5virn9mvMal8SaA1gC6ATgC4OXYhuMlhKgF4FMAD0kpz2ofi6f31SDOuHtPpZQVUspuAJrB3WruEOOQTOljFUJ0AvAY3DH3grss8mis4nNCAj8EoLnmfjNlW8xI42l0j6mlEeX/48rusY6/qnHFLF4p5THlA1MJ4D/wnhLHNFYhRBLcSfEDKeVnyua4e1+N4ozX91SJ7TSARQD6wl1uUJd41B7TE4/yeAaAk9GMUxfrMKVcJaWUJQDeQQzfUyck8NUA2ipXqZPhvpDxVayCEebT6H4FQL26fAeAL5XbXwG4XblC3QfAGc2pdzRUNa7vAVwnhKirnG5fp2yLON21gV/COz3xVwDGKT0SWgJoC+BnROFvQ6m3zgCwTUo5VfNQXL2vZnHG23sqhMgSQtRRbtcAMATuev0iAGOV3fTvp/o+jwWwUDnjMYvfNiaxbtd8cQu4a/Xa9zS6v3s7roRG+h/cV3d3wl0rmxTjWFrBffV7A4Atajxw1+UWAMgF8AOATOm9kv2GEvsmAD0jGNuHcJ8ml8FdZ7s7lLgA3AX3RaFdAO6MYqzvKbFsVD4MjTX7T1Ji3QFgeLT+NgD0g7s8shHAeuXfiHh7XwPEGVfvKYAuANYp8WwG8FfN5+pn5b35BECKsj1Vub9LebxVsPijEOtC5T3dDOB9eHuqRP13z6H0REQO5YQSChERGWACJyJyKCZwIiKHYgInInIoJnAiIodiAicicigmcCIih/r/oXsF/K7LEE0AAAAASUVORK5CYII=\n",
            "text/plain": [
              "<Figure size 432x288 with 1 Axes>"
            ]
          },
          "metadata": {
            "needs_background": "light"
          },
          "output_type": "display_data"
        }
      ],
      "source": [
        "plt.plot(losses)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 43,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 66,
          "referenced_widgets": [
            "f225e8db3e074dfea38a0a89f710b03e",
            "dada48c7b7b549cdb63c201b6dab8340",
            "b4e393cebfd14db195c6765672612c94",
            "0e13b279425b4a40b45e4ffc00726e37",
            "654ff7cbd00f4364b84447ef37339f0b",
            "21f204431f574e9481ac773228ac757c",
            "b772845b6ef44fecbf430e98c2d6a9e6",
            "2f3f8142cfc24267b70e7aa705ee6fa1",
            "9c00155bf6b94ce2893bb5e8b333b463",
            "65790fbc80a54ca29d71f63121219f63",
            "bc0833806606434e8ea84db054b7f8f4"
          ]
        },
        "id": "RcixkfU80Ysn",
        "outputId": "52a6bc89-e902-4fd4-d63c-03b23c29b8c8"
      },
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "f225e8db3e074dfea38a0a89f710b03e",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Evaluation:   0%|          | 0/71 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "499.80787826019485"
            ]
          },
          "execution_count": 43,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "evaluate(model, test_dl)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "H3Qs51Aec229"
      },
      "source": [
        "![Results on dataset](https://c1.sfdcstatic.com/content/dam/web/en_us/www/images/einstein/publications/wikitext-2.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TAkMtao5SEYA"
      },
      "source": [
        "## Generation of text\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 44,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NTbRuE29SB_T",
        "outputId": "5bc3aa09-c2d6-4603-bc21-706e27803cd4"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "This was a the star star of . the the star star of of the the star star of , the the star star of , the the star star of , the the star star of , the the star star of , the the star star of , the the star "
          ]
        }
      ],
      "source": [
        "sentence_start = 'This was a'\n",
        "print(sentence_start, end=' ')\n",
        "\n",
        "new_token = None\n",
        "states = (torch.zeros(lstm_layers, 1, lstm_dim).to(device),\n",
        "              torch.zeros(lstm_layers, 1, lstm_dim).to(device))\n",
        "loss_fn = torch.nn.CrossEntropyLoss()\n",
        "  \n",
        "while new_token != '<eos>' and len(sentence_start) < 200:\n",
        "  token_ids = bpemb_en.encode_ids(sentence_start)\n",
        "  batch = collate_batch_bilstm([{'input_ids': token_ids}])\n",
        "  logits, states = model(batch[0].to(device), batch[1].to(device), states)\n",
        "  logits = logits.detach().cpu().numpy()[-1]\n",
        "  \n",
        "  new_token_ids = np.argsort(logits, axis=-1)[::-1]\n",
        "  token_id = new_token_ids[0]\n",
        "  word = bpemb_en.decode_ids([int(token_id),])\n",
        "  print(word[0], end = ' ')\n",
        "  sentence_start = sentence_start + \" \" + word[0]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1JEopLHVTyKf"
      },
      "source": [
        "## Check perplexity of a sentence"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 45,
      "metadata": {
        "id": "ptMZ6wrYT1KZ"
      },
      "outputs": [],
      "source": [
        "def get_sentence_perplexity(sentence, model, vocabulary, seq_len):\n",
        "  states = (torch.zeros(lstm_layers, 1, lstm_dim).to(device),\n",
        "              torch.zeros(lstm_layers, 1, lstm_dim).to(device))\n",
        "  token_ids = [{'input_ids': bpemb_en.encode_ids(sentence)}]\n",
        "  batch = collate_batch_bilstm(token_ids)\n",
        "  loss_fn = torch.nn.CrossEntropyLoss()\n",
        "  logits, states = model(batch[0].to(device), batch[1].to(device), states)\n",
        "\n",
        "  target = batch[2].to(device)[:len(token_ids[0]['input_ids'])-1]\n",
        "  loss = loss_fn(logits, target.reshape(-1))\n",
        "  loss = loss.detach().cpu().numpy()\n",
        "  return np.exp(loss)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 46,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nz9VeUg2jypl",
        "outputId": "76629af2-ccb1-41d9-c3f7-e5d6192ebcc7"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "1096.5176"
            ]
          },
          "execution_count": 46,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "get_sentence_perplexity('I want to buy some potatoes from the airport.', model, vocabulary, seq_len)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 47,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1AklQNHQrYfk",
        "outputId": "c036bbe5-63f2-48b4-ea52-635708ca5fa3"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "1489.3047"
            ]
          },
          "execution_count": 47,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "get_sentence_perplexity('jibberish ? . something something is', model, vocabulary, seq_len)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MtHm1h11KAEJ"
      },
      "source": [
        "References:\n",
        "- Bucket Iterator https://torchtext.readthedocs.io/en/latest/data.html#bucketiterator\n",
        "- Packing https://github.com/HarshTrivedi/packing-unpacking-pytorch-minimal-tutorial"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cAY1VYG55dgE"
      },
      "source": [
        "# Transformers"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ExnICM_L5gGX"
      },
      "source": [
        "## Recap\n",
        "\n",
        "**Problems with using Bi-LSTMs to learn contextual word embeddings**:\n",
        "* Recurrent neural networks are difficult to train\n",
        "  * Vanishing gradients (LSTM; Hochreiter & Schmidhuber 1997)\n",
        "  * Exploding gradients (norm rescaling; Pascanu et al. 2013)\n",
        "* Proposal: Transformer Networks\n",
        "  * Idea 1: Encode words individually with feed-forward neural networks\n",
        "    * Shorter path for gradient back-propagation\n",
        "    * Parallel computation possible\n",
        "  * Idea 2: Replace recurrence function with a positional encoding\n",
        "    * Fixed-length vectors, similar word embeddings, that represents the position\n",
        "  * Current base architecture used for all state-of-the-art NLP models\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4MsItzYS56mA"
      },
      "source": [
        "## [Summary of Transformer Models](https://huggingface.co/transformers/model_summary.html)\n",
        "\n",
        "Apart from improveemnts of the core Transformer architecture, Transformer models differ in the *objectives* they are trained to optimise:\n",
        "\n",
        "* Decoders/autoregressive models -- pretrained on language modeling task, i.e. predict the next word, given the preceding context. \n",
        "* Encoders/autoencoding models -- pretrained to reconstruct corrupted text, i.e. \n",
        "* Sequence-to-sequence models -- use both an encoder and a decoder for text-to-text tasks\n",
        "\n",
        "One architecture can be used for training with different objectives. The used objective makes a model more suitable for particular tasks, e.g., **autoregressive models are most suitable for generation tasks**, while autoencoding models -- for classification tasks, and sequence-to-sequence models for translations, summarisation and other text-to-text tasks."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "M29abqnB8lkV"
      },
      "source": [
        "\n",
        "![LMs](https://imgs.developpaper.com/imgs/4055417039-5e704fdcc441a_articlex.gif)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1YRThOeT8pbY"
      },
      "source": [
        "## Fine-tuning Transformer Models \n",
        "\n",
        "We can directly use pretrained Transformer Models (search for suitable models on [HuggingFace's Models page](https://huggingface.co/models)) and fine-tune them to perform our task at hand.\n",
        "\n",
        "We can also fine-tune the selected Transformer Model with intermediate in-domain data and then use it for the end task.\n",
        "* Benefit of using unlabelled in-domain data to fine-tune the model with its original objective.\n",
        "* Especially useful when there is limited labelled data for training the end task."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BQldkcV7iUcK"
      },
      "source": [
        "### Intermediate Fine-tuning with original LM objective and unsupervised data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 48,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 104,
          "referenced_widgets": [
            "c1b56124d7234834974539d927c5b197",
            "6d27d0d5783f459eaff3e856dd71f9c2",
            "7eb3f361e8e547f7a940601701ebaa17",
            "0ee650253b62481c84d370f43789b1cf",
            "35285ee23f78457eab320630f7db261b",
            "d46ed86942254815b4faaba2fcfc4ea8",
            "0f407ace25264af2998cb4124a6eb33d",
            "2d25a1092e3946588fd9e8f7a4a43c73",
            "eaf4be1c2b744586b8fe05bd9ad72470",
            "a3bf2721908b4aed9dd0f4afb9d49a1d",
            "e4cf62bee9d9461f9ccd149009267946"
          ]
        },
        "id": "A8JixiOV8zSA",
        "outputId": "9a2f57ea-1bd0-477a-9245-58fbec618916"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Moving 0 files to the new cache system\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "c1b56124d7234834974539d927c5b197",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "0it [00:00, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "import transformers\n",
        "from transformers import AutoTokenizer\n",
        "from torch.utils.data import Dataset\n",
        "from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification\n",
        "from transformers import Trainer, TrainingArguments\n",
        "import math"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 49,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 177,
          "referenced_widgets": [
            "67f8d1ff24bf4cd09d526a6b94f7ba43",
            "8056b129d5fc494a87473b7e5b02c100",
            "27d968707968423b8ab9a4a5545bddf4",
            "255e5dc08158435ab0fc79e13ad7f05f",
            "cd38df23534a4341a49cd77c7886be0a",
            "643c33c018434ebebb825d7d652641b3",
            "1ccb76a86dd1435e9556974395ec31b7",
            "d4b1ccf87ff54d8bb02ad2818b57fd3e",
            "4b3e67bf6fef45e9aa75a02cb7540457",
            "8586f18248744aa8a8f5fcd95799a1c6",
            "d42ef75d7ad84d39865733abe50955d1",
            "905ca4b515b24b45aa5751f6cbcf4136",
            "b02763daa89348ff8c16a20c9122cfb1",
            "65d6d31358ee428fb6f266ba6eaf2289",
            "9cf9ab3318db47e69c4d673d661a9a3d",
            "0cd6c1c82b144cd4b519bdefbbfda8a8",
            "7c54b11e67204d5a899b9d41ffb68661",
            "710170897a94420eaac2fe0c51f0cba0",
            "0dba056d414d40ce8abc36f1c4e7c416",
            "1b6a26e449d14b28ad064fb3b1a81ad7",
            "fa2520db173344378e9b9117cf5f1a2b",
            "fae809c8b6364c96ac9c785f994a717c",
            "245e8439aa58470da5296177953867df",
            "97267e0aca2245b4b110630483048203",
            "3944d507674648a38143a1692e665e5b",
            "0567489996aa48a4920dd41895c18a59",
            "8c3a7d753dbf40c88e7e60d941975b01",
            "95cea981a463401d806003c613d52ad7",
            "d09db33af4fd41bb914f298b63891f29",
            "333f38d2519f48cd8bfb793a5c03b5ec",
            "85cd7693fa7e44c99d1150995c47b9ba",
            "00aff87aa8da4de8bbf8d08d9d568dd3",
            "a91a5c7d4f39461abb1de38b077e9834",
            "d51a3173d7994f218c243f1e8e60f85a",
            "9e1246820ce94528b643b4a702a7713f",
            "d4b5280c66f74fc4ac51d313ccbebb0d",
            "0eb4a54c94174591aff7cd9d2a679bce",
            "63632ad73af341b5b4691fdb8b04c9cc",
            "335d7dbe317f4f01863463fc5bfff3bc",
            "a645e4c03dd54e0b99965b25a2dde813",
            "ad85df138a194809a95366c8a03a4916",
            "928348c0195d4adb8c7230637993b962",
            "93ae29400ced4c6a8307258522ea507f",
            "597e1a1e5ff24c23b57d386a680bfc04",
            "d7069d155a4b466cbab9cd938e5495a9",
            "083bd41bfd4f414fbb7b31e0d6670cb6",
            "375b3b42459d454a8f5f3a47a5b1a9f8",
            "310b1b662cf544c3b94d06f9553bc06d",
            "5d356fdf0c99400c85631a8c8f57b7a4",
            "cdc57e386c5b47faa5002723e7631b69",
            "a82ecfb2b9ee4293aac35f2d59f1fed6",
            "25a90247a143423b80ff7d5771edc07c",
            "1d3c649a81b540c0ad10266265df07ee",
            "d2b2939b79b54844a3efaea78a9f67f3",
            "30dfe2d0ee8546c594ac87f782023c06"
          ]
        },
        "id": "HLHUjXwr5fM4",
        "outputId": "29310433-d6a7-4def-eecb-023a48a29525"
      },
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "67f8d1ff24bf4cd09d526a6b94f7ba43",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "905ca4b515b24b45aa5751f6cbcf4136",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "245e8439aa58470da5296177953867df",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "d51a3173d7994f218c243f1e8e60f85a",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "d7069d155a4b466cbab9cd938e5495a9",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading:   0%|          | 0.00/353M [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "model_checkpoint = \"distilgpt2\"\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)\n",
        "model = AutoModelForCausalLM.from_pretrained(model_checkpoint)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 50,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 202,
          "referenced_widgets": [
            "39281156f77c431cb8616afc69e8c347",
            "95031a84b0c24b57884681998bc9fa63",
            "ae39082921ff496da363436ebbd2caae",
            "38bc945dea474f32af8f4e4b882f6705",
            "720f243976994fea8af15ee8dc88e891",
            "28b9fe3e6ebb4bc6a9a2b3754591d1c1",
            "4a94be658c0f4c128f0bd69df5190acc",
            "2915d114e3ec44379ca373573adca26f",
            "7e2d0630ca584ce49a71779e2f020dd2",
            "1346382a44924d50914f924a0bfa7f83",
            "5f2378ac24a54ed5a44ce408105e03f9",
            "4acb4ce089da46e7af8ce37c9b8fd1f9",
            "c8036303ce8e4da79ab5d99263a11e78",
            "bd1476cdc3024cb9a6d8938f6c27a31c",
            "e44fdc4c3c1d4836a320166d90c64f54",
            "d8145a97de9c4e0baea5e7191c4593ed",
            "7d5ace29125742ca9ad6ce7f865bb361",
            "2907fb178f764ed8a120ab90f020e1eb",
            "3f2780e94e2948999b8b4e815b5347bd",
            "501f260ce5ad4bb488decedcb8913dd0",
            "28dc3ee7bc0440618140338f3d08114c",
            "1b054e3fc71e4471bca6fe11b9f1ce08",
            "56aa4a43cdc34f8fb23c74af448e65ac",
            "58f8e19704784d3db386c3c8bd9e9c3c",
            "633d41b734a348edafce8125d861272b",
            "04c5777299a94baba0c78971cd7750bf",
            "7ac8f8a46ecd4419890e6b08d7565cd1",
            "fa6650d3599c4994b68467962a9e737e",
            "40a4b58e5dfb4cb8b8da89522e7601b1",
            "77e299845ab942238858bb1cd1daf10a",
            "f4d01de4042a468daa04ae2d0a6163cb",
            "a7e59a7bb9964d3fa1f82a5fd670b2aa",
            "fd777af1ea5043bfafcc46baecf5688b",
            "136e87d781274c56be92083a912ea519",
            "d129b102c5434b849e9e59e87048229f",
            "a0de869c95024cfcac07c6b89050602d",
            "bae319dddbf346f5bcd0f14c8dbafc9c",
            "67afcc2a1fbb4ce3a46e04df66d1b9da",
            "4624d01260b94a09a0ac581c8830e853",
            "5f9340806d3d421bbfb9b0d2e70cdae8",
            "abe78c0b3db44fffb7e559e2b8060081",
            "adff9d1a740f48bf82bf5836041fd786",
            "5aee8992f6a24451ab298eb4b853fa35",
            "eddef2d4b5874071b80571b0dc9f76cc",
            "914d25a2ba3a4e11b109c4e35cee57b8",
            "81c1b8020ce540f2889417c90dd4abd7",
            "8afbcb42cebb4081b45816e557dbb6ce",
            "3378f0014ab64554b4edad854051f44d",
            "9b9b485625754a14abd9395d1529870f",
            "130e8632518946d58feb37e4b71be3e0",
            "112e3ebf1d534dcebeb5d82d63d48653",
            "73254ba11adc4756b0988a7211229cbf",
            "ab9568cae5c24316a67c0dd07da13cc2",
            "efc5ccb46f7d4eca93eb65e2f875237e",
            "62afb681157a439fb7f36dad9406c5aa",
            "aadc0da0256e47aeb6f8e727494d8ac5",
            "df86d8bceea14d94a8d3e6340435d1a7",
            "95e407fbcc024e2e957e10467b63d639",
            "711f3d9acb9e4888a1ea33a01eaf6006",
            "aaa4f5b75b15411ca9a3c375917e80d0",
            "1b49d030e8bd4949b1991e979d5b0b5e",
            "0cbc45e3054247868500f78e1cd93f66",
            "6b5307c6c5244d6e9a140f4e477696ff",
            "4a5bbe7654a3466d8b2c404cadf0f2f3",
            "582ad0d6a26b48e294f55c0c7ae57e29",
            "557fd4cb70ab4a86b797f691b789e66b"
          ]
        },
        "id": "n5OyjD-oZsri",
        "outputId": "e1174a5b-3dda-4458-8c6a-6426eb680ac7"
      },
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "39281156f77c431cb8616afc69e8c347",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "4acb4ce089da46e7af8ce37c9b8fd1f9",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "56aa4a43cdc34f8fb23c74af448e65ac",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "136e87d781274c56be92083a912ea519",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "914d25a2ba3a4e11b109c4e35cee57b8",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "aadc0da0256e47aeb6f8e727494d8ac5",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.\n",
            "dict_keys(['train', 'test'])\n",
            "{'text': 'In preparation for writing this comment, which I am compelled to write for reasons which will become clear, I read a fair few of the major critics to see, if on this occasion, they had seen and felt what I had.<br /><br />James Bernadelli suggests this film should be known as the \"Pursuit of Richness\"; another column I have read suggests that the money-will-solve-your-problems resolution is depressing and not the message Hollywood (or art in general) should be trying to put across.<br /><br />I ask myself when the last occasion might have been that any of these critics genuinely had to :spoiler: run, under painful, embarrassing duress, from a cab - because they didn\\'t have the money; or when the forces of life seemed to conspire against them so unfairly that they broke down in tears. Spoiler: when your wealthy boss, who you CANNOT disappoint, asks you to borrow the last five dollars in your wallet, and you know that that money is all you have in the world - to feed your family, to pay your gas.<br /><br />You go through times in life when you feel that things couldn\\'t possibly get worse - and then they do - and then they do again. Sometimes, (like Chris and son at the beach towards the end of the film) you just want to get away from it all, other times you cry. Sometimes, and this is rare, you laugh - because if you don\\'t you\\'ll cry. You know that if you let it all become too much, you will sink to the bottom of the sea.<br /><br />Chris Gardner (not the real one perhaps, but the one in this movie) is my personal hero; my shining example; my inspiration - the guy that never allows himself to sink - even when he can feel his shoelaces trailing on the seabed.<br /><br />I realised as I was watching this film that I was watching another me, so I never once stopped rooting for Chris. When he :spoiler: fixes the scanner in the shelter, I felt like I had fixed it.<br /><br />I\\'m still working on getting the job that earns me enough to have a less painful life - when Chris finally achieves it, I want it for him so badly that I feel it in the very fibre of my being.<br /><br />I suppose that it doesn\\'t matter that much that Chris wants to succeed as much for his son as for himself. In a way, the fact that Chris has a son makes this movie emotionally frightening and if (and only if) our basest fears are being toyed with by the director, it is only in exactly the same way that we sometimes look up in to the sky and say - like Jim Carrey in \"Bruce Almighty\" - is there anything else you could possibly do to me today? In my life, God doesn\\'t appear and explain why he keeps toying with me. The truest belief is in oneself and one must never lose it.<br /><br />I adore this movie and I couldn\\'t be more grateful for its existence. It comes at a time when I need it most. Its message: Keep going - never, ever give up.<br /><br />And so I come back to those reviewers. Money certainly doesn\\'t mean happiness in this existence, but no money, in this cruel capitalist world, can cost you your life. Do not be judgmental of those who want more, it might just be enough to pay for their son to eat, to keep a roof over his head, to keep their dignity. This film is not about rags-to-riches as some reviewers have said. It\\'s not about the American dream either. It is about dignity and integrity and how money (or lack thereof) could easily strip you of both.<br /><br />Chris Gardner never loses either and for this, he is an inspiration to us all.<br /><br />See this film TODAY.', 'label': -1}\n"
          ]
        }
      ],
      "source": [
        "unsupervised_imdb = load_dataset('imdb', split='unsupervised')\n",
        "unsupervised_imdb_splits = unsupervised_imdb.train_test_split(test_size=0.01)\n",
        "print(unsupervised_imdb_splits.keys())\n",
        "print(unsupervised_imdb_splits['train'][0])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 51,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 827,
          "referenced_widgets": [
            "df24fe5b6aa44d4481ed70aa7c6377f3",
            "f68ef66040a94595977a42dbc5c8b19d",
            "f6ed4e386d144f958cc5168f8d9b1bd9",
            "da8bbdca5a754f3fb14e7fd9fc9cff7b",
            "9be3427ce86f42539f16818ccfa6c3cc",
            "e265241f4cb14c7da4d69b7c3c80dd1b",
            "670bd75a22554251ac67949c6fc381ba",
            "e403cb337922477baa4d5fb4108849cf",
            "2066d2cb2cf04913b28716fa902e2cff",
            "a69083dfe1054d54a5b72f25d29265fb",
            "bfd8e388fa5b46d8851bab8a7ba4ef43",
            "54eae0966bf24f2192e3dc3ff9b80bf8",
            "0ccd3e10f4d64e8fad01395ac31a899a",
            "f993c5889dff416085587cb3ef3a760f",
            "5c2061a0a6e0451383c85ff215ffe665",
            "4d881970a41545bda115771b2ad0d4a6",
            "b62a89cbed734750bdda7a549afd3a20",
            "20373281b1a14edd839276efdade0d18",
            "886f14656c8844f6a9f5fd54a738eb04",
            "50f215b0244743c2a646db774d346938",
            "2c26d8dda3264aac8aed334635ef0cb4",
            "8cfa34ea9359428ebdb27ffc1b902bb9",
            "809457903b7c4dc2861a462b373644cc",
            "6a42e926674c48efb3690f5b49cd226b",
            "dd2fd7a72afd4baf9cae80aa32504f0a",
            "cf67efbc053c44f3948d31b733049e03",
            "158ea2ef7daf4734bd102c881a492b79",
            "66c13afc9b4d435484b1a13b4f2692e1",
            "89c0c687e3f6410a8e418be272185a12",
            "97e9658ad5aa4275ae7859a43c12b852",
            "1841bda69a9e46c4a2f3667a050042bb",
            "ce6b0b840d3443df88a0e16e884846d9",
            "0f38bc7fb3fb4953a07fd66eb7236c33",
            "3edcb07f980a47478a06b57de2183de2",
            "ffd085b088994674acc0fe984e05e571",
            "eeef33d74b5d42c4af04c1096afcba4b",
            "1240d761fcfb4567b8e8626599e416d8",
            "05a1d1e61479479788ddee073af988d5",
            "62e72ef1c77d41c098cc7337070be33e",
            "dc7a25fe2f024892a0ecf3367267196c",
            "41beaaa8298f43a490d574432439f76b",
            "efa52057c7214dc8a9c559514956f21e",
            "e72c218791c049f0aa18b1d7b1bae20b",
            "06a5174e5a4b420ba94117c6767ec585",
            "0a8291bb5b904fce9e73f3df086ad202",
            "67dba2df05c84b40a87e004e627df480",
            "7ab9a465b9af4ed8b340f858645c137c",
            "fcb06bfa70bc4379b6d4cbaaad4ebbfb",
            "d82ca7d4506046649bf1113f37ef5256",
            "55639a2a3c8d4b3abb0a857a1d75076b",
            "a7a6ee19051948e0a06e86fa5c442340",
            "28e09de7b4eb4fc0a7d44cf22e0c60c1",
            "26ca2b53c4fd4473b0f0f39768f39e8b",
            "e361e6e179af460db17996f544d3f2e5",
            "749ff2c1863b456db12e635d7b783788",
            "b4ca3f4de9ae4259a650e12461361966",
            "07ef8e8031824fd889dbc27d844046d0",
            "779640d94d234a30a14f14fa7facf464",
            "a024c46dd786423d943c458499bd5db5",
            "14d0a9474065475f92652bef3e965da4",
            "51b608f693774a1caed9835dbb814ca2",
            "00f421ddd1df48c387398bf6dadce5c9",
            "a7328bf7533548beb0d5db7c860c5f23",
            "b6becc9caa404ba5ad415973252924ab",
            "ba966b7727bf4085b22f682d812646af",
            "c2136ef766af4a47b6e642e41b5f2c33",
            "5a4e7b6e73f44ded9fe58f430c2678ca",
            "ec6c7355fa9b4799b4474dfe7ce7685f",
            "9ec905be381343c3885378a3e81b18c9",
            "aa898114154e4f60a92e677b8592bfdd",
            "805063e3e52f4749ace7dc0ee99905d1",
            "f5b7096b9ed5424a9158fc5f24074337",
            "526685432f854d4ca085ad79fc86da60",
            "46568388596e47919690f6b431a81b8e",
            "2447193868b8481e8d81085c1a49d61c",
            "479d5070565147c9a9ff1b05e0f277a9",
            "6873d83315044b1ba17100690d2633b5",
            "8935b52c06fe4fa79f39a992d1fd761f",
            "82773007e1d343dc9cf9150dd6963050",
            "437871fdca0244548f1f4c861ced96b3",
            "bf79cf17ea4d47acb9b02a12b2805616",
            "c5c71eb89293417ab0a24daf8912f258",
            "ce419304adc14ae199faa0762a724789",
            "ccc7de38ec284d78888101b09fcd5802",
            "0acba823c1d446b5925c3dfc3650c951",
            "201223421f744d1d8264bfcbfa6e2579",
            "27fcdea841e14886bce42dda01186df3",
            "107eb8ef2eea4f0cabfdebb793c66d4f",
            "eab798ec79864728b355358247cbc26a",
            "2fe4a4e4abd54926b771274f498d8b18",
            "bac57db32e364640a733f02635734db7",
            "2fc455f209dd4b2f99b18ecc5dcf935c",
            "9e511ff014ce4bb49b89afb55961c5c9",
            "652356d1d5394c0798f075411aed94e9",
            "19598253afd64e54b3638cc11dd386a2",
            "124fb7825b7049ac805a18af1cba72b6",
            "a15bd0a4602f4a71a05591366552f772",
            "0be182e6491143c38cd0d480afadf289",
            "0c04fbb5713d40cea8e3bfe26e0cf53a",
            "50c1d5ba5340498ba326c0bf510affee",
            "032bff1f891b4d029e9498a5c2f68bb3",
            "1a687cf7b4ed48949d294a42ae4714ff",
            "ae92ddd6004a4754b99059581ada98c9",
            "a6e6b22328f54e83aeb410eb9d305a98",
            "5706a3b1976f42368e2bd130019e3d60",
            "6c3d769ca75c4729a56a655b82aa4581",
            "26b02ad2e6f14cc998a3aba99628b766",
            "a5dd042cc7c442d5b48226fe7bcfeee3",
            "ed60f8b38fa3445fafb43e2609b0ec8a",
            "fc5670d816254bfabf6d6ed0d404daf9",
            "58c4fb3602d84e46a50f3f041044278b",
            "d8ea08b42081416b97f63ff2ce00a2c3",
            "04ddb11c548e43b58a3d72e3736b1080",
            "78a5c11ac07e4228ae245145d390ce15",
            "70a8944de1c6498e8064ad3429cfecba",
            "888d5acec7e04f6286447ab5c2694bc8",
            "1112e88add954afcaeff5bd3263e355d",
            "b982996b00fb445a8991e2db3f14335e",
            "cb7170a57d8044f09452562bca587f28",
            "32512b7e84114ad0bbd143449e270a09",
            "3b85f0e184504d2e8236e2a4a588392b",
            "a6b5cfbab95c4b0ab1d5631a7b9f0545",
            "41fec251bfde4461be937ef820e2214e",
            "64c7a4aa3d7241e692a15634c3c97d4f",
            "b544ecd8f9b24ac39988e07e9cdf778a",
            "024d9ca6818048cba99591b5403547c0",
            "861943526f4c456b9380dd39d28d6854",
            "c2f9f42939d241e99d573b411de0c26f",
            "75e6f9042ce644d89db7085345553306",
            "f9de800bba02474c9b328a8292872129",
            "bbfd3a46d25149f0861a83d1b9821be1",
            "41da7d8f3fb54d3f9ea871064d779d17",
            "600415d395e94940bbb28d83de66cff8",
            "862459632dd844c3be54d204e3cceb15",
            "326480bffed64b91b74c201ea0389161",
            "f9444223e120426e9f816ba3070e58fb",
            "4ac1b086b00744f9b6cb2689478d0d0d",
            "8cd4c847275845e08e9be08baac6cea5",
            "c9ac5b153c3a4dfaa51c9ab0bff3a4eb",
            "c969d0c6ec014b5a976d2b399340b953",
            "f5dd1276afda440292e2123f2a661689",
            "de8df329e6074159ba327e452341b0c4",
            "ac96e38c17ba4ec087f22edfcdef18af",
            "42b262146a324c2f8a5aa361c1c85fac",
            "6257c88b5631469db06f2f94333c5159",
            "f83b42f8cf6a42988a1a2dde64cda0e1",
            "226c152db17c4094bfeef15d7e0c9e5c",
            "2491254d68d545cda76e5166b6101730",
            "1d062cbefabe41fc97c15242e696b842",
            "d558f5bca32d4d9387639935f3d62498",
            "a09e90e7cd8847bda54b005b8c5f019f",
            "5e15e0bb605d42688c7f4d535e0c0ed7",
            "24bc9b1ad31a4e268b1573d5707d2800",
            "a4cf49e473b847ab8c96f0c73f5eb95a",
            "3c7e3ad352324c99b4933c6d9c0eafa8",
            "217faa80c53c46cea03f775a64c7965f",
            "46bef6aa64654a9995ddb38fe71aa39d",
            "b23299679c1c4172b1f26cd4efa4e90a",
            "4cc6e08161824db3abe899c23009513c",
            "4fb110a999de4bfa9730147006cc2147",
            "28b0b940bcc54b489a7607132cc2f3b8",
            "1a26323e2c364a2daa0ba419251203b1",
            "3ca6e85dff734b5d8065820763008765",
            "5548fc1282784d26bb3f2db43ef0c8c6",
            "2eecf85cb0134bf5b94259c86fa89953",
            "4536c7d1164e49cea243f1f73e4e1e4b",
            "1a5b13651aed46819b8f81ee47b41d1b",
            "019f3988f0b24b07b9116959739e10dc",
            "422ed50cdbb64a73a7f13fed3daf3174",
            "3bbaf229a7894cc79d94d72072f2c2d1",
            "2933ac55e2d7491b96cc2d89de129fca",
            "9580b72336c7404398e64a03ee7dc019",
            "184643450acf43edaff661ebf8d38e30",
            "0dc25b5124bf4f31a515784d51be162c",
            "b62eb6401a514b118ee3ab1f67b8d790",
            "6f13d0cdcf264afbb41b9b91092840bb"
          ]
        },
        "id": "lTWOOd-Ta6KZ",
        "outputId": "906f97b2-4256-4357-aaa3-a672ca5c6b37"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "     "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "df24fe5b6aa44d4481ed70aa7c6377f3",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#0:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "   "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "54eae0966bf24f2192e3dc3ff9b80bf8",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#1:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "809457903b7c4dc2861a462b373644cc",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#3:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "3edcb07f980a47478a06b57de2183de2",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#2:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1297 > 1024). Running this sequence through the model will result in indexing errors\n",
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1286 > 1024). Running this sequence through the model will result in indexing errors\n",
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1027 > 1024). Running this sequence through the model will result in indexing errors\n",
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1115 > 1024). Running this sequence through the model will result in indexing errors\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "      "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "0a8291bb5b904fce9e73f3df086ad202",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#0:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "  "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "b4ca3f4de9ae4259a650e12461361966",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#1:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "5a4e7b6e73f44ded9fe58f430c2678ca",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#3:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "8935b52c06fe4fa79f39a992d1fd761f",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#2:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1282 > 1024). Running this sequence through the model will result in indexing errors\n",
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1198 > 1024). Running this sequence through the model will result in indexing errors\n",
            "Token indices sequence length is longer than the specified maximum sequence length for this model (1286 > 1024). Running this sequence through the model will result in indexing errors\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "      "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "eab798ec79864728b355358247cbc26a",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#0:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "  "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "50c1d5ba5340498ba326c0bf510affee",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#1:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "58c4fb3602d84e46a50f3f041044278b",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#2:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "a6b5cfbab95c4b0ab1d5631a7b9f0545",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#3:   0%|          | 0/13 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "     "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "600415d395e94940bbb28d83de66cff8",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#0:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "   "
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "42b262146a324c2f8a5aa361c1c85fac",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#1:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "3c7e3ad352324c99b4933c6d9c0eafa8",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#3:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "4536c7d1164e49cea243f1f73e4e1e4b",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "#2:   0%|          | 0/1 [00:00<?, ?ba/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "{'input_ids': [1532, 345, 1053, 1775, 262, 6833, 13637, 327, 26183, 2196, 20495, 18653, 7886, 340, 338, 1327, 284, 1234, 340, 503, 286, 534, 1182, 11, 475, 345, 2192, 815, 466, 780, 428, 530, 318, 6635, 1180, 13, 47419, 774, 468, 587, 9958, 287, 7075, 286, 10319, 12, 448, 9961, 532, 42156, 11, 46835, 290, 477, 12, 744, 22029, 1108, 13, 7477, 340, 338, 11441, 11, 13913, 88, 11, 31826, 1417, 290, 15074, 25292, 357, 20839, 597, 1866, 286, 262, 41162, 1107, 5806, 12718, 12, 3036, 1150, 15232, 33924, 475, 3805, 477, 428, 340, 318, 29381, 13206, 13, 314, 7360, 3521, 470, 11626, 3589, 1497, 422, 262, 3159, 1566, 262, 886, 286, 262, 3807, 13, 1002, 612, 338, 257, 5749, 19370, 345, 460, 1414, 284, 257, 2646, 314, 836, 470], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1532, 345, 1053, 1775, 262, 6833, 13637, 327, 26183, 2196, 20495, 18653, 7886, 340, 338, 1327, 284, 1234, 340, 503, 286, 534, 1182, 11, 475, 345, 2192, 815, 466, 780, 428, 530, 318, 6635, 1180, 13, 47419, 774, 468, 587, 9958, 287, 7075, 286, 10319, 12, 448, 9961, 532, 42156, 11, 46835, 290, 477, 12, 744, 22029, 1108, 13, 7477, 340, 338, 11441, 11, 13913, 88, 11, 31826, 1417, 290, 15074, 25292, 357, 20839, 597, 1866, 286, 262, 41162, 1107, 5806, 12718, 12, 3036, 1150, 15232, 33924, 475, 3805, 477, 428, 340, 318, 29381, 13206, 13, 314, 7360, 3521, 470, 11626, 3589, 1497, 422, 262, 3159, 1566, 262, 886, 286, 262, 3807, 13, 1002, 612, 338, 257, 5749, 19370, 345, 460, 1414, 284, 257, 2646, 314, 836, 470]}\n"
          ]
        }
      ],
      "source": [
        "block_size = 128\n",
        "def tokenize_function(examples):\n",
        "    return tokenizer(examples[\"text\"])\n",
        "\n",
        "def group_texts(examples):\n",
        "    # Concatenate all texts.\n",
        "    keys = ['attention_mask', 'input_ids']\n",
        "    concatenated_examples = {k: sum(examples[k], []) for k in keys}\n",
        "    total_length = len(concatenated_examples[list(keys)[0]])\n",
        "    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
        "        # customize this part to your needs.\n",
        "    total_length = (total_length // block_size) * block_size\n",
        "    # Split by chunks of max_len.\n",
        "    result = {\n",
        "        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
        "        for k, t in concatenated_examples.items()\n",
        "    }\n",
        "    # this is needed as the used dataset is a subclass of ClassificationDataset, which requires label as a field...\n",
        "    result[\"label\"] = result[\"input_ids\"].copy()\n",
        "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
        "    return result\n",
        "\n",
        "unsupervised_imdb_tok = unsupervised_imdb_splits.map(tokenize_function, batched=True, num_proc=4, remove_columns=[\"text\"])\n",
        "unsupervised_imdb_splits = unsupervised_imdb_splits.remove_columns(['label'])\n",
        "unsupervised_imdb_tok_lm = unsupervised_imdb_tok.map(group_texts, batched=True, batch_size=1000, num_proc=4,)\n",
        "unsupervised_imdb_tok_lm = unsupervised_imdb_tok_lm.remove_columns(['label'])\n",
        "print(unsupervised_imdb_tok_lm['test'][0])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 52,
      "metadata": {
        "id": "9HoD6AxNDZwF"
      },
      "outputs": [],
      "source": [
        "model_name = model_checkpoint.split(\"/\")[-1]\n",
        "training_args = TrainingArguments(\n",
        "    \"test-clm\",\n",
        "    evaluation_strategy = \"epoch\",\n",
        "    learning_rate=2e-5,\n",
        "    weight_decay=0.01,\n",
        "    num_train_epochs=1,\n",
        "    max_steps=300\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 53,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oY4_BQB1DemF",
        "outputId": "2d507588-7732-417b-d595-1174dff11f1c"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "max_steps is given, it will override any value given in num_train_epochs\n"
          ]
        }
      ],
      "source": [
        "trainer = Trainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    train_dataset=unsupervised_imdb_tok_lm[\"train\"],\n",
        "    eval_dataset=unsupervised_imdb_tok_lm[\"test\"],\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 54,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 106
        },
        "id": "DFqEJgVmsi61",
        "outputId": "1caac670-32de-4698-c0c4-168ace4726ae"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "***** Running Evaluation *****\n",
            "  Num examples = 1073\n",
            "  Batch size = 8\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='135' max='135' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [135/135 00:07]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Perplexity: 70.50\n"
          ]
        }
      ],
      "source": [
        "eval_results = trainer.evaluate()\n",
        "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 55,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 477
        },
        "id": "fxiNAJHaDfY6",
        "outputId": "d5c7f600-c90d-44fc-d360-770732580229"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
            "  FutureWarning,\n",
            "***** Running training *****\n",
            "  Num examples = 116288\n",
            "  Num Epochs = 1\n",
            "  Instantaneous batch size per device = 8\n",
            "  Total train batch size (w. parallel, distributed & accumulation) = 8\n",
            "  Gradient Accumulation steps = 1\n",
            "  Total optimization steps = 300\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='300' max='300' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [300/300 01:07, Epoch 0/1]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              " <tr style=\"text-align: left;\">\n",
              "      <th>Epoch</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>Validation Loss</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>0</td>\n",
              "      <td>No log</td>\n",
              "      <td>4.047292</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "***** Running Evaluation *****\n",
            "  Num examples = 1073\n",
            "  Batch size = 8\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='270' max='135' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [135/135 01:17]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
            "\n",
            "\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "TrainOutput(global_step=300, training_loss=4.1625390625, metrics={'train_runtime': 68.0522, 'train_samples_per_second': 35.267, 'train_steps_per_second': 4.408, 'total_flos': 78389025177600.0, 'train_loss': 4.1625390625, 'epoch': 0.02})"
            ]
          },
          "execution_count": 55,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "trainer.train()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 56,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 106
        },
        "id": "7USdHF8PDg_Y",
        "outputId": "7fb23f47-621c-4e8d-f4ea-9f7ad5f24df0"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "***** Running Evaluation *****\n",
            "  Num examples = 1073\n",
            "  Batch size = 8\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='135' max='135' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [135/135 00:07]\n",
              "    </div>\n",
              "    "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Perplexity: 57.24\n"
          ]
        }
      ],
      "source": [
        "eval_results = trainer.evaluate()\n",
        "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 57,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "KyuxY0ZqD4P6",
        "outputId": "e652988b-059f-408b-f80a-a94c1fb07bfe"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Configuration saved in imdb-gpt2/config.json\n",
            "Model weights saved in imdb-gpt2/pytorch_model.bin\n"
          ]
        }
      ],
      "source": [
        "model.save_pretrained(\"imdb-gpt2\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Lb9jtJBfib08"
      },
      "source": [
        "### End Domain finetuning for the target classification task.\n",
        "\n",
        "* The pretrained (and potentially finetuned on an intermediate task) model can be used in multiple ways:\n",
        "  * Use the learned weights and add a linear layer at the end for classification, e.g. for onne [CLS] token\n",
        "  * Export the contextual representations of the words and use them in another model or aggregate them in one representation with following layers for classification."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 60,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ejlj9o0Bs_Ze",
        "outputId": "74ddfd87-b2ef-422a-fff4-f373e1203c4c"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "odict_keys(['logits', 'past_key_values', 'hidden_states'])\n",
            "7\n",
            "torch.Size([1, 29, 768])\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "torch.Size([1, 768])"
            ]
          },
          "execution_count": 60,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Export contextual representations:\n",
        "\n",
        "def mean_pooling(model_output, attention_mask):\n",
        "    # Mean Pooling - Take attention mask into account for correct averaging\n",
        "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(model_output.size()).float()\n",
        "    sum_embeddings = torch.sum(model_output * input_mask_expanded, 1)\n",
        "    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n",
        "    return sum_embeddings / sum_mask\n",
        "\n",
        "encoded = tokenizer(['Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \"Teachers\".'],\n",
        "                    return_tensors='pt')\n",
        "encoded = {k:v.to(device) for k, v in encoded.items()}\n",
        "\n",
        "model_output = model(**encoded, output_hidden_states=True, return_dict=True)\n",
        "print(model_output.keys())\n",
        "print(len(model_output['hidden_states'])) # contextual representations of separate words from each of the 6 layes\n",
        "print(model_output['hidden_states'][-1].shape) # last layer with contextual representations (batch_size x num words x representation dim)\n",
        "\n",
        "# Aggregate all the representations into one\n",
        "mean_model_output = mean_pooling(model_output['hidden_states'][-1], encoded['attention_mask'])\n",
        "mean_model_output.shape"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oVKoXAPgihaz"
      },
      "source": [
        "## Generation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 61,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "J7RY6XMnK3NY",
        "outputId": "34724d83-675c-44c2-bdfc-5df8037d97c2"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
            "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/c3772e6d13ecdaf8d1105055f7c89becd6e37590/config.json\n",
            "Model config GPT2Config {\n",
            "  \"_name_or_path\": \"distilgpt2\",\n",
            "  \"_num_labels\": 1,\n",
            "  \"activation_function\": \"gelu_new\",\n",
            "  \"architectures\": [\n",
            "    \"GPT2LMHeadModel\"\n",
            "  ],\n",
            "  \"attn_pdrop\": 0.1,\n",
            "  \"bos_token_id\": 50256,\n",
            "  \"embd_pdrop\": 0.1,\n",
            "  \"eos_token_id\": 50256,\n",
            "  \"id2label\": {\n",
            "    \"0\": \"LABEL_0\"\n",
            "  },\n",
            "  \"initializer_range\": 0.02,\n",
            "  \"label2id\": {\n",
            "    \"LABEL_0\": 0\n",
            "  },\n",
            "  \"layer_norm_epsilon\": 1e-05,\n",
            "  \"model_type\": \"gpt2\",\n",
            "  \"n_ctx\": 1024,\n",
            "  \"n_embd\": 768,\n",
            "  \"n_head\": 12,\n",
            "  \"n_inner\": null,\n",
            "  \"n_layer\": 6,\n",
            "  \"n_positions\": 1024,\n",
            "  \"reorder_and_upcast_attn\": false,\n",
            "  \"resid_pdrop\": 0.1,\n",
            "  \"scale_attn_by_inverse_layer_idx\": false,\n",
            "  \"scale_attn_weights\": true,\n",
            "  \"summary_activation\": null,\n",
            "  \"summary_first_dropout\": 0.1,\n",
            "  \"summary_proj_to_labels\": true,\n",
            "  \"summary_type\": \"cls_index\",\n",
            "  \"summary_use_proj\": true,\n",
            "  \"task_specific_params\": {\n",
            "    \"text-generation\": {\n",
            "      \"do_sample\": true,\n",
            "      \"max_length\": 50\n",
            "    }\n",
            "  },\n",
            "  \"transformers_version\": \"4.22.1\",\n",
            "  \"use_cache\": true,\n",
            "  \"vocab_size\": 50257\n",
            "}\n",
            "\n",
            "loading file vocab.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/c3772e6d13ecdaf8d1105055f7c89becd6e37590/vocab.json\n",
            "loading file merges.txt from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/c3772e6d13ecdaf8d1105055f7c89becd6e37590/merges.txt\n",
            "loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/c3772e6d13ecdaf8d1105055f7c89becd6e37590/tokenizer.json\n",
            "loading file added_tokens.json from cache at None\n",
            "loading file special_tokens_map.json from cache at None\n",
            "loading file tokenizer_config.json from cache at None\n",
            "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/c3772e6d13ecdaf8d1105055f7c89becd6e37590/config.json\n",
            "Model config GPT2Config {\n",
            "  \"_name_or_path\": \"distilgpt2\",\n",
            "  \"_num_labels\": 1,\n",
            "  \"activation_function\": \"gelu_new\",\n",
            "  \"architectures\": [\n",
            "    \"GPT2LMHeadModel\"\n",
            "  ],\n",
            "  \"attn_pdrop\": 0.1,\n",
            "  \"bos_token_id\": 50256,\n",
            "  \"embd_pdrop\": 0.1,\n",
            "  \"eos_token_id\": 50256,\n",
            "  \"id2label\": {\n",
            "    \"0\": \"LABEL_0\"\n",
            "  },\n",
            "  \"initializer_range\": 0.02,\n",
            "  \"label2id\": {\n",
            "    \"LABEL_0\": 0\n",
            "  },\n",
            "  \"layer_norm_epsilon\": 1e-05,\n",
            "  \"model_type\": \"gpt2\",\n",
            "  \"n_ctx\": 1024,\n",
            "  \"n_embd\": 768,\n",
            "  \"n_head\": 12,\n",
            "  \"n_inner\": null,\n",
            "  \"n_layer\": 6,\n",
            "  \"n_positions\": 1024,\n",
            "  \"reorder_and_upcast_attn\": false,\n",
            "  \"resid_pdrop\": 0.1,\n",
            "  \"scale_attn_by_inverse_layer_idx\": false,\n",
            "  \"scale_attn_weights\": true,\n",
            "  \"summary_activation\": null,\n",
            "  \"summary_first_dropout\": 0.1,\n",
            "  \"summary_proj_to_labels\": true,\n",
            "  \"summary_type\": \"cls_index\",\n",
            "  \"summary_use_proj\": true,\n",
            "  \"task_specific_params\": {\n",
            "    \"text-generation\": {\n",
            "      \"do_sample\": true,\n",
            "      \"max_length\": 50\n",
            "    }\n",
            "  },\n",
            "  \"transformers_version\": \"4.22.1\",\n",
            "  \"use_cache\": true,\n",
            "  \"vocab_size\": 50257\n",
            "}\n",
            "\n",
            "loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/c3772e6d13ecdaf8d1105055f7c89becd6e37590/config.json\n",
            "Model config GPT2Config {\n",
            "  \"_name_or_path\": \"distilgpt2\",\n",
            "  \"_num_labels\": 1,\n",
            "  \"activation_function\": \"gelu_new\",\n",
            "  \"architectures\": [\n",
            "    \"GPT2LMHeadModel\"\n",
            "  ],\n",
            "  \"attn_pdrop\": 0.1,\n",
            "  \"bos_token_id\": 50256,\n",
            "  \"embd_pdrop\": 0.1,\n",
            "  \"eos_token_id\": 50256,\n",
            "  \"id2label\": {\n",
            "    \"0\": \"LABEL_0\"\n",
            "  },\n",
            "  \"initializer_range\": 0.02,\n",
            "  \"label2id\": {\n",
            "    \"LABEL_0\": 0\n",
            "  },\n",
            "  \"layer_norm_epsilon\": 1e-05,\n",
            "  \"model_type\": \"gpt2\",\n",
            "  \"n_ctx\": 1024,\n",
            "  \"n_embd\": 768,\n",
            "  \"n_head\": 12,\n",
            "  \"n_inner\": null,\n",
            "  \"n_layer\": 6,\n",
            "  \"n_positions\": 1024,\n",
            "  \"reorder_and_upcast_attn\": false,\n",
            "  \"resid_pdrop\": 0.1,\n",
            "  \"scale_attn_by_inverse_layer_idx\": false,\n",
            "  \"scale_attn_weights\": true,\n",
            "  \"summary_activation\": null,\n",
            "  \"summary_first_dropout\": 0.1,\n",
            "  \"summary_proj_to_labels\": true,\n",
            "  \"summary_type\": \"cls_index\",\n",
            "  \"summary_use_proj\": true,\n",
            "  \"task_specific_params\": {\n",
            "    \"text-generation\": {\n",
            "      \"do_sample\": true,\n",
            "      \"max_length\": 50\n",
            "    }\n",
            "  },\n",
            "  \"transformers_version\": \"4.22.1\",\n",
            "  \"use_cache\": true,\n",
            "  \"vocab_size\": 50257\n",
            "}\n",
            "\n",
            "loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilgpt2/snapshots/c3772e6d13ecdaf8d1105055f7c89becd6e37590/pytorch_model.bin\n",
            "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n",
            "\n",
            "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.\n",
            "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n"
          ]
        }
      ],
      "source": [
        "model_checkpoint = \"distilgpt2\"\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)\n",
        "model = AutoModelForCausalLM.from_pretrained(model_checkpoint)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 62,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "AqU-dFqcLRac",
        "outputId": "cea8b553-3948-455b-c23b-c41ff0028b20"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Output:\n",
            "----------------------------------------------------------------------------------------------------\n",
            "I was meaning to be a part of the world, and I was a part of it. I was a part of it. I was a part of it. I was a part of it. I was a part of it. I was a\n"
          ]
        }
      ],
      "source": [
        "# encode context the generation is conditioned on\n",
        "input_ids = tokenizer.encode('I was meaning to', return_tensors='pt')\n",
        "\n",
        "# generate text until the output length (which includes the context length) reaches 50\n",
        "greedy_output = model.generate(input_ids, max_length=50)\n",
        "\n",
        "print(\"Output:\\n\" + 100 * '-')\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "e22JOheyijyq"
      },
      "source": [
        "## Text Generation Approaches\n",
        "[Source 1](https://github.com/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb), [Source 2](https://towardsdatascience.com/how-to-sample-from-language-models-682bceb97277)\n",
        "\n",
        "Standard generation \n",
        "* **Greedy search** - pick the word with the highest probability as the next word at each time step.\n",
        "* The model quickly starts to repeat itself.\n",
        "  * Common problem in language generation (Vijayakumar et. al, 2016)[https://arxiv.org/abs/1610.02424],(Shao et. al, 2017)[https://arxiv.org/abs/1701.03185].\n",
        "* Misses high probability words hidden behind a low probability word.\n",
        "\n",
        "![BEAM](https://camo.githubusercontent.com/41e1d5381d477e5fe790c89e75eade680714172f/68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f7061747269636b766f6e706c6174656e2f736369656e74696669635f696d616765732f6d61737465722f6265616d5f7365617263682e706e67)\n",
        "\n",
        "\n",
        "Other generation approaches:\n",
        "* **Beam Search**\n",
        "  * Keeps a list of the most likely num_beams so far, the final text is the one with the highes score.\n",
        "  * Solves the problem of missing high probability words hidden behind a low probability words.\n",
        "  * Downside: still generates repeating words\n",
        "  * Downside: works well only in tasks where the length of the generated text is predictable and not variable - e.g. machine translation and summarisation vs. dialogue/story generation\n",
        "\n",
        "* **N-gram penalty**\n",
        "  * Reduce probability of next words that will result in n-grams that have been generated already\n",
        "  * Downside: Not always a desirable effect, some repetiotions might be ok\n",
        "\n",
        "* **Sampling**\n",
        "  * Randomly pick the next work from the generated probability distribution\n",
        "  * Ari Holtzman et al. (2019) note that high quality human language does not follow a distribution of high probability next words\n",
        "  * Downside: Output is not often incoherent \n",
        "  * Use **temperature** to shift the probability distributions\n",
        "    * Dividing logits by the temperature before feeding them into softmax \n",
        "\n",
        "* **Top-K Sampling**\n",
        "  * [(Fan et al., 2020)](https://arxiv.org/pdf/1805.04833.pdf) \n",
        "  * The K most likely next words are filtered and the probability mass is redistributed among only those K next words\n",
        "  * Used in GPT2 and can be partly attributed to its success\n",
        "  * Downside: Does not dynamically adapt the number of words \n",
        "\n",
        "![top-k-before](https://miro.medium.com/proxy/1*Fa84hdkDEOT02Kz-S5IWuw.png)\n",
        "![top-k-after](https://miro.medium.com/proxy/1*eGdyEZ_SMa5FH1ElY6Ro7w.png)\n",
        "\n",
        "* Top-p Nucleus Sampling\n",
        "  * Instead ot top-k, chooses from the smallest possible set of words whose cumulative probability exceeds the probability p.\n",
        "  * Appears to improve quality by removing the tail and making it less likely to go off-topic.\n",
        "  * Dynamic number of words."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 63,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "T7bx10aCSpAv",
        "outputId": "06a3cc26-32f5-4d65-8749-55dbc85285a2"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Beam Search Output:\n",
            "0: I was going to have to do a lot of work to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I\n",
            "1: I was going to have to do a lot of work to make sure I had enough time to make sure I had enough time to do a lot of work to make sure I had enough time to make sure I had enough time to make sure I had\n",
            "2: I was going to have to do a lot of work to make sure I had enough time to make sure I had enough time to make sure that I had enough time to make sure that I had enough time to make sure that I had enough time to\n",
            "3: I was going to have to do a lot of work to make sure I had enough time to make sure I had enough time to make sure that I had enough time to make sure I had enough time to make sure I had enough time to make sure\n",
            "4: I was going to have to do a lot of work to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I was enough time to make sure I\n"
          ]
        }
      ],
      "source": [
        "input_ids = tokenizer.encode('I was going', return_tensors='pt')\n",
        "\n",
        "beam_outputs = model.generate(input_ids, max_length=50, num_beams=5, num_return_sequences=5)\n",
        "print(\"Beam Search Output:\")\n",
        "for i, beam_output in enumerate(beam_outputs):\n",
        "  print(\"{}: {}\".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 64,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zx3sf9MJTbQh",
        "outputId": "0e5d8a8e-32d5-4382-d75d-2c34e1842b86"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Beam Search Output:\n",
            "I was going to have to do a lot of work to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I had enough time to make sure I\n",
            "N-Gram Penalty on Beam Search Output:\n",
            "I was going to have to do a lot of work to make sure I had the right amount of time to work on this project.”\n",
            "\n",
            "“I’m not sure if I can do that, but I think it�\n"
          ]
        }
      ],
      "source": [
        "input_ids = tokenizer.encode('I was going', return_tensors='pt')\n",
        "\n",
        "output = model.generate(input_ids, max_length=50, num_beams=5)\n",
        "print(\"Beam Search Output:\")\n",
        "print(tokenizer.decode(output[0], skip_special_tokens=True))\n",
        "\n",
        "output = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2)\n",
        "print(\"N-Gram Penalty on Beam Search Output:\")\n",
        "print(tokenizer.decode(output[0], skip_special_tokens=True))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 65,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bbYI-3lFXDeM",
        "outputId": "b261b38f-26cc-467d-a2e2-5760835ef47b"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Sampling Output:\n",
            "I was going to lose something on that night. After I got hurt, I kept being hurt.\"\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Sampling Output:\n",
            "I was going about every hour from morning with three very experienced athletes from Florida right where they would meet face for face – in-line football coaches everywhere and everything is on line like some team.\" So with your support now coming from Jacksonville the guys have\n",
            "Sampling Output:\n",
            "I was going to have to go to the police station. He was going to have to go to the police station. He was going to have to go to the police station. He was going to have to go to the police station. He was\n"
          ]
        }
      ],
      "source": [
        "# Higher temperature -- more random samples, lower -- more deterministic\n",
        "\n",
        "input_ids = tokenizer.encode('I was going', return_tensors='pt')\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True)\n",
        "print(\"Sampling Output:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, temperature=10.)\n",
        "print(\"Sampling Output:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, temperature=0.5)\n",
        "print(\"Sampling Output:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 66,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "s1NBHpZKIGi5",
        "outputId": "886d8285-7289-4848-c3aa-be5bd628d92b"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Tok-K Sampling:\n",
            "I was going to go to school at 12 and go to work at 8. It was also good for me personally.\n",
            "\n",
            "\"We are really hoping we will have something to do with our new location to raise awareness to where this is happening,\"\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Tok-K Sampling:\n",
            "I was going to be doing a show about how I‪re a good guy, I don‪e have the right to do that‪e be the show‪e be the talk show‪e be the talk show�\n",
            "Tok-K Sampling:\n",
            "I was going to live here,\" she said.\n",
            "\n",
            "\n",
            "\n",
            "\"So she wasn't ready for New York City?\" her husband asked.\n",
            "\"I was sleeping with nothing and her husband wanted me and I could not even move out of fear\n"
          ]
        }
      ],
      "source": [
        "input_ids = tokenizer.encode('I was going', return_tensors='pt')\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, top_k=50)\n",
        "print(\"Tok-K Sampling:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, top_k=10)\n",
        "print(\"Tok-K Sampling:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, top_k=100)\n",
        "print(\"Tok-K Sampling:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 67,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "74X0cw-jIee2",
        "outputId": "bc1539df-7f9b-4e7a-e383-6702a14bd240"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Tok-p Nucleus Sampling:\n",
            "I was going to have to go to the hospital, and I'm going to have to go to the hospital.\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Tok-p Nucleus Sampling:\n",
            "I was going to need to beat me and because that fight would probably end without me.\n",
            "\n",
            "\n",
            "In his bout with Steve Luger, Sergey Kovalev gave everything that made him the No. 1 heavyweight in the country and thought he would\n",
            "Tok-p Nucleus Sampling:\n",
            "I was going to keep running, survive straight for half a year.›\n",
            "\n",
            "\n",
            "As evil as I was, they seemed to just sort to just return to life, escape from their prison captors, flee the country, and then back\n"
          ]
        }
      ],
      "source": [
        "input_ids = tokenizer.encode('I was going', return_tensors='pt')\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, top_k=0, top_p=0.2)\n",
        "print(\"Tok-p Nucleus Sampling:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, top_k=0, top_p=0.92)\n",
        "print(\"Tok-p Nucleus Sampling:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))\n",
        "\n",
        "greedy_output = model.generate(input_ids, max_length=50, do_sample=True, top_k=0, top_p=0.99)\n",
        "print(\"Tok-p Nucleus Sampling:\")\n",
        "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True, do_sample=True))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TAofc_BiiqrZ"
      },
      "source": [
        "# Further reading material:\n",
        "* [Calculation of perplexity for Transformer models](https://huggingface.co/transformers/perplexity.html)\n",
        "* [Current state-of-the-art in Language Modeling](http://nlpprogress.com/english/language_modeling.html)\n",
        "* Language models learn some language specific skills, syntactic grammatical relationships and anaphoric coreference, speech, syntactic chunks, and\n",
        "roles.[(Manning et al., 2020)](https://www.pnas.org/content/pnas/117/48/30046.full.pdf), [(Rogers et al., 2020)](https://aclanthology.org/2020.tacl-1.54/)"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "collapsed_sections": [
        "RF-oSG9m5Zr5"
      ],
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}