{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "36e71407",
   "metadata": {},
   "source": [
    "## Question 1\n",
    "\n",
    "### Q1.1\n",
    "\n",
    "$$\\begin{align*}\\text{buy} &= [1,0,0,0,0,0] \\\\ \\text{bought} &= [0,1,0,0,0,0] \\\\ \\text{girl} &= [0,0,1,0,0,0] \\\\ \\text{woman} &= [0,0,0,1,0,0] \\\\ \\text{word} &= [0,0,0,0,1,0] \\\\ \\text{words} &= [0,0,0,0,0,1] \\end{align*}$$\n",
    "\n",
    "### Q1.2\n",
    "\n",
    "1. The dimension of the embedding linearly increases with the vocabulary size.\n",
    "2. Not involve semantic features.\n",
    "\n",
    "### Q1.3\n",
    "\n",
    "Represent the words as {00, 01, 10, 11}."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80f9ea18",
   "metadata": {},
   "source": [
    "## Question 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1982c0b7",
   "metadata": {},
   "source": [
    "### Q2.1\n",
    "\n",
    "- bi-grams: \n",
    "    - \"CS6493\": \"taking CS6493\", \"CS6493 this\"\n",
    "    - \"NLP\": \"studying NLP\", \"NLP is\"\n",
    "- tri-grams:\n",
    "    - \"CS6493\": \"am taking CS6493\", \"taking CS6493 this\", \"CS6493 this semester\"\n",
    "    - \"NLP\": \"and studying NLP\", \"studying NLP is\", \"NLP is really\"\n",
    "\n",
    "### Q2.2\n",
    "\n",
    "1. Sparse feature space;\n",
    "2. Only suitable to the large training dataset;\n",
    "3. Cannot interpret unseen words;\n",
    "4. Sensitive to the hyper-parameter `N`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c35ed258-87a7-4bfb-b585-09fc0169c5b0",
   "metadata": {},
   "source": [
    "### Question 2.3 4-Gram Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "9ebe6f58-bea8-4ff3-a2b5-30afb57c4354",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1, Loss: 22.401702046394348\n",
      "Epoch 2, Loss: 19.851872205734253\n",
      "Epoch 3, Loss: 17.524168014526367\n",
      "Epoch 4, Loss: 15.372764229774475\n",
      "Epoch 5, Loss: 13.371510922908783\n",
      "Epoch 6, Loss: 11.53460431098938\n",
      "Epoch 7, Loss: 9.843500971794128\n",
      "Epoch 8, Loss: 8.309678226709366\n",
      "Epoch 9, Loss: 6.957409352064133\n",
      "Epoch 10, Loss: 5.794081926345825\n",
      "Training complete!\n",
      "The training loss for embedding_dim=32 is 5.794081926345825\n",
      "Next word prediction: CS6493\n",
      "Epoch 1, Loss: 23.116318225860596\n",
      "Epoch 2, Loss: 18.840309739112854\n",
      "Epoch 3, Loss: 15.208776473999023\n",
      "Epoch 4, Loss: 12.037311911582947\n",
      "Epoch 5, Loss: 9.362350881099701\n",
      "Epoch 6, Loss: 7.122990161180496\n",
      "Epoch 7, Loss: 5.349436938762665\n",
      "Epoch 8, Loss: 4.025747239589691\n",
      "Epoch 9, Loss: 3.078266069293022\n",
      "Epoch 10, Loss: 2.413187339901924\n",
      "Training complete!\n",
      "The training loss for embedding_dim=64 is 2.413187339901924\n",
      "Next word prediction: CS6493\n",
      "Epoch 1, Loss: 22.857290029525757\n",
      "Epoch 2, Loss: 16.749339699745178\n",
      "Epoch 3, Loss: 12.256677389144897\n",
      "Epoch 4, Loss: 8.72059839963913\n",
      "Epoch 5, Loss: 6.024809896945953\n",
      "Epoch 6, Loss: 4.136698365211487\n",
      "Epoch 7, Loss: 2.900818020105362\n",
      "Epoch 8, Loss: 2.115286648273468\n",
      "Epoch 9, Loss: 1.6096487492322922\n",
      "Epoch 10, Loss: 1.272818386554718\n",
      "Training complete!\n",
      "The training loss for embedding_dim=128 is 1.272818386554718\n",
      "Next word prediction: CS6493\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "\n",
    "# Define the training data (4-grams) and vocabulary\n",
    "text = 'I am taking CS6493 this semester and studying NLP is really fascinating'\n",
    "text_list = text.split()\n",
    "text_list_length = len(text_list)\n",
    "vocab = set(text_list)\n",
    "vocab_size = len(vocab)\n",
    "training_data = []\n",
    "\n",
    "for i in range(0, text_list_length-3):\n",
    "    context = ' '.join(text_list[i:i+3])\n",
    "    next_word = text_list[i+3]\n",
    "    training_data.append((context, next_word))\n",
    "\n",
    "word_to_ix = {word: i for i, word in enumerate(vocab)}\n",
    "\n",
    "# Hyperparameters\n",
    "embedding_dims = [32, 64, 128]\n",
    "context_size = 3\n",
    "hidden_size = 128\n",
    "learning_rate = 0.01\n",
    "epochs = 10\n",
    "\n",
    "# Create the language model\n",
    "class LanguageModeler(nn.Module):\n",
    "    def __init__(self, vocab_size, embedding_dim, context_size, hidden_size=128):\n",
    "        super(LanguageModeler, self).__init__()\n",
    "        self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
    "        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_size)\n",
    "        self.linear2 = nn.Linear(hidden_size, vocab_size)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        embeds = self.embeddings(inputs).view((1, -1))\n",
    "        out = F.relu(self.linear1(embeds))\n",
    "        out = self.linear2(out)\n",
    "        log_probs = F.log_softmax(out, dim=1)\n",
    "        return log_probs\n",
    "\n",
    "for embedding_dim in embedding_dims:\n",
    "    model = LanguageModeler(vocab_size, embedding_dim, context_size, hidden_size)\n",
    "    \n",
    "    # Loss and optimizer\n",
    "    loss_function = nn.NLLLoss()\n",
    "    optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
    "    \n",
    "    # Training loop\n",
    "    for epoch in range(epochs):\n",
    "        total_loss = 0\n",
    "        for context, target in training_data:\n",
    "            context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n",
    "            model.zero_grad()\n",
    "            log_probs = model(context_idxs)\n",
    "            # print(log_probs)\n",
    "            target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)\n",
    "            loss = loss_function(log_probs, target_idx)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            total_loss += loss.item()\n",
    "        print(f\"Epoch {epoch+1}, Loss: {total_loss}\")\n",
    "    \n",
    "    print(\"Training complete!\")\n",
    "    print(f\"The training loss for embedding_dim={embedding_dim} is {total_loss}\")\n",
    "    \n",
    "    # Example usage to predict the next word\n",
    "    context = \"I am taking\"\n",
    "    context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n",
    "    log_probs = model(context_idxs)\n",
    "    predicted_idx = torch.argmax(log_probs).item()\n",
    "    predicted_word = [word for word, idx in word_to_ix.items() if idx == predicted_idx][0]\n",
    "    print(f\"Next word prediction: {predicted_word}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8846baa-8135-4e0f-aba4-ead6cdcb7af4",
   "metadata": {},
   "source": [
    "### Question 3.1 CBOW Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "1e25fca2-bf10-444a-9056-4680789fc372",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1, Loss: 21.519867658615112\n",
      "Epoch 2, Loss: 13.469220399856567\n",
      "Epoch 3, Loss: 8.231013298034668\n",
      "Epoch 4, Loss: 4.947874903678894\n",
      "Epoch 5, Loss: 3.111188143491745\n",
      "Epoch 6, Loss: 2.105540543794632\n",
      "Epoch 7, Loss: 1.534349948167801\n",
      "Epoch 8, Loss: 1.1862706989049911\n",
      "Epoch 9, Loss: 0.9588550999760628\n",
      "Epoch 10, Loss: 0.7983578592538834\n",
      "Training complete!\n",
      "The training loss for embedding_dim=128 is 0.7983578592538834\n",
      "Target word prediction: CS6493\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "\n",
    "# Define the training data and vocabulary\n",
    "text = 'I am taking CS6493 this semester and studying NLP is really fascinating'\n",
    "text_list = text.split()\n",
    "text_list_length = len(text_list)\n",
    "vocab = set(text_list)\n",
    "vocab_size = len(vocab)\n",
    "training_data = []\n",
    "\n",
    "window_size = 2 # the context length is window_size * 2\n",
    "\n",
    "for i in range(window_size, text_list_length-window_size):\n",
    "    context = ' '.join(text_list[i-window_size:i] + text_list[i+1:i+1+window_size])\n",
    "    target_word = text_list[i]\n",
    "    training_data.append((context, target_word))\n",
    "\n",
    "# print(training_data)\n",
    "word_to_ix = {word: i for i, word in enumerate(vocab)}\n",
    "\n",
    "# Hyperparameters\n",
    "embedding_dim = 128\n",
    "hidden_size = 128\n",
    "learning_rate = 0.01\n",
    "epochs = 10\n",
    "\n",
    "# Create the language model\n",
    "class LanguageModeler(nn.Module):\n",
    "    def __init__(self, vocab_size, embedding_dim, hidden_size=128):\n",
    "        super(LanguageModeler, self).__init__()\n",
    "        self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
    "        self.linear1 = nn.Linear(embedding_dim, hidden_size)\n",
    "        self.linear2 = nn.Linear(hidden_size, vocab_size)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        # take the sum of the context embedding as the representation of the context\n",
    "        embeds = self.embeddings(inputs)\n",
    "        embeds = torch.sum(embeds, dim=0)\n",
    "        embeds = torch.unsqueeze(embeds, dim=0)\n",
    "        out = F.relu(self.linear1(embeds))\n",
    "        out = self.linear2(out)\n",
    "        log_probs = F.log_softmax(out, dim=1)\n",
    "        return log_probs\n",
    "\n",
    "model = LanguageModeler(vocab_size, embedding_dim, hidden_size)\n",
    "\n",
    "# Loss and optimizer\n",
    "loss_function = nn.NLLLoss()\n",
    "optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
    "\n",
    "# Training loop\n",
    "for epoch in range(epochs):\n",
    "    total_loss = 0\n",
    "    for context, target in training_data:\n",
    "        context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n",
    "        model.zero_grad()\n",
    "        log_probs = model(context_idxs)\n",
    "        target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)\n",
    "        loss = loss_function(log_probs, target_idx)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "        total_loss += loss.item()\n",
    "    print(f\"Epoch {epoch+1}, Loss: {total_loss}\")\n",
    "\n",
    "print(\"Training complete!\")\n",
    "print(f\"The training loss for embedding_dim={embedding_dim} is {total_loss}\")\n",
    "\n",
    "# Example usage to predict the next word\n",
    "context = \"am taking this semester\"\n",
    "context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n",
    "log_probs = model(context_idxs)\n",
    "predicted_idx = torch.argmax(log_probs).item()\n",
    "predicted_word = [word for word, idx in word_to_ix.items() if idx == predicted_idx][0]\n",
    "print(f\"Target word prediction: {predicted_word}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0f46c52-8593-47a6-8568-8f4567b23f91",
   "metadata": {},
   "source": [
    "### 3.2 Skip-gram Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "8c780d7f-d333-4207-a176-52ccc425a9d6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1, Loss: 20.37692356109619\n",
      "Epoch 2, Loss: 19.625171661376953\n",
      "Epoch 3, Loss: 18.91118288040161\n",
      "Epoch 4, Loss: 18.229763507843018\n",
      "Epoch 5, Loss: 17.573153018951416\n",
      "Epoch 6, Loss: 16.935642957687378\n",
      "Epoch 7, Loss: 16.3105411529541\n",
      "Epoch 8, Loss: 15.688520431518555\n",
      "Epoch 9, Loss: 15.058596849441528\n",
      "Epoch 10, Loss: 14.423372864723206\n",
      "Training complete!\n",
      "The training loss for embedding_dim=128 is 14.423372864723206\n",
      "Target context prediction: am taking this semester\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "\n",
    "# Define the training data (4-grams) and vocabulary\n",
    "text = 'I am taking CS6493 this semester and studying NLP is really fascinating'\n",
    "text_list = text.split()\n",
    "text_list_length = len(text_list)\n",
    "vocab = set(text_list)\n",
    "vocab_size = len(vocab)\n",
    "training_data = []\n",
    "\n",
    "window_size = 2 # the context length is window_size * 2\n",
    "for i in range(window_size, text_list_length-window_size):\n",
    "    # target_context = ' '.join(text_list[i-window_size:i] + text_list[i+1:i+1+window_size])\n",
    "    target_context = text_list[i-window_size:i] + text_list[i+1:i+1+window_size]\n",
    "    central_word = text_list[i]\n",
    "    training_data.append((central_word, target_context))\n",
    "\n",
    "# print(training_data)\n",
    "word_to_ix = {word: i for i, word in enumerate(vocab)}\n",
    "\n",
    "# Hyperparameters\n",
    "embedding_dim = 128\n",
    "hidden_size = 128\n",
    "learning_rate = 0.01\n",
    "epochs = 10\n",
    "\n",
    "# Create the language model\n",
    "class LanguageModeler(nn.Module):\n",
    "    def __init__(self, vocab_size, embedding_dim, hidden_size, context_length):\n",
    "        super(LanguageModeler, self).__init__()\n",
    "        self.vocab_size = vocab_size\n",
    "        self.embedding_dim = embedding_dim\n",
    "        self.hidden_size = hidden_size\n",
    "        self.context_length = context_length\n",
    "        \n",
    "        self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
    "        self.linear1 = nn.Linear(embedding_dim, hidden_size)\n",
    "        self.linear2 = nn.Linear(hidden_size, context_length*vocab_size)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        embeds = self.embeddings(inputs)\n",
    "        out = F.relu(self.linear1(embeds))\n",
    "        out = self.linear2(out)\n",
    "        out = out.view(1, self.context_length, -1)\n",
    "        log_probs = F.log_softmax(out, dim=2)\n",
    "        log_probs = torch.squeeze(log_probs)\n",
    "        return log_probs\n",
    "\n",
    "model = LanguageModeler(vocab_size, embedding_dim, hidden_size, window_size*2)\n",
    "\n",
    "# Loss and optimizer\n",
    "loss_function = nn.NLLLoss()\n",
    "optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
    "\n",
    "# Training loop\n",
    "for epoch in range(epochs):\n",
    "    total_loss = 0\n",
    "    for central_word, target_context in training_data:\n",
    "        central_word_idx = torch.tensor([word_to_ix[central_word]], dtype=torch.long)\n",
    "        model.zero_grad()\n",
    "        log_probs = model(central_word_idx)\n",
    "        target_context_idx = torch.tensor([word_to_ix[word] for word in target_context], dtype=torch.long)\n",
    "        loss = loss_function(log_probs, target_context_idx)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "        total_loss += loss.item()\n",
    "    print(f\"Epoch {epoch+1}, Loss: {total_loss}\")\n",
    "\n",
    "print(\"Training complete!\")\n",
    "print(f\"The training loss for embedding_dim={embedding_dim} is {total_loss}\")\n",
    "\n",
    "# Example usage to predict the next word\n",
    "central_word = \"CS6493\"\n",
    "central_word_idx = torch.tensor([word_to_ix[central_word]], dtype=torch.long)\n",
    "log_probs = model(central_word_idx)\n",
    "predicted_idxs = torch.argmax(log_probs, dim=1)\n",
    "predicted_context = ''\n",
    "for predicted_idx in predicted_idxs:\n",
    "    for word, idx in word_to_ix.items():\n",
    "        if idx == predicted_idx:\n",
    "            predicted_context += word + ' '\n",
    "predicted_context = predicted_context.strip()\n",
    "# predicted_word = [word for word, idx in word_to_ix.items() if idx == predicted_idx for predicted_idx in predicted_idxs][0]\n",
    "print(f\"Target context prediction: {predicted_context}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8ff78d7",
   "metadata": {},
   "source": [
    "### Wikipedia corpus\n",
    "The logic of Question 3.3 is the same as Question 3.2."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "05ad9962-0034-408c-b620-28e0f6796b7e",
   "metadata": {},
   "source": [
    "### Question 4"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "addd4ca4",
   "metadata": {},
   "source": [
    "### 4.1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c10cae40",
   "metadata": {},
   "source": [
    "1. Very large vocabulary size;\n",
    "2. Cannot deal with out of vocablary words;\n",
    "3. Cannot capture the semantic relations between similar words."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47114b01",
   "metadata": {},
   "source": [
    "### 4.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "760870e1-aec1-43b5-881b-fae861fc5797",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['o', 'l', 'd', 'e', 'r', 's', 't', 'h', 'u', 'g', 'p']\n"
     ]
    }
   ],
   "source": [
    "words = [('old', 10), ('older', 5), ('oldest', 8), ('hug', 8), ('pug', 4), ('hugs', 5)]\n",
    "vocab = []\n",
    "for word in words:\n",
    "    for char in word[0]:\n",
    "        if char not in vocab:\n",
    "            vocab.append(char)\n",
    "print(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "784d0522-26ae-48c9-9e72-da89618fed7f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "724145c7-a483-4917-99a7-50d9378ece9e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'ol': 23, 'old': 23, 'ld': 23, 'olde': 13, 'older': 5, 'lde': 13, 'lder': 5, 'de': 13, 'der': 5, 'er': 5, 'oldes': 8, 'oldest': 8, 'ldes': 8, 'ldest': 8, 'des': 8, 'dest': 8, 'es': 8, 'est': 8, 'st': 8, 'hu': 13, 'hug': 13, 'ug': 17, 'pu': 4, 'pug': 4, 'hugs': 5, 'ugs': 5, 'gs': 5}\n"
     ]
    }
   ],
   "source": [
    "# compute the frequencies of subwords\n",
    "freqs = {}\n",
    "for word in words:\n",
    "    for i, _ in enumerate(word[0]):\n",
    "        for j in range(i+1, len(word[0])):\n",
    "            if word[0][i:j+1] not in freqs:\n",
    "                freqs[word[0][i:j+1]] = word[1]\n",
    "            else:\n",
    "                freqs[word[0][i:j+1]] += word[1]\n",
    "print(freqs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "42c09777-f98b-4261-ac1f-7e034b4ec926",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('ol', 23), ('old', 23), ('ld', 23), ('ug', 17), ('olde', 13), ('lde', 13), ('de', 13), ('hu', 13), ('hug', 13), ('oldes', 8), ('oldest', 8), ('ldes', 8), ('ldest', 8), ('des', 8), ('dest', 8), ('es', 8), ('est', 8), ('st', 8), ('older', 5), ('lder', 5), ('der', 5), ('er', 5), ('hugs', 5), ('ugs', 5), ('gs', 5), ('pu', 4), ('pug', 4)]\n"
     ]
    }
   ],
   "source": [
    "freq_tuples = list(freqs.items())\n",
    "sorted_freq_tuples = sorted(freq_tuples, key=lambda t: t[1], reverse=True)\n",
    "print(sorted_freq_tuples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "ec98162f-3efe-4a47-9843-5fd80cb3e09d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['o', 'l', 'd', 'e', 'r', 's', 't', 'h', 'u', 'g', 'p', 'ol', 'old', 'ld', 'ug', 'olde']\n"
     ]
    }
   ],
   "source": [
    "# add the 5 most frequent symbol pairs to the vocab\n",
    "for i in range(5):\n",
    "    vocab.append(sorted_freq_tuples[i][0])\n",
    "print(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "2e0af173-355a-4505-a07a-ef16dab2f5a2",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['h', 'old'], ['olde', 's', 't'], ['olde', 'r'], ['p', 'ug'], ['[unk]', 'ug'], ['h', 'ug', 'g', '[unk]', '[unk]', 'g', '[unk]', '[unk]', '[unk]', 'e']]\n"
     ]
    }
   ],
   "source": [
    "# using the vocab to tokenize the given words\n",
    "words = ['hold', 'oldest', 'older', 'pug', 'mug', 'huggingface']\n",
    "tokenizations = []\n",
    "for word in words:\n",
    "    subword = word\n",
    "    tokenization = []\n",
    "    while len(subword) != 0:\n",
    "        for i in range(len(subword), 0, -1):\n",
    "            if subword[:i] in vocab:\n",
    "                tokenization.append(subword[:i])\n",
    "                subword = subword[i:]\n",
    "                break\n",
    "            elif i==1 and subword[:i] not in vocab:\n",
    "                tokenization.append('[unk]')\n",
    "                subword = subword[i:]\n",
    "                break\n",
    "    tokenizations.append(tokenization)\n",
    "print(tokenizations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58a726e4-6ff4-4a1f-831c-4d0d45a0a7c6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}