{ "cells": [ { "cell_type": "markdown", "id": "36e71407", "metadata": {}, "source": [ "## Question 1\n", "\n", "### Q1.1\n", "\n", "$$\\begin{align*}\\text{buy} &= [1,0,0,0,0,0] \\\\ \\text{bought} &= [0,1,0,0,0,0] \\\\ \\text{girl} &= [0,0,1,0,0,0] \\\\ \\text{woman} &= [0,0,0,1,0,0] \\\\ \\text{word} &= [0,0,0,0,1,0] \\\\ \\text{words} &= [0,0,0,0,0,1] \\end{align*}$$\n", "\n", "### Q1.2\n", "\n", "1. The dimension of the embedding linearly increases with the vocabulary size.\n", "2. Not involve semantic features.\n", "\n", "### Q1.3\n", "\n", "Represent the words as {00, 01, 10, 11}." ] }, { "cell_type": "markdown", "id": "80f9ea18", "metadata": {}, "source": [ "## Question 2" ] }, { "cell_type": "markdown", "id": "1982c0b7", "metadata": {}, "source": [ "### Q2.1\n", "\n", "- bi-grams: \n", " - \"CS6493\": \"taking CS6493\", \"CS6493 this\"\n", " - \"NLP\": \"studying NLP\", \"NLP is\"\n", "- tri-grams:\n", " - \"CS6493\": \"am taking CS6493\", \"taking CS6493 this\", \"CS6493 this semester\"\n", " - \"NLP\": \"and studying NLP\", \"studying NLP is\", \"NLP is really\"\n", "\n", "### Q2.2\n", "\n", "1. Sparse feature space;\n", "2. Only suitable to the large training dataset;\n", "3. Cannot interpret unseen words;\n", "4. Sensitive to the hyper-parameter `N`." ] }, { "cell_type": "markdown", "id": "c35ed258-87a7-4bfb-b585-09fc0169c5b0", "metadata": {}, "source": [ "### Question 2.3 4-Gram Model" ] }, { "cell_type": "code", "execution_count": 132, "id": "9ebe6f58-bea8-4ff3-a2b5-30afb57c4354", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1, Loss: 22.401702046394348\n", "Epoch 2, Loss: 19.851872205734253\n", "Epoch 3, Loss: 17.524168014526367\n", "Epoch 4, Loss: 15.372764229774475\n", "Epoch 5, Loss: 13.371510922908783\n", "Epoch 6, Loss: 11.53460431098938\n", "Epoch 7, Loss: 9.843500971794128\n", "Epoch 8, Loss: 8.309678226709366\n", "Epoch 9, Loss: 6.957409352064133\n", "Epoch 10, Loss: 5.794081926345825\n", "Training complete!\n", "The training loss for embedding_dim=32 is 5.794081926345825\n", "Next word prediction: CS6493\n", "Epoch 1, Loss: 23.116318225860596\n", "Epoch 2, Loss: 18.840309739112854\n", "Epoch 3, Loss: 15.208776473999023\n", "Epoch 4, Loss: 12.037311911582947\n", "Epoch 5, Loss: 9.362350881099701\n", "Epoch 6, Loss: 7.122990161180496\n", "Epoch 7, Loss: 5.349436938762665\n", "Epoch 8, Loss: 4.025747239589691\n", "Epoch 9, Loss: 3.078266069293022\n", "Epoch 10, Loss: 2.413187339901924\n", "Training complete!\n", "The training loss for embedding_dim=64 is 2.413187339901924\n", "Next word prediction: CS6493\n", "Epoch 1, Loss: 22.857290029525757\n", "Epoch 2, Loss: 16.749339699745178\n", "Epoch 3, Loss: 12.256677389144897\n", "Epoch 4, Loss: 8.72059839963913\n", "Epoch 5, Loss: 6.024809896945953\n", "Epoch 6, Loss: 4.136698365211487\n", "Epoch 7, Loss: 2.900818020105362\n", "Epoch 8, Loss: 2.115286648273468\n", "Epoch 9, Loss: 1.6096487492322922\n", "Epoch 10, Loss: 1.272818386554718\n", "Training complete!\n", "The training loss for embedding_dim=128 is 1.272818386554718\n", "Next word prediction: CS6493\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "\n", "# Define the training data (4-grams) and vocabulary\n", "text = 'I am taking CS6493 this semester and studying NLP is really fascinating'\n", "text_list = text.split()\n", "text_list_length = len(text_list)\n", "vocab = set(text_list)\n", "vocab_size = len(vocab)\n", 
"training_data = []\n", "\n", "for i in range(0, text_list_length-3):\n", " context = ' '.join(text_list[i:i+3])\n", " next_word = text_list[i+3]\n", " training_data.append((context, next_word))\n", "\n", "word_to_ix = {word: i for i, word in enumerate(vocab)}\n", "\n", "# Hyperparameters\n", "embedding_dims = [32, 64, 128]\n", "context_size = 3\n", "hidden_size = 128\n", "learning_rate = 0.01\n", "epochs = 10\n", "\n", "# Create the language model\n", "class LanguageModeler(nn.Module):\n", " def __init__(self, vocab_size, embedding_dim, context_size, hidden_size=128):\n", " super(LanguageModeler, self).__init__()\n", " self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n", " self.linear1 = nn.Linear(context_size * embedding_dim, hidden_size)\n", " self.linear2 = nn.Linear(hidden_size, vocab_size)\n", "\n", " def forward(self, inputs):\n", " embeds = self.embeddings(inputs).view((1, -1))\n", " out = F.relu(self.linear1(embeds))\n", " out = self.linear2(out)\n", " log_probs = F.log_softmax(out, dim=1)\n", " return log_probs\n", "\n", "for embedding_dim in embedding_dims:\n", " model = LanguageModeler(vocab_size, embedding_dim, context_size, hidden_size)\n", " \n", " # Loss and optimizer\n", " loss_function = nn.NLLLoss()\n", " optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n", " \n", " # Training loop\n", " for epoch in range(epochs):\n", " total_loss = 0\n", " for context, target in training_data:\n", " context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n", " model.zero_grad()\n", " log_probs = model(context_idxs)\n", " # print(log_probs)\n", " target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)\n", " loss = loss_function(log_probs, target_idx)\n", " loss.backward()\n", " optimizer.step()\n", " total_loss += loss.item()\n", " print(f\"Epoch {epoch+1}, Loss: {total_loss}\")\n", " \n", " print(\"Training complete!\")\n", " print(f\"The training loss for embedding_dim={embedding_dim} is {total_loss}\")\n", " \n", " # Example usage to predict the next word\n", " context = \"I am taking\"\n", " context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n", " log_probs = model(context_idxs)\n", " predicted_idx = torch.argmax(log_probs).item()\n", " predicted_word = [word for word, idx in word_to_ix.items() if idx == predicted_idx][0]\n", " print(f\"Next word prediction: {predicted_word}\")" ] }, { "cell_type": "markdown", "id": "e8846baa-8135-4e0f-aba4-ead6cdcb7af4", "metadata": {}, "source": [ "### Question 3.1 CBOW Model" ] }, { "cell_type": "code", "execution_count": 133, "id": "1e25fca2-bf10-444a-9056-4680789fc372", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1, Loss: 21.519867658615112\n", "Epoch 2, Loss: 13.469220399856567\n", "Epoch 3, Loss: 8.231013298034668\n", "Epoch 4, Loss: 4.947874903678894\n", "Epoch 5, Loss: 3.111188143491745\n", "Epoch 6, Loss: 2.105540543794632\n", "Epoch 7, Loss: 1.534349948167801\n", "Epoch 8, Loss: 1.1862706989049911\n", "Epoch 9, Loss: 0.9588550999760628\n", "Epoch 10, Loss: 0.7983578592538834\n", "Training complete!\n", "The training loss for embedding_dim=128 is 0.7983578592538834\n", "Target word prediction: CS6493\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "\n", "# Define the training data and vocabulary\n", "text = 'I am taking CS6493 this semester and studying NLP is really 
{ "cell_type": "markdown", "id": "e8846baa-8135-4e0f-aba4-ead6cdcb7af4", "metadata": {}, "source": [
"## Question 3\n",
"\n",
"### Q3.1 CBOW Model"
] }, { "cell_type": "code", "execution_count": 133, "id": "1e25fca2-bf10-444a-9056-4680789fc372", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"Epoch 1, Loss: 21.519867658615112\n",
"Epoch 2, Loss: 13.469220399856567\n",
"Epoch 3, Loss: 8.231013298034668\n",
"Epoch 4, Loss: 4.947874903678894\n",
"Epoch 5, Loss: 3.111188143491745\n",
"Epoch 6, Loss: 2.105540543794632\n",
"Epoch 7, Loss: 1.534349948167801\n",
"Epoch 8, Loss: 1.1862706989049911\n",
"Epoch 9, Loss: 0.9588550999760628\n",
"Epoch 10, Loss: 0.7983578592538834\n",
"Training complete!\n",
"The training loss for embedding_dim=128 is 0.7983578592538834\n",
"Target word prediction: CS6493\n"
] } ], "source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"\n",
"# Define the training data (context, target word) pairs and vocabulary\n",
"text = 'I am taking CS6493 this semester and studying NLP is really fascinating'\n",
"text_list = text.split()\n",
"text_list_length = len(text_list)\n",
"vocab = set(text_list)\n",
"vocab_size = len(vocab)\n",
"training_data = []\n",
"\n",
"window_size = 2  # the context length is window_size * 2\n",
"\n",
"for i in range(window_size, text_list_length-window_size):\n",
"    context = ' '.join(text_list[i-window_size:i] + text_list[i+1:i+1+window_size])\n",
"    target_word = text_list[i]\n",
"    training_data.append((context, target_word))\n",
"\n",
"word_to_ix = {word: i for i, word in enumerate(vocab)}\n",
"\n",
"# Hyperparameters\n",
"embedding_dim = 128\n",
"hidden_size = 128\n",
"learning_rate = 0.01\n",
"epochs = 10\n",
"\n",
"# Create the language model\n",
"class LanguageModeler(nn.Module):\n",
"    def __init__(self, vocab_size, embedding_dim, hidden_size=128):\n",
"        super(LanguageModeler, self).__init__()\n",
"        self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
"        self.linear1 = nn.Linear(embedding_dim, hidden_size)\n",
"        self.linear2 = nn.Linear(hidden_size, vocab_size)\n",
"\n",
"    def forward(self, inputs):\n",
"        # Take the sum of the context embeddings as the representation of the context\n",
"        embeds = self.embeddings(inputs)\n",
"        embeds = torch.sum(embeds, dim=0)\n",
"        embeds = torch.unsqueeze(embeds, dim=0)\n",
"        out = F.relu(self.linear1(embeds))\n",
"        out = self.linear2(out)\n",
"        log_probs = F.log_softmax(out, dim=1)\n",
"        return log_probs\n",
"\n",
"model = LanguageModeler(vocab_size, embedding_dim, hidden_size)\n",
"\n",
"# Loss and optimizer\n",
"loss_function = nn.NLLLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
"\n",
"# Training loop\n",
"for epoch in range(epochs):\n",
"    total_loss = 0\n",
"    for context, target in training_data:\n",
"        context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n",
"        model.zero_grad()\n",
"        log_probs = model(context_idxs)\n",
"        target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)\n",
"        loss = loss_function(log_probs, target_idx)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"        total_loss += loss.item()\n",
"    print(f\"Epoch {epoch+1}, Loss: {total_loss}\")\n",
"\n",
"print(\"Training complete!\")\n",
"print(f\"The training loss for embedding_dim={embedding_dim} is {total_loss}\")\n",
"\n",
"# Example usage to predict the target (center) word from its context\n",
"context = \"am taking this semester\"\n",
"context_idxs = torch.tensor([word_to_ix[word] for word in context.split()], dtype=torch.long)\n",
"log_probs = model(context_idxs)\n",
"predicted_idx = torch.argmax(log_probs).item()\n",
"predicted_word = [word for word, idx in word_to_ix.items() if idx == predicted_idx][0]\n",
"print(f\"Target word prediction: {predicted_word}\")"
] },
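{ "cell_type": "markdown", "id": "c3f5b7d1", "metadata": {}, "source": [
"A quick correctness check, as a sketch (it reuses `model`, `training_data`, and `word_to_ix` from the CBOW cell above): predict the center word for every training context and count how many are recovered."
] }, { "cell_type": "code", "execution_count": null, "id": "c3f5b7d2", "metadata": {}, "outputs": [], "source": [
"correct = 0\n",
"with torch.no_grad():\n",
"    for context, target in training_data:\n",
"        context_idxs = torch.tensor([word_to_ix[w] for w in context.split()], dtype=torch.long)\n",
"        predicted_idx = torch.argmax(model(context_idxs)).item()\n",
"        correct += int(predicted_idx == word_to_ix[target])\n",
"print(f\"Recovered {correct}/{len(training_data)} center words\")"
] },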
{ "cell_type": "markdown", "id": "a0f46c52-8593-47a6-8568-8f4567b23f91", "metadata": {}, "source": [ "### Q3.2 Skip-gram Model" ] }, { "cell_type": "code", "execution_count": 135, "id": "8c780d7f-d333-4207-a176-52ccc425a9d6", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"Epoch 1, Loss: 20.37692356109619\n",
"Epoch 2, Loss: 19.625171661376953\n",
"Epoch 3, Loss: 18.91118288040161\n",
"Epoch 4, Loss: 18.229763507843018\n",
"Epoch 5, Loss: 17.573153018951416\n",
"Epoch 6, Loss: 16.935642957687378\n",
"Epoch 7, Loss: 16.3105411529541\n",
"Epoch 8, Loss: 15.688520431518555\n",
"Epoch 9, Loss: 15.058596849441528\n",
"Epoch 10, Loss: 14.423372864723206\n",
"Training complete!\n",
"The training loss for embedding_dim=128 is 14.423372864723206\n",
"Target context prediction: am taking this semester\n"
] } ], "source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"\n",
"# Define the training data (central word, context) pairs and vocabulary\n",
"text = 'I am taking CS6493 this semester and studying NLP is really fascinating'\n",
"text_list = text.split()\n",
"text_list_length = len(text_list)\n",
"vocab = set(text_list)\n",
"vocab_size = len(vocab)\n",
"training_data = []\n",
"\n",
"window_size = 2  # the context length is window_size * 2\n",
"for i in range(window_size, text_list_length-window_size):\n",
"    target_context = text_list[i-window_size:i] + text_list[i+1:i+1+window_size]\n",
"    central_word = text_list[i]\n",
"    training_data.append((central_word, target_context))\n",
"\n",
"word_to_ix = {word: i for i, word in enumerate(vocab)}\n",
"\n",
"# Hyperparameters\n",
"embedding_dim = 128\n",
"hidden_size = 128\n",
"learning_rate = 0.01\n",
"epochs = 10\n",
"\n",
"# Create the language model\n",
"class LanguageModeler(nn.Module):\n",
"    def __init__(self, vocab_size, embedding_dim, hidden_size, context_length):\n",
"        super(LanguageModeler, self).__init__()\n",
"        self.vocab_size = vocab_size\n",
"        self.embedding_dim = embedding_dim\n",
"        self.hidden_size = hidden_size\n",
"        self.context_length = context_length\n",
"\n",
"        self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
"        self.linear1 = nn.Linear(embedding_dim, hidden_size)\n",
"        self.linear2 = nn.Linear(hidden_size, context_length*vocab_size)\n",
"\n",
"    def forward(self, inputs):\n",
"        # Predict a distribution over the vocabulary for each of the\n",
"        # context_length positions around the central word\n",
"        embeds = self.embeddings(inputs)\n",
"        out = F.relu(self.linear1(embeds))\n",
"        out = self.linear2(out)\n",
"        out = out.view(1, self.context_length, -1)\n",
"        log_probs = F.log_softmax(out, dim=2)\n",
"        log_probs = torch.squeeze(log_probs)\n",
"        return log_probs\n",
"\n",
"model = LanguageModeler(vocab_size, embedding_dim, hidden_size, window_size*2)\n",
"\n",
"# Loss and optimizer\n",
"loss_function = nn.NLLLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
"\n",
"# Training loop\n",
"for epoch in range(epochs):\n",
"    total_loss = 0\n",
"    for central_word, target_context in training_data:\n",
"        central_word_idx = torch.tensor([word_to_ix[central_word]], dtype=torch.long)\n",
"        model.zero_grad()\n",
"        log_probs = model(central_word_idx)\n",
"        target_context_idx = torch.tensor([word_to_ix[word] for word in target_context], dtype=torch.long)\n",
"        loss = loss_function(log_probs, target_context_idx)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"        total_loss += loss.item()\n",
"    print(f\"Epoch {epoch+1}, Loss: {total_loss}\")\n",
"\n",
"print(\"Training complete!\")\n",
"print(f\"The training loss for embedding_dim={embedding_dim} is {total_loss}\")\n",
"\n",
"# Example usage to predict the context words of a central word\n",
"central_word = \"CS6493\"\n",
"central_word_idx = torch.tensor([word_to_ix[central_word]], dtype=torch.long)\n",
"log_probs = model(central_word_idx)\n",
"predicted_idxs = torch.argmax(log_probs, dim=1)\n",
"ix_to_word = {idx: word for word, idx in word_to_ix.items()}\n",
"predicted_context = ' '.join(ix_to_word[idx.item()] for idx in predicted_idxs)\n",
"print(f\"Target context prediction: {predicted_context}\")"
] },
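{ "cell_type": "markdown", "id": "d4a6c8e1", "metadata": {}, "source": [
"The learned embedding matrix is the actual deliverable of CBOW/skip-gram training. The sketch below (the helper name `most_similar` is just illustrative; it reuses `model` and `word_to_ix` from the skip-gram cell, and on a 12-word corpus the similarities carry little meaning) ranks words by cosine similarity to a query word:"
] }, { "cell_type": "code", "execution_count": null, "id": "d4a6c8e2", "metadata": {}, "outputs": [], "source": [
"def most_similar(query, topk=3):\n",
"    # Compare the query's embedding with every row of the embedding matrix\n",
"    emb = model.embeddings.weight.detach()\n",
"    sims = F.cosine_similarity(emb[word_to_ix[query]].unsqueeze(0), emb)\n",
"    best = torch.argsort(sims, descending=True)[1:topk+1]  # index 0 is the query itself\n",
"    ix_to_word = {idx: word for word, idx in word_to_ix.items()}\n",
"    return [(ix_to_word[i.item()], round(sims[i].item(), 3)) for i in best]\n",
"\n",
"print(most_similar('CS6493'))"
] },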
"print(f\"Target context prediction: {predicted_context}\")" ] }, { "cell_type": "markdown", "id": "d8ff78d7", "metadata": {}, "source": [ "### Wikipedia corpus\n", "The logic of Question 3.3 is the same as Question 3.2." ] }, { "cell_type": "markdown", "id": "05ad9962-0034-408c-b620-28e0f6796b7e", "metadata": {}, "source": [ "### Question 4" ] }, { "cell_type": "markdown", "id": "addd4ca4", "metadata": {}, "source": [ "### 4.1" ] }, { "cell_type": "markdown", "id": "c10cae40", "metadata": {}, "source": [ "1. Very large vocabulary size;\n", "2. Cannot deal with out of vocablary words;\n", "3. Cannot capture the semantic relations between similar words." ] }, { "cell_type": "markdown", "id": "47114b01", "metadata": {}, "source": [ "### 4.2" ] }, { "cell_type": "code", "execution_count": 137, "id": "760870e1-aec1-43b5-881b-fae861fc5797", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['o', 'l', 'd', 'e', 'r', 's', 't', 'h', 'u', 'g', 'p']\n" ] } ], "source": [ "words = [('old', 10), ('older', 5), ('oldest', 8), ('hug', 8), ('pug', 4), ('hugs', 5)]\n", "vocab = []\n", "for word in words:\n", " for char in word[0]:\n", " if char not in vocab:\n", " vocab.append(char)\n", "print(vocab)" ] }, { "cell_type": "code", "execution_count": 138, "id": "784d0522-26ae-48c9-9e72-da89618fed7f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11" ] }, "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(vocab)" ] }, { "cell_type": "code", "execution_count": 139, "id": "724145c7-a483-4917-99a7-50d9378ece9e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'ol': 23, 'old': 23, 'ld': 23, 'olde': 13, 'older': 5, 'lde': 13, 'lder': 5, 'de': 13, 'der': 5, 'er': 5, 'oldes': 8, 'oldest': 8, 'ldes': 8, 'ldest': 8, 'des': 8, 'dest': 8, 'es': 8, 'est': 8, 'st': 8, 'hu': 13, 'hug': 13, 'ug': 17, 'pu': 4, 'pug': 4, 'hugs': 5, 'ugs': 5, 'gs': 5}\n" ] } ], "source": [ "# compute the frequencies of subwords\n", "freqs = {}\n", "for word in words:\n", " for i, _ in enumerate(word[0]):\n", " for j in range(i+1, len(word[0])):\n", " if word[0][i:j+1] not in freqs:\n", " freqs[word[0][i:j+1]] = word[1]\n", " else:\n", " freqs[word[0][i:j+1]] += word[1]\n", "print(freqs)" ] }, { "cell_type": "code", "execution_count": 140, "id": "42c09777-f98b-4261-ac1f-7e034b4ec926", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('ol', 23), ('old', 23), ('ld', 23), ('ug', 17), ('olde', 13), ('lde', 13), ('de', 13), ('hu', 13), ('hug', 13), ('oldes', 8), ('oldest', 8), ('ldes', 8), ('ldest', 8), ('des', 8), ('dest', 8), ('es', 8), ('est', 8), ('st', 8), ('older', 5), ('lder', 5), ('der', 5), ('er', 5), ('hugs', 5), ('ugs', 5), ('gs', 5), ('pu', 4), ('pug', 4)]\n" ] } ], "source": [ "freq_tuples = list(freqs.items())\n", "sorted_freq_tuples = sorted(freq_tuples, key=lambda t: t[1], reverse=True)\n", "print(sorted_freq_tuples)" ] }, { "cell_type": "code", "execution_count": 141, "id": "ec98162f-3efe-4a47-9843-5fd80cb3e09d", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['o', 'l', 'd', 'e', 'r', 's', 't', 'h', 'u', 'g', 'p', 'ol', 'old', 'ld', 'ug', 'olde']\n" ] } ], "source": [ "# add the 5 most frequent symbol pairs to the vocab\n", "for i in range(5):\n", " vocab.append(sorted_freq_tuples[i][0])\n", "print(vocab)" ] }, { "cell_type": "code", "execution_count": 142, "id": "2e0af173-355a-4505-a07a-ef16dab2f5a2", "metadata": { 
"scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['h', 'old'], ['olde', 's', 't'], ['olde', 'r'], ['p', 'ug'], ['[unk]', 'ug'], ['h', 'ug', 'g', '[unk]', '[unk]', 'g', '[unk]', '[unk]', '[unk]', 'e']]\n" ] } ], "source": [ "# using the vocab to tokenize the given words\n", "words = ['hold', 'oldest', 'older', 'pug', 'mug', 'huggingface']\n", "tokenizations = []\n", "for word in words:\n", " subword = word\n", " tokenization = []\n", " while len(subword) != 0:\n", " for i in range(len(subword), 0, -1):\n", " if subword[:i] in vocab:\n", " tokenization.append(subword[:i])\n", " subword = subword[i:]\n", " break\n", " elif i==1 and subword[:i] not in vocab:\n", " tokenization.append('[unk]')\n", " subword = subword[i:]\n", " break\n", " tokenizations.append(tokenization)\n", "print(tokenizations)" ] }, { "cell_type": "code", "execution_count": null, "id": "58a726e4-6ff4-4a1f-831c-4d0d45a0a7c6", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }