{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "a8e4909967c440ffa02b63a013f12adb": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_3f981889ad224474839b1b7ae8ca08e3", "IPY_MODEL_73472dfbbbb74ba3b4884a51199a1de9", "IPY_MODEL_c13706913456492baa77b44073dc1271" ], "layout": "IPY_MODEL_93fa292923d64ef6a0d81f3beca6b3c2" } }, "3f981889ad224474839b1b7ae8ca08e3": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f83d8c153a76420cb7c77dca993f91f0", "placeholder": "​", "style": "IPY_MODEL_22f22b431dc04a1b8d13823068a83067", "value": "100%" } }, "73472dfbbbb74ba3b4884a51199a1de9": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1875f86e3db14fae9a7d09e644343b7d", "max": 5482800, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_6f18abcd355748979d893a906848e5dd", "value": 5482800 } }, "c13706913456492baa77b44073dc1271": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b42a7b8ad2494eb6aa550d6dc6f47b5a", "placeholder": "​", "style": "IPY_MODEL_6c932e88f54f4cd8b1987d0b5a7d0d9b", "value": " 5482800/5482800 [00:00<00:00, 56199028.27it/s]" } }, "93fa292923d64ef6a0d81f3beca6b3c2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, 
"grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f83d8c153a76420cb7c77dca993f91f0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "22f22b431dc04a1b8d13823068a83067": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "1875f86e3db14fae9a7d09e644343b7d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6f18abcd355748979d893a906848e5dd": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", 
"bar_color": null, "description_width": "" } }, "b42a7b8ad2494eb6aa550d6dc6f47b5a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6c932e88f54f4cd8b1987d0b5a7d0d9b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "# Part 1: Transformer Language Model\n", "\n", "A transformer language model is a type of neural network architecture used for natural language processing tasks such as language translation and text generation. It uses self-attention mechanisms to capture dependencies between different parts of the input text, allowing it to better understand context and produce more accurate outputs. Transformer language models have achieved state-of-the-art performance on many language-related tasks, including language modeling and machine translation.\n", "\n", "For more information: https://arxiv.org/pdf/1706.03762.pdf\n", "\n", "__Tasks:__ \n", "\n", "1. How does the Transformer function?\n", "2. Understand the differnece between query, key and value vectors.\n", "3. Understanding the positional encoding scheme that enables the model to distinguish between elements in the input sequence, despite the absence of explicit ordering information.\n", "4. Have the abality of \"code navigation\"/\"code exploration\", to have the skill of being able to quickly jump between different parts of a codebase to understand how different functions and classes interact with each other." 
], "metadata": { "id": "ETZ7f2R2FqtG" } }, { "cell_type": "markdown", "source": [ "\"Image\"" ], "metadata": { "id": "fTCk9r_OJ_LD" } }, { "cell_type": "code", "source": [ "import numpy as np\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "import matplotlib.pyplot as plt\n", "import math\n", "from torch.utils.data import Dataset, DataLoader, random_split\n", "from torchvision.datasets.utils import download_url\n", "import io\n", "import torch.nn.functional as F\n", "from torch.nn.utils.rnn import pad_sequence" ], "metadata": { "id": "x3FHUVuaIjRf" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n", "## Basically, the model can be divided into three parts:\n", "## Encoding layer, Decoding layer and Output layer\n", "class Transformer(nn.Module):\n", " def __init__(self):\n", " super(Transformer, self).__init__()\n", " self.encoder = Encoder().to(device) #See below\n", " self.decoder = Decoder().to(device) #See below\n", "\n", " ## The output d_model is the dimension of each token output in the decoding layer\n", " self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False) ##\n", " def forward(self, enc_inputs, dec_inputs): # encoder / decoder \n", "\n", " enc_outputs, enc_self_attns = self.encoder(enc_inputs)\n", "\n", " dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)\n", "\n", " dec_logits = self.projection(dec_outputs) # dec_logits : [batch_size x src_vocab_size x tgt_vocab_size]\n", " return dec_logits, enc_self_attns, dec_self_attns, dec_enc_attns" ], "metadata": { "id": "-PUUoFyibtbJ" }, "execution_count": 15, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Question\n", " \n", " 1. What is the function of enc_self_attns (one of the output of encoder)? \n", "\n", " 2. What are the inputs of the decoder? Why does the decoder require two outputs?" 
], "metadata": { "id": "JzWUlLaqbxQd" } }, { "cell_type": "markdown", "source": [ "Answer:" ], "metadata": { "id": "uTpcW-GuIPBg" } }, { "cell_type": "code", "source": [ "class Encoder(nn.Module):\n", " def __init__(self):\n", " super(Encoder, self).__init__()\n", " self.src_emb = nn.Embedding(src_vocab_size, d_model).to(device) ## size --> src_vocab_size * d_model\n", " self.pos_emb = PositionalEncoding(d_model).to(device) ## see the details in 'attention is all you need'\n", " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n", "\n", " def forward(self, enc_inputs): # enc_inputs --> [batch_size x source_len]\n", "\n", " ## enc_outputs --> [batch_size, src_len, d_model]\n", " enc_outputs = self.src_emb(enc_inputs)\n", "\n", " enc_outputs = self.pos_emb(enc_outputs.transpose(0, 1)).transpose(0, 1)\n", "\n", " enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)\n", " enc_self_attns = []\n", " for layer in self.layers:\n", " enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)\n", " enc_self_attns.append(enc_self_attn)\n", " return enc_outputs, enc_self_attns" ], "metadata": { "id": "s7bsyNZ3b9W7" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "class PositionalEncoding(nn.Module):\n", " def __init__(self, d_model, dropout=0.1, max_len=5000):\n", " super(PositionalEncoding, self).__init__()\n", "\n", " self.dropout = nn.Dropout(p=dropout)\n", "\n", " pe = torch.zeros(max_len, d_model).to(device)\n", " position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)\n", " div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))\n", " pe[:, 0::2] = torch.sin(position * div_term)\n", " pe[:, 1::2] = torch.cos(position * div_term) # pe --> [max_len*d_model]\n", "\n", " pe = pe.unsqueeze(0).transpose(0, 1) # pe --> [max_len*1*d_model]\n", "\n", " self.register_buffer('pe', pe) \n", "\n", " def forward(self, x): # x --> [seq_len, batch_size, d_model]\n", " x = x + self.pe[:x.size(0), :]\n", " return self.dropout(x.to(device))" ], "metadata": { "id": "j_iKELKFdWNt" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "source": [ "def get_attn_pad_mask(seq_q, seq_k):\n", " batch_size, len_q = seq_q.size()\n", " batch_size, len_k = seq_k.size()\n", " # eq(zero) is PAD token\n", " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k, one is masking\n", " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k" ], "metadata": { "id": "8TisUu24dzYE" }, "execution_count": 18, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Question\n", " \n", " 1. Why do we need positional encoding?\n", "\n", " 2. What does get_attn_pad_mask do? 
" ], "metadata": { "id": "OzZ0GXurdD8p" } }, { "cell_type": "markdown", "source": [ "Answer:" ], "metadata": { "id": "9cw5RJAYIOMI" } }, { "cell_type": "code", "source": [ "class EncoderLayer(nn.Module):\n", " def __init__(self):\n", " super(EncoderLayer, self).__init__()\n", " self.enc_self_attn = MultiHeadAttention().to(device)\n", " self.pos_ffn = PoswiseFeedForwardNet().to(device)\n", "\n", " def forward(self, enc_inputs, enc_self_attn_mask):\n", " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) \n", " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs --> [batch_size x len_q x d_model]\n", " return enc_outputs, attn" ], "metadata": { "id": "BhgrS9dskNr5" }, "execution_count": 19, "outputs": [] }, { "cell_type": "code", "source": [ "class MultiHeadAttention(nn.Module):\n", " def __init__(self):\n", " super(MultiHeadAttention, self).__init__()\n", " ## The inputs size of QKV vectors are the same\n", " self.W_Q = nn.Linear(d_model, d_k * n_heads).to(device)\n", " self.W_K = nn.Linear(d_model, d_k * n_heads).to(device)\n", " self.W_V = nn.Linear(d_model, d_v * n_heads).to(device)\n", " self.linear = nn.Linear(n_heads * d_v, d_model).to(device)\n", " self.layer_norm = nn.LayerNorm(d_model)\n", "\n", " def forward(self, Q, K, V, attn_mask):\n", "\n", " ## Q --> [batch_size x len_q x d_model], K: [batch_size x len_k x d_model], V: [batch_size x len_k x d_model]\n", " residual, batch_size = Q, Q.size(0)\n", "\n", " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s --> [batch_size x n_heads x len_q x d_k]\n", " \n", " # YOUR CODE \n", " raise NotImplementedError()\n", "\n", " ## input attn_mask --> batch_size x len_q x len_k\n", " ## new attn_mask --> [batch_size x n_heads x len_q x len_k]\n", " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)\n", "\n", " ## context --> [batch_size x n_heads x len_q x d_v]\n", " ## attn --> [batch_size x n_heads x len_q x len_k]\n", " context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context --> [batch_size x len_q x n_heads * d_v]\n", " output = self.linear(context)\n", " return self.layer_norm(output + residual), attn # output --> [batch_size x len_q x d_model]" ], "metadata": { "id": "ohNfaNQMjzdP" }, "execution_count": 20, "outputs": [] }, { "cell_type": "markdown", "source": [ "#Calculate the Attention Scores\n", "We'll now try to calculate the attention scores between a set of query vectors ($Q$) and a set of key vectors ($K$), given a set of value vectors ($V$). The attention scores indicate the importance of each key vector to each query vector, and are calculated by taking the dot product of the query and key vectors, scaled by the square root of the dimensionality of the key vectors.\n", "\n", "\\begin{equation}\n", "\\operatorname{Attention}(Q, K, V)=\\operatorname{softmax}\\left(\\frac{Q K^T}{\\sqrt{d_k}}\\right) V\n", "\\end{equation}\n", "\n", "Where $Q$, $K$, and $V$ are the query, key, and value matrices, respectively, and $d_k$ is the dimensionality of the key vectors. The function first calculates the dot product of $Q$ and $K$ transpose, scaled by the square root of $d_k$. It then applies the softmax function to the resulting attention scores to obtain a probability distribution over the key vectors. 
], "metadata": { "id": "UAMMqJ4vY0h1" } },
{ "cell_type": "code", "source": [ "class ScaledDotProductAttention(nn.Module):\n", "    def __init__(self):\n", "        super(ScaledDotProductAttention, self).__init__()\n", "\n", "    def forward(self, Q, K, V, attn_mask):\n", "        \"\"\"Q --> [batch_size x n_heads x len_q x d_k]\n", "        K --> [batch_size x n_heads x len_k x d_k]\n", "        V --> [batch_size x n_heads x len_k x d_v]\n", "        scores --> [batch_size x n_heads x len_q x len_k]\"\"\"\n", "\n", "        # YOUR CODE: compute the scaled dot-product scores from Q and K\n", "        raise NotImplementedError()\n", "\n", "        scores.masked_fill_(attn_mask, -1e9) # apply attn_mask to the attention scores by replacing the masked positions with a large negative number (-1e9)\n", "        attn = nn.Softmax(dim=-1)(scores)\n", "        context = torch.matmul(attn, V) # the weighted sum of the values V, using the attention probabilities attn as the weights\n", "        return context, attn" ], "metadata": { "id": "fk0KgCdHkoJF" }, "execution_count": 21, "outputs": [] },
{ "cell_type": "markdown", "source": [ "# Question\n", "\n", "What does scores.masked_fill_(attn_mask, -1e9) do?" ], "metadata": { "id": "tBSzqa9TkxwO" } },
{ "cell_type": "markdown", "source": [ "Answer:" ], "metadata": { "id": "5T7eN3Y8IMF2" } },
{ "cell_type": "code", "source": [ "class PoswiseFeedForwardNet(nn.Module):\n", "    def __init__(self):\n", "        super(PoswiseFeedForwardNet, self).__init__()\n", "        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)\n", "        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)\n", "        self.layer_norm = nn.LayerNorm(d_model)\n", "\n", "    def forward(self, inputs):\n", "        residual = inputs # inputs --> [batch_size, len_q, d_model]\n", "        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))\n", "        output = self.conv2(output).transpose(1, 2)\n", "        return self.layer_norm(output + residual)" ], "metadata": { "id": "j0xItvGdk1Hk" }, "execution_count": 22, "outputs": [] },
{ "cell_type": "code", "source": [ "def get_attn_subsequent_mask(seq): # seq --> [batch_size, tgt_len]\n", "    attn_shape = [seq.size(0), seq.size(1), seq.size(1)] # attn_shape --> [batch_size, tgt_len, tgt_len]\n", "    subsequence_mask = np.triu(np.ones(attn_shape), k=1)\n", "    subsequence_mask = torch.from_numpy(subsequence_mask).byte()\n", "    return subsequence_mask # [batch_size, tgt_len, tgt_len]" ], "metadata": { "id": "qzDB0wycmB_6" }, "execution_count": 23, "outputs": [] },
{ "cell_type": "code", "source": [ "class Decoder(nn.Module):\n", "    def __init__(self):\n", "        super(Decoder, self).__init__()\n", "        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model).to(device)\n", "        self.pos_emb = PositionalEncoding(d_model).to(device)\n", "        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])\n", "\n", "    def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs --> [batch_size x tgt_len]\n", "        dec_outputs = self.tgt_emb(dec_inputs) # [batch_size, tgt_len, d_model]\n", "        dec_outputs = self.pos_emb(dec_outputs.transpose(0, 1)).transpose(0, 1) # [batch_size, tgt_len, d_model]\n", "\n", "        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs).to(device)\n", "\n", "        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs).to(device)\n", "\n", "        dec_self_attn_mask = 
torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)\n", "\n", "        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)\n", "\n", "        dec_self_attns, dec_enc_attns = [], []\n", "        for layer in self.layers:\n", "            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)\n", "            dec_self_attns.append(dec_self_attn)\n", "            dec_enc_attns.append(dec_enc_attn)\n", "        return dec_outputs, dec_self_attns, dec_enc_attns" ], "metadata": { "id": "ek03eNX8lVyu" }, "execution_count": 24, "outputs": [] },
{ "cell_type": "code", "source": [ "class DecoderLayer(nn.Module):\n", "    def __init__(self):\n", "        super(DecoderLayer, self).__init__()\n", "        self.dec_self_attn = MultiHeadAttention().to(device)\n", "        self.dec_enc_attn = MultiHeadAttention().to(device)\n", "        self.pos_ffn = PoswiseFeedForwardNet().to(device)\n", "\n", "    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):\n", "        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)\n", "        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)\n", "        dec_outputs = self.pos_ffn(dec_outputs)\n", "        return dec_outputs, dec_self_attn, dec_enc_attn" ], "metadata": { "id": "SDgRmYK-laRK" }, "execution_count": 25, "outputs": [] },
{ "cell_type": "code", "source": [ "## Let's train a model for translation from Norwegian to English\n", "\n", "def make_batch(sentences):\n", "    input_batch = [[src_vocab[n] for n in sentences[0].split()]]\n", "    output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]\n", "    target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]\n", "    return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)\n", "\n", "\n", "sentences = ['jeg elsker dyp læring P', 'S i love deep learning', 'i love deep learning E']\n", "\n", "\n", "# Transformer Parameters\n", "## vocabularies\n", "src_vocab = {'P': 0, 'jeg': 1, 'elsker': 2, 'dyp': 3, 'læring': 4}\n", "src_vocab_size = len(src_vocab)\n", "\n", "tgt_vocab = {'P': 0, 'i': 1, 'love': 2, 'deep': 3, 'learning': 4, 'S': 5, 'E': 6}\n", "tgt_vocab_size = len(tgt_vocab)\n", "\n", "src_len = 5 # length of the source sequence\n", "tgt_len = 5 # length of the target sequence\n", "\n", "## hyper-parameters\n", "d_model = 512 # Embedding Size\n", "d_ff = 2048 # FeedForward dimension\n", "d_k = d_v = 64 # dimension of K(=Q), V\n", "n_layers = 6 # number of encoder / decoder layers\n", "n_heads = 8 # number of heads in Multi-Head Attention\n", "\n", "model = Transformer().to(device)\n", "\n", "criterion = nn.CrossEntropyLoss()\n", "optimizer = optim.Adam(model.parameters(), lr=0.0001)\n", "\n", "enc_inputs, dec_inputs, target_batch = make_batch(sentences)\n", "\n", "for epoch in range(20):\n", "    optimizer.zero_grad()\n", "    outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs.to(device), dec_inputs.to(device))\n", "    loss = criterion(outputs.view(-1, outputs.size(-1)).to(device), target_batch.contiguous().view(-1).to(device))\n", "    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", "    loss.backward()\n", "    optimizer.step()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1kOVL5H8leCW", "outputId": "b20b3ab7-4b1b-4177-c4af-ae4a8c4a3ebc" }, "execution_count": 26, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch: 0001 cost = 1.901744\n", "Epoch: 0002 cost = 1.185367\n", "Epoch: 0003 cost = 0.887102\n",
"Epoch: 0004 cost = 0.655738\n", "Epoch: 0005 cost = 0.129070\n", "Epoch: 0006 cost = 0.067563\n", "Epoch: 0007 cost = 0.040468\n", "Epoch: 0008 cost = 0.025629\n", "Epoch: 0009 cost = 0.017392\n", "Epoch: 0010 cost = 0.005468\n", "Epoch: 0011 cost = 0.003348\n", "Epoch: 0012 cost = 0.002447\n", "Epoch: 0013 cost = 0.002104\n", "Epoch: 0014 cost = 0.002414\n", "Epoch: 0015 cost = 0.001725\n", "Epoch: 0016 cost = 0.001988\n", "Epoch: 0017 cost = 0.001315\n", "Epoch: 0018 cost = 0.001588\n", "Epoch: 0019 cost = 0.001371\n", "Epoch: 0020 cost = 0.000982\n" ] } ] }, { "cell_type": "code", "source": [ "# Now let's see the outputs\n", "model.eval()\n", "outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs.to(device), dec_inputs.to(device))\n", "result_index = torch.max(outputs.view(-1, outputs.size(-1)), 1) \n", "print(result_index.indices)" ], "metadata": { "id": "jW_xv2OlmNMB", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "7c1ead41-dfbe-447c-aa1c-cbb92168b3e8" }, "execution_count": 27, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "tensor([1, 2, 3, 4, 6], device='cuda:0')\n" ] } ] }, { "cell_type": "markdown", "source": [ "It is expected to get 'tensor([1, 2, 3, 4, 6])' " ], "metadata": { "id": "OJBhNC6HmVTl" } }, { "cell_type": "markdown", "source": [ "# Question\n", "\n", "1. What is the meaning of '[1, 2, 3, 4, 6]'?\n", "\n", "2. Can you train the model to translate another language? \n", "please put your answer below. " ], "metadata": { "id": "RQIUuUUiIYz9" } }, { "cell_type": "code", "source": [], "metadata": { "id": "6A1jaTDjVTZq" }, "execution_count": 27, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Transformer with Morse Code \n", "\"Image\"" ], "metadata": { "id": "dTEeARSRVZhZ" } }, { "cell_type": "markdown", "source": [ "Task:\n", "\n", "1. Understand Transformer translation of Morse Code\n", "2. Change some parameters and play around\n", "3. 
Further understand the process of Tranformer mechanism" ], "metadata": { "id": "3GDGX5Q2W0xA" } }, { "cell_type": "code", "source": [ "## hyper-parameters\n", "epoch = 1\n", "batch_size = 16\n", "learning_rate = 0.0001\n", "d_model = 512 # Embedding Size\n", "d_ff = 2048 # FeedForward dimension\n", "d_k = d_v = 64 # dimension of K(=Q), V\n", "n_layers = 6 # number of Encoder of Decoder Layer\n", "n_heads = 8 # number of heads in Multi-Head Attention\n", "src_vocab_size = 100 # length of source\n", "tgt_vocab_size = 100 # length of target" ], "metadata": { "id": "WP2y47dMKoql" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "class CodeDataset(Dataset):\n", " def __init__(self):\n", " download_url('https://github.com/ecs-vlc/COMP6248/raw/master/exercises/lab7/dataset.txt', '.', 'dataset.txt', None)\n", " with io.open('dataset.txt', 'r') as f:\n", " self.data = f.readlines()\n", "\n", " self.PAD='_'\n", " self.SOS='^'\n", " self.EOS='$'\n", " self.PAD_IDX=0\n", "\n", " # construct the vocabularies to numericalise the data\n", " self.alphabet = \"*\".join(self.PAD+self.SOS+self.EOS+\"abcdefghijklmnopqrstuvwxyz \").split('*')\n", "\n", " self.alphabet_indices = dict((c, i) for i, c in enumerate(self.alphabet))\n", " self.indices_alphabet = dict((i, c) for i, c in enumerate(self.alphabet))\n", "\n", " self.morsebet = self.PAD+self.SOS+self.EOS+'.- /' ##### '_^$.- /'\n", " self.morse_indices = dict((c, i) for i, c in enumerate(self.morsebet))\n", " self.indices_morse = dict((i, c) for i, c in enumerate(self.morsebet))\n", "\n", " def encode_alpha(self, inp):\n", " x = torch.zeros(len(inp), dtype=torch.long)\n", " for t, char in enumerate(inp):\n", " x[t] = self.alphabet_indices[char]\n", "\n", " return x\n", "\n", " def decode_alpha(self, ten, skip_tok=False):\n", " s = ''\n", " ten = ten.view(-1)\n", " for v in ten.view(-1):\n", " if not skip_tok:\n", " s += self.indices_alphabet[v.item()]\n", " elif v>2:\n", " s += self.indices_alphabet[v.item()]\n", " return s\n", "\n", " def encode_morse(self, inp):\n", " x = torch.zeros(len(inp), dtype=torch.long)\n", " for t, char in enumerate(inp):\n", " x[t] = self.morse_indices[char]\n", "\n", " return x\n", "\n", " def decode_morse(self, ten):\n", " s = ''\n", " for v in ten:\n", " s += self.indices_morse[v]\n", " return s\n", "\n", " def __len__(self):\n", " return len(self.data)\n", "\n", " def __getitem__(self, i, max_len_input=30, max_len_label=30):\n", " inp, out = self.data[i].strip().split('|')\n", " x = self.encode_morse(inp)\n", " y = self.encode_alpha(out)\n", "\n", " if max_len_input and max_len_label is not None:\n", " x_len = min(len(x), max_len_input)\n", " y_len = min(len(y), max_len_label)\n", "\n", " # Pad the input sequence with the PAD character\n", " x_padded = F.pad(x[:x_len], (0, max_len_input - x_len), value=self.morse_indices[self.PAD])\n", "\n", " # Pad the output sequence with the PAD character\n", " y_padded = F.pad(y[:y_len], (0, max_len_label - y_len), value=self.alphabet_indices[self.PAD])\n", "\n", " return x_padded, y_padded\n", "\n", " return x, y" ], "metadata": { "id": "NMJnASSPLhGT" }, "execution_count": 29, "outputs": [] }, { "cell_type": "code", "source": [ "# This will be used to automatically pad all batch items to the same length\n", "def pad_collate(batch):\n", " data = [item[0] for item in batch]\n", " data = pad_sequence(data)\n", " targets = [item[1] for item in batch]\n", " targets = pad_sequence(targets)\n", " return [data, targets]\n", "\n", "# Load the data and split randomly 
into training and val subsets\n", "ds = CodeDataset()\n", "tr, va = random_split(ds, [len(ds) - len(ds)//3, len(ds)//3])\n", "trainloader = DataLoader(tr, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)\n", "valloader = DataLoader(va, batch_size=batch_size, shuffle=False, collate_fn=pad_collate)" ], "metadata": { "id": "a3v4CeDPLkQb", "colab": { "base_uri": "https://localhost:8080/", "height": 66, "referenced_widgets": [ "a8e4909967c440ffa02b63a013f12adb", "3f981889ad224474839b1b7ae8ca08e3", "73472dfbbbb74ba3b4884a51199a1de9", "c13706913456492baa77b44073dc1271", "93fa292923d64ef6a0d81f3beca6b3c2", "f83d8c153a76420cb7c77dca993f91f0", "22f22b431dc04a1b8d13823068a83067", "1875f86e3db14fae9a7d09e644343b7d", "6f18abcd355748979d893a906848e5dd", "b42a7b8ad2494eb6aa550d6dc6f47b5a", "6c932e88f54f4cd8b1987d0b5a7d0d9b" ] }, "outputId": "344d7bbc-b70e-41e9-82b8-71c894043420" }, "execution_count": 30, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading https://raw.githubusercontent.com/ecs-vlc/COMP6248/master/exercises/lab7/dataset.txt to ./dataset.txt\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " 0%| | 0/5482800 [00:00