{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4b24e71b-8ae9-4b95-8ab6-52f027398e9a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import sys\n",
"import torch.nn as nn\n",
"import torch\n",
"import warnings\n",
"import numpy as np\n",
"from sklearn.model_selection import ParameterGrid\n",
"sys.path.append('/home/jovyan/work/d2l_solutions/notebooks/exercises/d2l_utils/')\n",
"import d2l\n",
"from torchsummary import summary\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"class Seq2SeqEncoder(d2l.Encoder): #@save\n",
" \"\"\"The RNN encoder for sequence-to-sequence learning.\"\"\"\n",
" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout=0):\n",
" super().__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.rnn = d2l.GRU(embed_size, num_hiddens, num_layers, dropout)\n",
" self.apply(init_seq2seq)\n",
"\n",
" def forward(self, X, *args):\n",
" # X shape: (batch_size, num_steps)\n",
" embs = self.embedding(X.t().type(torch.int64))\n",
" # embs shape: (num_steps, batch_size, embed_size)\n",
" outputs, state = self.rnn(embs)\n",
" # outputs shape: (num_steps, batch_size, num_hiddens)\n",
" # state shape: (num_layers, batch_size, num_hiddens)\n",
" return outputs, state\n",
" \n",
"class Seq2SeqDecoder(d2l.Decoder):\n",
" \"\"\"The RNN decoder for sequence to sequence learning.\"\"\"\n",
" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout=0):\n",
" super().__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.rnn = d2l.GRU(embed_size+num_hiddens, num_hiddens,\n",
" num_layers, dropout)\n",
" self.dense = nn.LazyLinear(vocab_size)\n",
" self.apply(init_seq2seq)\n",
"\n",
" def init_state(self, enc_all_outputs, *args):\n",
" return enc_all_outputs\n",
"\n",
" def forward(self, X, state):\n",
" # X shape: (batch_size, num_steps)\n",
" # embs shape: (num_steps, batch_size, embed_size)\n",
" embs = self.embedding(X.t().type(torch.int32))\n",
" enc_output, hidden_state = state\n",
" # context shape: (batch_size, num_hiddens)\n",
" context = enc_output[-1]\n",
" # Broadcast context to (num_steps, batch_size, num_hiddens)\n",
" context = context.repeat(embs.shape[0], 1, 1)\n",
" # Concat at the feature dimension\n",
" embs_and_context = torch.cat((embs, context), -1)\n",
" # print(embs_and_context.shape,len(hidden_state))\n",
" outputs, hidden_state = self.rnn(embs_and_context, hidden_state)\n",
" outputs = self.dense(outputs).swapaxes(0, 1)\n",
" # outputs shape: (batch_size, num_steps, vocab_size)\n",
" # hidden_state shape: (num_layers, batch_size, num_hiddens)\n",
" return outputs, [enc_output, hidden_state]\n",
" \n",
"class Seq2Seq(d2l.EncoderDecoder): #@save\n",
" \"\"\"The RNN encoder--decoder for sequence to sequence learning.\"\"\"\n",
" def __init__(self, encoder, decoder, tgt_pad, lr):\n",
" super().__init__(encoder, decoder)\n",
" self.save_hyperparameters()\n",
" \n",
" def loss(self, Y_hat, Y):\n",
" l = super(Seq2Seq, self).loss(Y_hat, Y, averaged=False)\n",
" mask = (Y.reshape(-1) != self.tgt_pad).type(torch.float32)\n",
" return (l * mask).sum() / mask.sum()\n",
"\n",
" def validation_step(self, batch, plot_flag=True):\n",
" Y_hat = self(*batch[:-1])\n",
" l = self.loss(Y_hat, batch[-1])\n",
" if plot_flag:\n",
" self.plot('loss', l, train=False)\n",
" return l\n",
"\n",
" def configure_optimizers(self):\n",
" # Adam optimizer is used here\n",
" return torch.optim.Adam(self.parameters(), lr=self.lr)\n",
" \n",
"def init_seq2seq(module): #@save\n",
" \"\"\"Initialize weights for sequence-to-sequence learning.\"\"\"\n",
" if type(module) == nn.Linear:\n",
" nn.init.xavier_uniform_(module.weight)\n",
" if type(module) == nn.GRU:\n",
" for param in module._flat_weights_names:\n",
" if \"weight\" in param:\n",
" nn.init.xavier_uniform_(module._parameters[param])\n",
"\n",
"def stat_val(model, data):\n",
" ppls = []\n",
" for batch in iter(data.get_dataloader(False)):\n",
" ppls.append(model.validation_step(batch, plot_flag=False).detach().numpy())\n",
" return np.exp(np.mean(ppls))\n",
"\n",
"def experiment(model, data):\n",
" trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=0)\n",
" trainer.fit(model, data)\n",
" return stat_val(model, data)"
]
},
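{
"cell_type": "markdown",
"id": "shape-check-lead-in-0001",
"metadata": {},
"source": [
"Before training, a quick sanity check on dummy inputs confirms the tensor shapes documented in the comments above (a sketch; the sizes below are arbitrary)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "shape-check-code-0001",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check encoder/decoder shapes on dummy data (all sizes arbitrary)\n",
"vocab_size, embed_size, num_hiddens, num_layers = 10, 8, 16, 2\n",
"batch_size, num_steps = 4, 9\n",
"encoder = Seq2SeqEncoder(vocab_size, embed_size, num_hiddens, num_layers)\n",
"decoder = Seq2SeqDecoder(vocab_size, embed_size, num_hiddens, num_layers)\n",
"X = torch.zeros((batch_size, num_steps), dtype=torch.int64)\n",
"state = decoder.init_state(encoder(X))\n",
"Y_hat, _ = decoder(X, state)\n",
"Y_hat.shape  # expected: (batch_size, num_steps, vocab_size)"
]
},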
{
"cell_type": "markdown",
"id": "c8b8a3ca-c83b-4883-bc21-be3eb945e7d3",
"metadata": {},
"source": [
"# 1. Can you adjust the hyperparameters to improve the translation results?"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4c9090b0-dcf6-4aa5-b56a-865c9588bac2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading ../data/fra-eng.zip from http://d2l-data.s3-accelerate.amazonaws.com/fra-eng.zip...\n"
]
}
],
"source": [
"data = d2l.MTFraEng(batch_size=128)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35928e47-55fd-4bb2-9530-714b10d00dbc",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data = d2l.MTFraEng(batch_size=128)\n",
"embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2\n",
"param_grid = {'embed_size':[128, 256, 512],\n",
" 'num_hiddens':[128, 256, 512],\n",
" 'num_layers':[1,2,3],\n",
" 'dropout':[0, 0.1, 0.2, 0.5]\n",
" # 'lr':[0.001,0.003,0.005, 0.01]\n",
" }\n",
"param_grid_obj = ParameterGrid(param_grid)\n",
"ppls = []\n",
"for params in param_grid_obj:\n",
" encoder = Seq2SeqEncoder(\n",
" len(data.src_vocab), **params)\n",
" decoder = Seq2SeqDecoder(\n",
" len(data.tgt_vocab), **params)\n",
" model = Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab[''],\n",
" lr=0.005)\n",
" ppl = experiment(model, data)\n",
" ppls.append(ppl)\n",
" print(params, ppl)"
]
},
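{
"cell_type": "markdown",
"id": "grid-cost-note-0002",
"metadata": {},
"source": [
"The full grid above has 3 × 3 × 3 × 4 = 108 configurations, each trained for 30 epochs, which is expensive without a GPU. A cheaper sketch (the subset size `k=10` and the seed are arbitrary choices, not from the original): evaluate a random sample of the grid."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "grid-random-subset-0002",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"# Evaluate a random subset of the grid instead of all 108 configurations\n",
"random.seed(0)\n",
"for params in random.sample(list(param_grid_obj), k=10):\n",
"    encoder = Seq2SeqEncoder(len(data.src_vocab), **params)\n",
"    decoder = Seq2SeqDecoder(len(data.tgt_vocab), **params)\n",
"    model = Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab['<pad>'],\n",
"                    lr=0.005)\n",
"    print(params, experiment(model, data))"
]
},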
{
"cell_type": "markdown",
"id": "3cb6e152-0d35-43b2-94a4-5b1a37464a77",
"metadata": {},
"source": [
"# 2. Rerun the experiment without using masks in the loss calculation. What results do you observe? Why?"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "da61ec83-56a2-4912-b924-dfbe891cf1f7",
"metadata": {},
"outputs": [],
"source": [
"class NoMaskSeq2Seq(Seq2Seq): #@save\n",
" \"\"\"The RNN encoder--decoder for sequence to sequence learning.\"\"\"\n",
" def __init__(self, encoder, decoder, tgt_pad, lr):\n",
" super().__init__(encoder, decoder, tgt_pad, lr)\n",
" self.save_hyperparameters()\n",
" \n",
" def loss(self, Y_hat, Y):\n",
" l = super(Seq2Seq, self).loss(Y_hat, Y, averaged=False)\n",
" return l.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f398647b-0eff-46d3-be30-5d90b5a961a8",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2\n",
"encoder = Seq2SeqEncoder(\n",
" len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)\n",
"decoder = Seq2SeqDecoder(\n",
" len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)\n",
"model = NoMaskSeq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab[''],\n",
" lr=0.005)\n",
"trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=0)\n",
"trainer.fit(model, data)"
]
},
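{
"cell_type": "markdown",
"id": "nomask-why-note-0003",
"metadata": {},
"source": [
"Without the mask, every `<pad>` position contributes to the averaged loss. Padding is trivially predictable (the model quickly learns to keep emitting `<pad>`), so the unmasked loss looks lower without the translations actually improving. The sketch below (assuming the batch layout used by `validation_step`, with the labels last) shows how much of a target batch is padding."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "pad-fraction-check-0003",
"metadata": {},
"outputs": [],
"source": [
"# Fraction of <pad> tokens among the target labels of one batch\n",
"batch = next(iter(data.get_dataloader(False)))\n",
"Y = batch[-1]  # labels, shape (batch_size, num_steps)\n",
"pad_frac = (Y == data.tgt_vocab['<pad>']).float().mean()\n",
"print(f'fraction of <pad> tokens: {pad_frac:.2f}')"
]
},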
{
"cell_type": "markdown",
"id": "0eb69769-38d7-46b5-8fba-3a5c28c43208",
"metadata": {},
"source": [
"# 3. If the encoder and the decoder differ in the number of layers or the number of hidden units, how can we initialize the hidden state of the decoder?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ba82da3-fc3f-4d76-a9a7-fe80d643dea7",
"metadata": {},
"outputs": [],
"source": [
"class DiffSeq2SeqDecoder(d2l.Decoder, d2l.HyperParameters):\n",
" \"\"\"The RNN decoder for sequence to sequence learning.\"\"\"\n",
" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout=0):\n",
" super().__init__()\n",
" self.save_hyperparameters()\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.rnn = d2l.GRU(embed_size+num_hiddens, num_hiddens,\n",
" num_layers, dropout)\n",
" self.dense = nn.LazyLinear(vocab_size)\n",
" self.apply(init_seq2seq)\n",
"\n",
" def init_state(self, enc_all_outputs, *args):\n",
" tran = nn.LazyLinear(self.num_hiddens*self.num_layers)\n",
" H = enc_all_outputs[1].swapaxes(0, 1)\n",
" H = H.reshape(H.shape[0], -1)\n",
" S = tran(H)\n",
" S = S.reshape(S.shape[0],-1, self.num_hiddens)\n",
" S = S.swapaxes(0, 1)\n",
" return enc_all_outputs[0], S\n",
"\n",
" def forward(self, X, state):\n",
" # X shape: (batch_size, num_steps)\n",
" # embs shape: (num_steps, batch_size, embed_size)\n",
" embs = self.embedding(X.t().type(torch.int32))\n",
" enc_output, hidden_state = state\n",
" # context shape: (batch_size, num_hiddens)\n",
" context = enc_output[-1]\n",
" # Broadcast context to (num_steps, batch_size, num_hiddens)\n",
" context = context.repeat(embs.shape[0], 1, 1)\n",
" # Concat at the feature dimension\n",
" embs_and_context = torch.cat((embs, context), -1)\n",
" outputs, hidden_state = self.rnn(embs_and_context, hidden_state)\n",
" outputs = self.dense(outputs).swapaxes(0, 1)\n",
" # outputs shape: (batch_size, num_steps, vocab_size)\n",
" # hidden_state shape: (num_layers, batch_size, num_hiddens)\n",
" return outputs, [enc_output, hidden_state]\n",
" "
]
},
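{
"cell_type": "markdown",
"id": "diff-init-alternative-0004",
"metadata": {},
"source": [
"The decoder above learns a `LazyLinear` projection from the encoder's flattened final state to whatever state size the decoder needs. When only the layer counts differ (hidden sizes equal), a parameter-free alternative is to truncate or repeat encoder layers; `match_layers` below is a hypothetical helper sketching that idea, not part of d2l."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "match-layers-sketch-0004",
"metadata": {},
"outputs": [],
"source": [
"def match_layers(enc_state, dec_num_layers):\n",
"    \"\"\"Hypothetical helper: adapt an (enc_num_layers, batch_size,\n",
"    num_hiddens) state to dec_num_layers by keeping the top layers\n",
"    (shallower decoder) or repeating the top layer (deeper decoder).\n",
"    Requires equal hidden sizes.\"\"\"\n",
"    enc_num_layers = enc_state.shape[0]\n",
"    if dec_num_layers <= enc_num_layers:\n",
"        return enc_state[-dec_num_layers:]\n",
"    extra = enc_state[-1:].repeat(dec_num_layers - enc_num_layers, 1, 1)\n",
"    return torch.cat((enc_state, extra), 0)\n",
"\n",
"# e.g. a 2-layer encoder state adapted for a 3-layer decoder\n",
"match_layers(torch.zeros(2, 4, 16), 3).shape  # -> (3, 4, 16)"
]
},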
{
"cell_type": "code",
"execution_count": 51,
"id": "d4cf2c4f-6f4a-427c-bf96-6cf39a6683db",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4.833370327949524, 5.980086803436279)"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2\n",
"encoder = Seq2SeqEncoder(\n",
" len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)\n",
"decoder = DiffSeq2SeqDecoder(\n",
" len(data.tgt_vocab), embed_size, num_hiddens, num_layers+1, dropout)\n",
"model = Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab[''],\n",
" lr=0.005)\n",
"trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=0)\n",
"trainer.fit(model, data)"
]
},
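{
"cell_type": "markdown",
"id": "fit-return-note-0005",
"metadata": {},
"source": [
"For reference, the pair printed above is the final `(train_loss, valid_loss)` returned by `Trainer.fit` in the local `d2l.py`."
]
},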
{
"cell_type": "markdown",
"id": "95b58fa5-d374-4194-834e-a3b01119561c",
"metadata": {},
"source": [
"# 4. In training, replace teacher forcing with feeding the prediction at the previous time step into the decoder. How does this influence the performance?"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "24bd2fa0-c8c4-454c-a56a-27c14f50395a",
"metadata": {},
"outputs": [],
"source": [
"class NoTeacherForceSeq2SeqDecoder(d2l.Decoder):\n",
" \"\"\"The RNN decoder for sequence to sequence learning.\"\"\"\n",
" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout=0):\n",
" super().__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.rnn = d2l.GRU(embed_size+num_hiddens, num_hiddens,\n",
" num_layers, dropout)\n",
" self.dense = nn.LazyLinear(vocab_size)\n",
" self.apply(init_seq2seq)\n",
"\n",
" def init_state(self, enc_all_outputs, *args):\n",
" return enc_all_outputs\n",
"\n",
" def forward(self, X, state):\n",
" # X shape: (batch_size, num_steps)\n",
" # embs shape: (num_steps, batch_size, embed_size)\n",
" embs = self.embedding(X.t().type(torch.int32))\n",
" enc_output, hidden_state = state\n",
" # context shape: (batch_size, num_hiddens)\n",
" context = enc_output[-1]\n",
" context = context.repeat(1, 1, 1)\n",
" outputs = []\n",
" for i in range(embs.shape[0]):\n",
" embs_and_context = torch.cat((embs[i:i+1], context), -1)\n",
" Y, hidden_state = self.rnn(embs_and_context, hidden_state)\n",
" outputs.append(Y)\n",
" # Broadcast context to (num_steps, batch_size, num_hiddens)\n",
" outputs = torch.cat(outputs,0)\n",
" outputs = self.dense(outputs).swapaxes(0, 1)\n",
" # outputs shape: (batch_size, num_steps, vocab_size)\n",
" # hidden_state shape: (num_layers, batch_size, num_hiddens)\n",
" return outputs, [enc_output, hidden_state]"
]
},
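{
"cell_type": "markdown",
"id": "scheduled-sampling-note-0006",
"metadata": {},
"source": [
"Feeding the model's own predictions makes decoding strictly sequential (the time-step loop above cannot be parallelized) and usually hurts convergence early in training, when predictions are mostly wrong and errors compound. A common middle ground is scheduled sampling; the sketch below (a hypothetical variant, not part of the original) feeds the ground-truth token with probability `tf_ratio` and the model's own prediction otherwise."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "scheduled-sampling-sketch-0006",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"class ScheduledSamplingDecoder(NoTeacherForceSeq2SeqDecoder):\n",
"    \"\"\"Sketch: mix teacher forcing and self-feeding per time step.\"\"\"\n",
"    def __init__(self, *args, tf_ratio=0.5, **kwargs):\n",
"        super().__init__(*args, **kwargs)\n",
"        self.tf_ratio = tf_ratio\n",
"\n",
"    def forward(self, X, state):\n",
"        embs = self.embedding(X.t().type(torch.int64))\n",
"        enc_output, hidden_state = state\n",
"        context = enc_output[-1].unsqueeze(0)\n",
"        outputs, emb = [], embs[0:1]\n",
"        for i in range(embs.shape[0]):\n",
"            Y, hidden_state = self.rnn(torch.cat((emb, context), -1),\n",
"                                       hidden_state)\n",
"            Y = self.dense(Y)\n",
"            outputs.append(Y)\n",
"            if i + 1 < embs.shape[0]:\n",
"                if random.random() < self.tf_ratio:\n",
"                    emb = embs[i+1:i+2]  # ground-truth token\n",
"                else:\n",
"                    emb = self.embedding(Y.argmax(dim=-1))  # own prediction\n",
"        outputs = torch.cat(outputs, 0).swapaxes(0, 1)\n",
"        return outputs, [enc_output, hidden_state]"
]
},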
{
"cell_type": "code",
"execution_count": null,
"id": "23e32ecb-1319-48fa-a366-cdbf1bcb070f",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2\n",
"encoder = Seq2SeqEncoder(\n",
" len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)\n",
"decoder = NoTeacherForceSeq2SeqDecoder(\n",
" len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)\n",
"model = Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab[''],\n",
" lr=0.005)\n",
"trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=0)\n",
"trainer.fit(model, data)"
]
},
{
"cell_type": "markdown",
"id": "4f9390b0-8983-4acd-a18f-e00ae21cb515",
"metadata": {},
"source": [
"# 5. Rerun the experiment by replacing GRU with LSTM."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "4867bfbc-023c-49aa-9457-a57229d76644",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"class LSTM(d2l.RNN):\n",
" \"\"\"The multilayer GRU model.\n",
"\n",
" Defined in :numref:`sec_deep_rnn`\"\"\"\n",
" def __init__(self, num_inputs, num_hiddens, num_layers, dropout=0):\n",
" d2l.Module.__init__(self)\n",
" self.save_hyperparameters()\n",
" self.rnn = nn.LSTM(num_inputs, num_hiddens, num_layers,\n",
" dropout=dropout)\n",
" \n",
"class LSTMSeq2SeqEncoder(Seq2SeqEncoder): #@save\n",
" \"\"\"The RNN encoder for sequence-to-sequence learning.\"\"\"\n",
" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout=0):\n",
" super().__init__(vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout)\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.rnn = LSTM(embed_size, num_hiddens, num_layers, dropout)\n",
" self.apply(init_seq2seq)\n",
" \n",
"class LSTMSeq2SeqDecoder(Seq2SeqDecoder):\n",
" \"\"\"The RNN decoder for sequence to sequence learning.\"\"\"\n",
" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout=0):\n",
" super().__init__(vocab_size, embed_size, num_hiddens, num_layers,\n",
" dropout)\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.rnn = LSTM(embed_size+num_hiddens, num_hiddens,\n",
" num_layers, dropout)\n",
" self.dense = nn.LazyLinear(vocab_size)\n",
" self.apply(init_seq2seq)"
]
},
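{
"cell_type": "markdown",
"id": "lstm-state-note-0007",
"metadata": {},
"source": [
"Unlike the GRU, `nn.LSTM` returns its state as a `(hidden, cell)` tuple. `init_state` passes it through unchanged and `nn.LSTM` accepts the tuple directly at the next call, so the decoder needs no further changes. A quick check (arbitrary sizes):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "lstm-state-check-0007",
"metadata": {},
"outputs": [],
"source": [
"# The LSTM encoder's state is an (h, c) tuple rather than a single tensor\n",
"enc = LSTMSeq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,\n",
"                         num_layers=2)\n",
"X = torch.zeros((4, 9), dtype=torch.int64)\n",
"outputs, state = enc(X)\n",
"type(state), state[0].shape, state[1].shape\n",
"# expected: tuple, h and c both of shape (num_layers, batch_size, num_hiddens)"
]
},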
{
"cell_type": "code",
"execution_count": 13,
"id": "ef3715e3-3b50-408a-8add-9f4a5bae10d2",
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[13], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m Seq2Seq(encoder, decoder, tgt_pad\u001b[38;5;241m=\u001b[39mdata\u001b[38;5;241m.\u001b[39mtgt_vocab[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 8\u001b[0m lr\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.005\u001b[39m)\n\u001b[1;32m 9\u001b[0m trainer \u001b[38;5;241m=\u001b[39m d2l\u001b[38;5;241m.\u001b[39mTrainer(max_epochs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m30\u001b[39m, gradient_clip_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, num_gpus\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[0;32m---> 10\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:208\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, data)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mval_batch_idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_epochs):\n\u001b[0;32m--> 208\u001b[0m train_loss, valid_loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_epoch\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mepoch \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 210\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m train_loss, valid_loss\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:224\u001b[0m, in \u001b[0;36mTrainer.fit_epoch\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 220\u001b[0m train_loss, valid_loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtrain_dataloader:\n\u001b[1;32m 222\u001b[0m \u001b[38;5;66;03m# if len(batch[0]) != 32:\u001b[39;00m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;66;03m# print(len(batch[0]))\u001b[39;00m\n\u001b[0;32m--> 224\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 225\u001b[0m \u001b[43m \u001b[49m\u001b[43mplot_flag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot_flag\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;66;03m# print(f'step train loss:{loss}, T:{self.model.T}')\u001b[39;00m\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptim\u001b[38;5;241m.\u001b[39mzero_grad()\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:333\u001b[0m, in \u001b[0;36mClassifier.training_step\u001b[0;34m(self, batch, plot_flag)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtraining_step\u001b[39m(\u001b[38;5;28mself\u001b[39m, batch, plot_flag\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[0;32m--> 333\u001b[0m y_hat \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mbatch\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 334\u001b[0m \u001b[38;5;66;03m# auc = torch.tensor(roc_auc_score(batch[-1].detach().numpy() , y_hat[:,1].detach().numpy()))\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m plot_flag:\n",
"File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:738\u001b[0m, in \u001b[0;36mEncoderDecoder.forward\u001b[0;34m(self, enc_X, dec_X, *args)\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, enc_X, dec_X, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m--> 738\u001b[0m enc_all_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\u001b[43menc_X\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 739\u001b[0m dec_state \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoder\u001b[38;5;241m.\u001b[39minit_state(enc_all_outputs, \u001b[38;5;241m*\u001b[39margs)\n\u001b[1;32m 740\u001b[0m \u001b[38;5;66;03m# print(dec_X.shape,len(dec_state))\u001b[39;00m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;66;03m# Return decoder output only\u001b[39;00m\n",
"File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
"Cell \u001b[0;32mIn[7], line 25\u001b[0m, in \u001b[0;36mSeq2SeqEncoder.forward\u001b[0;34m(self, X, *args)\u001b[0m\n\u001b[1;32m 23\u001b[0m embs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membedding(X\u001b[38;5;241m.\u001b[39mt()\u001b[38;5;241m.\u001b[39mtype(torch\u001b[38;5;241m.\u001b[39mint64))\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# embs shape: (num_steps, batch_size, embed_size)\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m outputs, state \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrnn\u001b[49m\u001b[43m(\u001b[49m\u001b[43membs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# outputs shape: (num_steps, batch_size, num_hiddens)\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# state shape: (num_layers, batch_size, num_hiddens)\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs, state\n",
"File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:666\u001b[0m, in \u001b[0;36mRNN.forward\u001b[0;34m(self, inputs, H)\u001b[0m\n\u001b[1;32m 665\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, inputs, H\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m--> 666\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrnn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mH\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
"File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/modules/rnn.py:812\u001b[0m, in \u001b[0;36mLSTM.forward\u001b[0;34m(self, input, hx)\u001b[0m\n\u001b[1;32m 810\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcheck_forward_args(\u001b[38;5;28minput\u001b[39m, hx, batch_sizes)\n\u001b[1;32m 811\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m batch_sizes \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 812\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43m_VF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlstm\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_flat_weights\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnum_layers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 813\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbidirectional\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbatch_first\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 815\u001b[0m result \u001b[38;5;241m=\u001b[39m _VF\u001b[38;5;241m.\u001b[39mlstm(\u001b[38;5;28minput\u001b[39m, batch_sizes, hx, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_flat_weights, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias,\n\u001b[1;32m 816\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_layers, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbidirectional)\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"data = d2l.MTFraEng(batch_size=128)\n",
"embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2\n",
"encoder = LSTMSeq2SeqEncoder(\n",
" len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)\n",
"decoder = LSTMSeq2SeqDecoder(\n",
" len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)\n",
"model = Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab[''],\n",
" lr=0.005)\n",
"trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=0)\n",
"trainer.fit(model, data)"
]
},
{
"cell_type": "markdown",
"id": "0589bdc8-da0a-4bea-82a0-91150c602a22",
"metadata": {},
"source": [
"# 6. Are there any other ways to design the output layer of the decoder?"
]
},
{
"cell_type": "markdown",
"id": "ca164a50-cf53-42c6-8e19-8723fa3b6935",
"metadata": {},
"source": [
"There are several ways to design the output layer of the decoder in addition to using `nn.Linear`. The choice of the output layer design often depends on the specific task you are working on and the characteristics of your data. Here are some alternative ways to design the output layer:\n",
"\n",
"1. **Softmax Layer**: For tasks like sequence generation, machine translation, or language modeling, you can use a softmax layer as the output layer. This layer converts the decoder's hidden states into probability distributions over the vocabulary. Each element in the output represents the probability of a particular word in the vocabulary.\n",
"\n",
"2. **Linear Layer with Custom Activation**: Instead of using a simple linear layer, you can apply a custom activation function to the linearly transformed hidden states. For example, you can use a sigmoid activation for binary classification tasks or a hyperbolic tangent (tanh) for bounded outputs.\n",
"\n",
"3. **Attention Mechanism**: In sequence-to-sequence models with attention, the output layer is often combined with an attention mechanism. This allows the model to focus on specific parts of the input sequence when generating the output sequence. The output layer takes into account both the decoder's hidden state and the context vector obtained from attention.\n",
"\n",
"4. **Gated Layers**: For more complex sequence generation tasks, you can use gated layers like Gated Recurrent Units (GRUs) or Long Short-Term Memory (LSTM) units as the output layer. These layers have internal gating mechanisms that can capture long-range dependencies and improve sequence generation.\n",
"\n",
"5. **Custom Output Layer**: Depending on your specific task, you can design a custom output layer that suits the problem's requirements. This could involve using a combination of different neural network layers or applying domain-specific operations.\n",
"\n",
"6. **Hybrid Approaches**: In some cases, it may be beneficial to combine multiple output layers. For example, you can use a linear layer followed by a softmax layer for language modeling and then apply an additional linear layer for post-processing or to obtain specific representations.\n",
"\n",
"Remember that the choice of the output layer depends on the specific task and the nature of your data. Experimentation and model evaluation are often necessary to determine the most suitable output layer design."
]
}
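,
{
"cell_type": "markdown",
"id": "custom-head-lead-in-0008",
"metadata": {},
"source": [
"As a minimal sketch of points 2 and 5 (an illustration, not part of the notebook's models): an output head that inserts a tanh-activated hidden projection before the final vocabulary logits, usable as a drop-in replacement for the decoder's `self.dense`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "custom-head-sketch-0008",
"metadata": {},
"outputs": [],
"source": [
"class CustomOutputHead(nn.Module):\n",
"    \"\"\"Hypothetical output head: linear projection + tanh, then logits.\"\"\"\n",
"    def __init__(self, num_hiddens, vocab_size):\n",
"        super().__init__()\n",
"        self.proj = nn.Linear(num_hiddens, num_hiddens)\n",
"        self.out = nn.Linear(num_hiddens, vocab_size)\n",
"\n",
"    def forward(self, H):\n",
"        # H shape: (num_steps, batch_size, num_hiddens)\n",
"        return self.out(torch.tanh(self.proj(H)))  # vocabulary logits\n",
"\n",
"# Drop-in usage on an existing decoder:\n",
"# decoder.dense = CustomOutputHead(num_hiddens, len(data.tgt_vocab))"
]
}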
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:d2l]",
"language": "python",
"name": "conda-env-d2l-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}