{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"id": "af830d32-dbc2-40a6-818a-848e2cb02e8f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NbConvertApp] WARNING | Config option `kernel_spec_manager_class` not recognized by `NbConvertApp`.\n",
"[NbConvertApp] Converting notebook 10_7_10_Exercises.ipynb to markdown\n",
"[NbConvertApp] Support files will be in 10_7_10_Exercises_files/\n",
"[NbConvertApp] Making directory 10_7_10_Exercises_files\n",
"[NbConvertApp] Writing 15184 bytes to 10_7_10_Exercises.md\n"
]
}
],
"source": [
"!jupyter nbconvert --to markdown 10_7_10_Exercises.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "895b7391-a96a-4324-a177-8efcc0d83ff5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import sys\n",
"import torch.nn as nn\n",
"import torch\n",
"import warnings\n",
"from sklearn.model_selection import ParameterGrid\n",
"sys.path.append('/home/jovyan/work/d2l_solutions/notebooks/exercises/d2l_utils/')\n",
"import d2l\n",
"from torchsummary import summary\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "markdown",
"id": "a34f0bc1-db8b-4bc6-9d12-10cdf99b1e29",
"metadata": {},
"source": [
"# 1. Adjust the hyperparameters and analyze their influence on running time, perplexity, and the output sequence."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "a7e6d952-b52c-4868-86ca-2addaa088ae2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"class LSTM(d2l.RNN):\n",
"    \"\"\"Thin wrapper around `nn.LSTM` that plugs into the d2l model classes.\n",
"\n",
"    Args:\n",
"        num_inputs: Forwarded to `nn.LSTM` as its input size.\n",
"        num_hiddens: Forwarded to `nn.LSTM` as its hidden size.\n",
"        num_layers: Number of stacked LSTM layers.\n",
"        dropout: Dropout value forwarded to `nn.LSTM`.\n",
"    \"\"\"\n",
"    def __init__(self, num_inputs, num_hiddens, num_layers=1,\n",
"                 dropout=0):\n",
"        # Initialise d2l.Module directly instead of going through\n",
"        # d2l.RNN.__init__ (presumably so the parent does not build its own\n",
"        # recurrent layer -- confirm against d2l.RNN).\n",
"        d2l.Module.__init__(self)\n",
"        self.save_hyperparameters()\n",
"        self.rnn = nn.LSTM(num_inputs, num_hiddens, num_layers=num_layers, dropout=dropout)\n",
"\n",
"    def forward(self, inputs, H_C=None):\n",
"        \"\"\"Run the LSTM on `inputs` and return what `nn.LSTM` returns.\n",
"\n",
"        `H_C` is the optional initial state handed to `nn.LSTM` unchanged.\n",
"        \"\"\"\n",
"        # The debugging print of inputs.shape was removed: it ran on every\n",
"        # forward pass and flooded the cell output during training.\n",
"        return self.rnn(inputs, H_C)\n",
" \n",
"def stat_val(model, data):\n",
"    \"\"\"Return the validation perplexity of `model` on `data`.\n",
"\n",
"    The loss returned by `model.validation_step` is collected for every batch\n",
"    of the validation dataloader; the perplexity is the exponential of the\n",
"    mean of those losses.\n",
"\n",
"    Returns:\n",
"        A Python float, or NaN when the validation dataloader yields no batch.\n",
"    \"\"\"\n",
"    # torch is used instead of numpy because numpy is never imported in this\n",
"    # notebook (the previous `np.` calls raised NameError).\n",
"    with torch.no_grad():\n",
"        losses = [model.validation_step(batch, plot_flag=False).detach()\n",
"                  for batch in data.get_dataloader(False)]\n",
"    if not losses:\n",
"        return float('nan')\n",
"    return torch.exp(torch.stack(losses).mean()).item()\n",
"\n",
"def experient(data_class=d2l.TimeMachine, num_steps=32, num_hiddens=32, lr=1,\n",
"              batch_size=1024, max_epochs=100):\n",
"    \"\"\"Train a single-layer LSTM language model and return its validation perplexity.\n",
"\n",
"    Args:\n",
"        data_class: Dataset class, called with `batch_size` and `num_steps`.\n",
"        num_steps: Sequence length passed to the dataset.\n",
"        num_hiddens: Hidden size of the LSTM.\n",
"        lr: Learning rate passed to `d2l.RNNLM`.\n",
"        batch_size: Batch size passed to the dataset (previously fixed at 1024).\n",
"        max_epochs: Number of training epochs (previously fixed at 100).\n",
"\n",
"    Returns:\n",
"        The value computed by `stat_val` for the trained model.\n",
"    \"\"\"\n",
"    data = data_class(batch_size=batch_size, num_steps=num_steps)\n",
"    vocab_size = len(data.vocab)\n",
"    lstm = LSTM(num_inputs=vocab_size, num_hiddens=num_hiddens)\n",
"    model = d2l.RNNLM(lstm, vocab_size=vocab_size, lr=lr)\n",
"    trainer = d2l.Trainer(max_epochs=max_epochs, gradient_clip_val=1)  # optionally: num_gpus=1\n",
"    trainer.fit(model, data)\n",
"    return stat_val(model, data)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "93c30948-9396-495f-baa0-3f43f3407110",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([32, 1024, 28])\n",
"torch.Size([32, 1024, 28])\n",
"torch.Size([32, 1024, 28])\n",
"torch.Size([32, 1024, 28])\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[17], line 5\u001b[0m\n\u001b[1;32m 3\u001b[0m model \u001b[38;5;241m=\u001b[39m d2l\u001b[38;5;241m.\u001b[39mRNNLM(lstm, vocab_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(data\u001b[38;5;241m.\u001b[39mvocab), lr\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 4\u001b[0m trainer \u001b[38;5;241m=\u001b[39m d2l\u001b[38;5;241m.\u001b[39mTrainer(max_epochs\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m, gradient_clip_val\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m#, num_gpus=1\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:208\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, data)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mval_batch_idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_epochs):\n\u001b[0;32m--> 208\u001b[0m train_loss, valid_loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_epoch\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mepoch \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 210\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m train_loss, valid_loss\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:241\u001b[0m, in \u001b[0;36mTrainer.fit_epoch\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mval_dataloader:\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 241\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidation_step\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 242\u001b[0m \u001b[43m \u001b[49m\u001b[43mplot_flag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot_flag\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mval_batch_idx \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 244\u001b[0m valid_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mnumpy()\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:620\u001b[0m, in \u001b[0;36mRNNLMScratch.validation_step\u001b[0;34m(self, batch, plot_flag)\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvalidation_step\u001b[39m(\u001b[38;5;28mself\u001b[39m, batch, plot_flag\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[0;32m--> 620\u001b[0m l \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloss\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mbatch\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 621\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m plot_flag:\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mplot(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mppl\u001b[39m\u001b[38;5;124m'\u001b[39m, torch\u001b[38;5;241m.\u001b[39mexp(l), train\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
"File \u001b[0;32m~/work/d2l_solutions/notebooks/exercises/d2l_utils/d2l.py:362\u001b[0m, in \u001b[0;36mClassifier.loss\u001b[0;34m(self, y_hat, y, averaged)\u001b[0m\n\u001b[1;32m 360\u001b[0m y_hat \u001b[38;5;241m=\u001b[39m y_hat\u001b[38;5;241m.\u001b[39mreshape((\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, y_hat\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]))\n\u001b[1;32m 361\u001b[0m y \u001b[38;5;241m=\u001b[39m y\u001b[38;5;241m.\u001b[39mreshape((\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m,))\n\u001b[0;32m--> 362\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_hat\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmean\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\n\u001b[1;32m 363\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43maveraged\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnone\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.local/lib/python3.11/site-packages/torch/nn/functional.py:3029\u001b[0m, in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)\u001b[0m\n\u001b[1;32m 3027\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m size_average \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m reduce \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 3028\u001b[0m reduction \u001b[38;5;241m=\u001b[39m _Reduction\u001b[38;5;241m.\u001b[39mlegacy_get_string(size_average, reduce)\n\u001b[0;32m-> 3029\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_nn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_Reduction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_enum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Train a two-layer LSTM language model with the d2l trainer.\n",
"data = d2l.TimeMachine(batch_size=1024, num_steps=32)\n",
"vocab_size = len(data.vocab)\n",
"lstm = LSTM(num_inputs=vocab_size, num_hiddens=32, num_layers=2)\n",
"model = d2l.RNNLM(lstm, vocab_size=vocab_size, lr=1)\n",
"trainer = d2l.Trainer(max_epochs=100, gradient_clip_val=1)  # optionally: num_gpus=1\n",
"trainer.fit(model, data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3ec74a9-c679-4e72-9d63-918027dc3658",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Grid over sequence length, hidden size and learning rate (5 * 5 * 4 = 100 runs).\n",
"param_grid = {\n",
"    'num_steps': [8, 16, 32, 64, 128],\n",
"    'num_hiddens': [8, 16, 32, 64, 128],\n",
"    'lr': [0.01, 0.1, 1, 10],\n",
"}\n",
"param_grid_obj = ParameterGrid(param_grid)\n",
"ppls = []\n",
"for params in param_grid_obj:\n",
"    # One full training run per configuration; keep its validation perplexity.\n",
"    ppl = experient(**params)\n",
"    ppls += [ppl]\n",
"    print(params, ppl)"
]
},
{
"cell_type": "markdown",
"id": "c590d3f1-db29-4206-9168-61f66be7193d",
"metadata": {},
"source": [
"# 2. How would you need to change the model to generate proper words rather than just sequences of characters?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d6b7fba-f8de-4349-a0a1-5f1ab8deef2c",
"metadata": {},
"outputs": [],
"source": [
"class WordTimeMachine(d2l.TimeMachine):\n",
"    \"\"\"`d2l.TimeMachine` variant that tokenizes the text into words.\"\"\"\n",
"    def _tokenize(self, text):\n",
"        \"\"\"Split `text` into a list of word tokens.\"\"\"\n",
"        # split() without an argument splits on any run of whitespace and\n",
"        # drops empty strings, so leading, trailing or repeated spaces no\n",
"        # longer produce '' tokens as split(' ') did.\n",
"        return text.split()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2162bd3-a15a-4ae5-8847-497dee11f346",
"metadata": {},
"outputs": [],
"source": [
"experient(data_class=WordTimeMachine)"
]
},
{
"cell_type": "markdown",
"id": "91916317-4c1d-4a4e-a858-f53c68a1f008",
"metadata": {},
"source": [
"# 3. Compare the computational cost for GRUs, LSTMs, and regular RNNs for a given hidden dimension. Pay special attention to the training and inference cost."
]
},
{
"cell_type": "markdown",
"id": "e33471ae-458b-426d-a1c6-a351394767bc",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "be93986b-717c-444b-8be5-35144da55c23",
"metadata": {},
"source": [
"# 4. Since the candidate memory cell ensures that the value range is between -1 and 1 by using the tanh function, why does the hidden state need to use the tanh function again to ensure that the output value range is between -1 and 1?"
]
},
{
"cell_type": "markdown",
"id": "9e8970bd-e48d-408b-9c3c-255247e69f03",
"metadata": {},
"source": [
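"The candidate memory cell $\\tilde{C}_t$ is bounded by tanh, but the memory cell itself is not: $C_t = F_t \\odot C_{t-1} + I_t \\odot \\tilde{C}_t$ accumulates contributions over time, so its entries can drift far outside $[-1, 1]$ (for example, with $F_t \\approx 1$ and $I_t \\approx 1$ the magnitude can grow by up to 1 at every step). Applying tanh again in $H_t = O_t \\odot \\tanh(C_t)$ squashes this unbounded value back into $(-1, 1)$. The hidden state of an LSTM cell is the output that is passed to the next layer or the next time step. Therefore, it needs to have a consistent range of values that can be easily processed by other layers or cells. The tanh function ensures that the hidden state is bounded between -1 and 1, which is a common range for many activation functions and neural network operations. Moreover, the tanh function is a nonlinear function that can introduce some complexity and diversity to the hidden state, which can help the network learn more complex patterns and features. The tanh function also has a nice property that its derivative is easy to compute and does not suffer from the vanishing gradient problem as much as other functions like sigmoid.\n",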
"\n",
"Some sources suggest that the tanh function for the hidden state is not necessary and can be replaced by other functions or even omitted¹². However, this may depend on the specific task and data that the LSTM network is trying to model. In general, the tanh function for the hidden state is a reasonable choice that has been widely used and proven to work well in many applications.\n",
"\n",
"\n",
"- (1) What is the intuition of using tanh in LSTM? - Stack Overflow. https://stackoverflow.com/questions/40761185/what-is-the-intuition-of-using-tanh-in-lstm.\n",
"- (2) Keras documentation: LSTM layer. https://keras.io/api/layers/recurrent_layers/lstm/.\n",
"- (3) Why is there tanh(x)*sigmoid(x) in a LSTM cell?. https://ai.stackexchange.com/questions/32505/why-is-there-tanhxsigmoidx-in-a-lstm-cell.\n",
"- (4) What is the intuition of using tanh in LSTM? - 9to5Answer. https://9to5answer.com/what-is-the-intuition-of-using-tanh-in-lstm."
]
},
{
"cell_type": "markdown",
"id": "67892146-f51e-485b-b067-6c81e8fec75c",
"metadata": {},
"source": [
"# 5. Implement an LSTM model for time series prediction rather than character sequence prediction."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7ad43e8-ef0b-4d4f-a15c-7cd9333d42c1",
"metadata": {},
"outputs": [],
"source": [
"class Data(d2l.DataModule):\n",
"    \"\"\"Synthetic noisy sine-wave series for autoregressive prediction.\n",
"\n",
"    Args:\n",
"        batch_size: Stored as a hyperparameter (presumably consumed by\n",
"            `get_tensorloader` -- confirm against d2l.DataModule).\n",
"        T: Number of time steps in the series.\n",
"        num_train: Number of leading examples that form the training split.\n",
"        tau: Number of past observations that make up one feature row.\n",
"        randn: Scale applied to the standard-normal noise added to the sine.\n",
"    \"\"\"\n",
"    def __init__(self, batch_size=16, T=1000, num_train=600, tau=4, randn=0.2):\n",
"        self.save_hyperparameters()\n",
"        # torch.range is deprecated; arange with an exclusive end of T + 1\n",
"        # yields the same values 1..T.\n",
"        self.time = torch.arange(1, T + 1, dtype=torch.float32)\n",
"        self.x = torch.sin(0.01 * self.time) + torch.randn(T) * randn\n",
"\n",
"    def get_dataloader(self, train):\n",
"        \"\"\"Build the (features, labels) loader for the train or validation split.\n",
"\n",
"        Also stores the full `features` and `labels` tensors on the instance.\n",
"        \"\"\"\n",
"        # Column j is the series shifted by j, so row i holds x[i : i + tau].\n",
"        features = [self.x[i:self.T - self.tau + i] for i in range(self.tau)]\n",
"        self.features = torch.stack(features, 1)\n",
"        # The target of row i is the next observation, x[i + tau].\n",
"        self.labels = self.x[self.tau:].reshape((-1, 1))\n",
"        i = slice(0, self.num_train) if train else slice(self.num_train, None)\n",
"        return self.get_tensorloader([self.features, self.labels], train, i)\n",
" \n",
"class RNNAutoRegression(d2l.LinearRegression):  #@save\n",
"    \"\"\"Autoregressive regressor: a recurrent layer followed by a linear head.\"\"\"\n",
"    def __init__(self, rnn, lr=0.01, tau=4, plot_flag=True, emb_len=8):\n",
"        super().__init__(lr=lr)\n",
"        # Stores every constructor argument (including `rnn`) on the instance.\n",
"        self.save_hyperparameters()\n",
"        self.init_params()\n",
"\n",
"    def init_params(self):\n",
"        \"\"\"Create the output head mapping the recurrent outputs to one value.\"\"\"\n",
"        self.linear = nn.LazyLinear(1)\n",
"\n",
"    def forward(self, X, state=None):\n",
"        \"\"\"Apply the recurrent layer to `X`, then the linear head to its outputs.\"\"\"\n",
"        # The recurrent layer returns (outputs, state); only outputs are used.\n",
"        hiddens = self.rnn(X, state)[0]\n",
"        return self.linear(hiddens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a96368d-c457-444e-ab0e-f0008234ec44",
"metadata": {},
"outputs": [],
"source": [
"# Fit the LSTM regressor on the synthetic series, using windows of `tau` past values.\n",
"tau = 4\n",
"data = Data(tau=tau)\n",
"lstm = LSTM(num_inputs=tau, num_hiddens=8)\n",
"model = RNNAutoRegression(lstm, lr=0.01)\n",
"trainer = d2l.Trainer(max_epochs=5)\n",
"trainer.fit(model, data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c0bd41c-f9e0-4071-86eb-ad6d23ac8a78",
"metadata": {},
"outputs": [],
"source": [
"# Compare one-step-ahead predictions with the labels over the whole series.\n",
"onestep_preds = model(data.features).detach().numpy()\n",
"time_axis = data.time[data.tau:]\n",
"d2l.plot(time_axis, [data.labels, onestep_preds], 'time', 'x',\n",
"         legend=['labels', '1-step preds'], figsize=(6, 3))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:d2l]",
"language": "python",
"name": "conda-env-d2l-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}