{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c9dad41e-e7f7-4c2b-a04a-0ed87e1f4eb0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import sys\n",
"import torch.nn as nn\n",
"import torch\n",
"import warnings\n",
"sys.path.append('/home/jovyan/work/d2l_solutions/notebooks/exercises/d2l_utils/')\n",
"import d2l\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "markdown",
"id": "e15dba16-1429-4cc9-8a55-bc1cbefb997b",
"metadata": {},
"source": [
"# 1. Let’s modernize LeNet. Implement and test the following changes:"
]
},
{
"cell_type": "markdown",
"id": "39f2b521-53c1-4caf-8ce7-71eb68d4e02f",
"metadata": {},
"source": [
"## 1.1 Replace average pooling with max-pooling."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3a8ffbb4-4588-436d-9214-434f022b4f1e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"class MaxPoolingLeNet(d2l.Classifier):\n",
" def __init__(self, lr=0.1, num_classes=10):\n",
" super().__init__()\n",
" self.save_hyperparameters()\n",
" self.net = nn.Sequential(nn.LazyConv2d(6, kernel_size=5, padding=2),\n",
" nn.Sigmoid(),\n",
" nn.MaxPool2d(kernel_size=2, stride=2),\n",
" nn.LazyConv2d(16, kernel_size=5),\n",
" nn.Sigmoid(),\n",
" nn.MaxPool2d(kernel_size=2, stride=2),\n",
" nn.Flatten(),\n",
" nn.LazyLinear(120),\n",
" nn.Sigmoid(),\n",
" nn.LazyLinear(84),\n",
" nn.Sigmoid(),\n",
" nn.LazyLinear(num_classes))\n",
" \n",
"def init_cnn(module):\n",
" if type(module) == nn.Linear or type(module) == nn.Conv2d:\n",
" nn.init.xavier_uniform_(module.weight)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "6b7cf77f-d232-4da6-9a0f-39c076fd3bed",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"acc: 0.32\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data = d2l.FashionMNIST(batch_size=256)\n",
"model = MaxPoolingLeNet(lr=0.1)\n",
"# Dry-run one batch so the Lazy layers materialize, then Xavier-init weights.\n",
"model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)\n",
"trainer = d2l.Trainer(max_epochs=10)\n",
"trainer.fit(model, data)\n",
"# FIX: training inputs go through ToTensor(), which scales uint8 pixels to\n",
"# [0, 1]; the raw val tensor is uint8 in [0, 255]. Rescale by 255 so the\n",
"# evaluation distribution matches training (the unscaled input explains the\n",
"# anomalously low 0.32 accuracy previously reported here).\n",
"y_hat = model(data.val.data.type(torch.float32).unsqueeze(dim=1) / 255)\n",
"print(f'acc: {model.accuracy(y_hat,data.val.targets).item():.2f}')"
]
},
{
"cell_type": "markdown",
"id": "9e9bdaed-0f76-4da2-b78b-52984d477c8b",
"metadata": {},
"source": [
"## 1.2 Replace the softmax layer with ReLU."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "4d58a821-7c34-4cb1-9f54-ecf020496d31",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"class ReLULeNet(d2l.Classifier):\n",
"    \"\"\"LeNet with ReLU activations and max-pooling replacing sigmoid/avg-pool.\"\"\"\n",
"    def __init__(self, lr=0.1, num_classes=10):\n",
"        super().__init__()\n",
"        self.save_hyperparameters()\n",
"        # Two conv blocks, then the classic 120-84-num_classes MLP head.\n",
"        layers = [nn.LazyConv2d(6, kernel_size=5, padding=2), nn.ReLU(),\n",
"                  nn.MaxPool2d(kernel_size=2, stride=2),\n",
"                  nn.LazyConv2d(16, kernel_size=5), nn.ReLU(),\n",
"                  nn.MaxPool2d(kernel_size=2, stride=2),\n",
"                  nn.Flatten()]\n",
"        for width in (120, 84):\n",
"            layers += [nn.LazyLinear(width), nn.ReLU()]\n",
"        layers.append(nn.LazyLinear(num_classes))\n",
"        self.net = nn.Sequential(*layers)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "72c8f740-176b-47eb-9513-6755bf1fd783",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(69.25442786514759, 14.578803978860378)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = ReLULeNet(lr=0.1)\n",
"model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)\n",
"trainer = d2l.Trainer(max_epochs=10)\n",
"trainer.fit(model, data)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "19322473-0144-4cf1-9d5e-4aa8a7ef7e3e",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"acc: 0.80\n"
]
}
],
"source": [
"# FIX: raw val pixels are uint8 in [0, 255]; training inputs went through\n",
"# ToTensor() and lie in [0, 1] — rescale so evaluation matches training.\n",
"y_hat = model(data.val.data.type(torch.float32).unsqueeze(dim=1) / 255)\n",
"print(f'acc: {model.accuracy(y_hat,data.val.targets).item():.2f}')"
]
},
{
"cell_type": "markdown",
"id": "847f12ee-a13a-42e4-ac9b-2c65579bc9a2",
"metadata": {},
"source": [
"# 2. Try to change the size of the LeNet style network to improve its accuracy in addition to max-pooling and ReLU."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "0f0dda4f-68d1-4eaf-9d63-ab1784ab30b8",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"class ParamLeNet(d2l.Classifier):\n",
" def __init__(self, convs, linears, lr=0.1, num_classes=10):\n",
" super().__init__()\n",
" self.save_hyperparameters()\n",
" layers = []\n",
" for conv in convs:\n",
" layers.append(nn.LazyConv2d(conv[0], kernel_size=conv[1],\n",
" padding=conv[2]))\n",
" layers.append(nn.ReLU())\n",
" layers.append(nn.MaxPool2d(kernel_size=2, stride=2))\n",
" layers.append(nn.Flatten())\n",
" for linear in linears:\n",
" layers.append(nn.LazyLinear(linear))\n",
" layers.append(nn.ReLU())\n",
" layers.append(nn.LazyLinear(num_classes))\n",
" self.net = nn.Sequential(*layers)"
]
},
{
"cell_type": "markdown",
"id": "7aa9af57-da58-4e43-9329-ab9a10a3383e",
"metadata": {},
"source": [
"## 2.1 Adjust the convolution window size."
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ab087100-694d-44a4-9662-28afe8476735",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Sweep conv window sizes: 11x11, 5x5, 3x3 (padding keeps the first conv of\n",
"# each pair roughly size-preserving).\n",
"convs_list = [[[6,11,5],[16,11,0]],[[6,5,2],[16,5,0]],[[6,3,1],[16,3,0]]]\n",
"acc_list = []\n",
"for convs in convs_list:\n",
"    hparams = {'convs':convs, 'linears':[120,84]}\n",
"    model = ParamLeNet(**hparams)\n",
"    model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)\n",
"    trainer = d2l.Trainer(max_epochs=10)\n",
"    trainer.fit(model, data)\n",
"    # FIX: rescale raw uint8 pixels to [0, 1] to match ToTensor() training inputs.\n",
"    y_hat = model(data.val.data.type(torch.float32).unsqueeze(dim=1) / 255)\n",
"    acc_list.append(model.accuracy(y_hat,data.val.targets).item())"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "20264100-62de-4382-b4e2-c582a923e1f1",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"d2l.plot(list(range(len(acc_list))),acc_list,'conv window','acc')"
]
},
{
"cell_type": "markdown",
"id": "40730c0c-b543-4062-88ee-48b92578768b",
"metadata": {},
"source": [
"## 2.2 Adjust the number of output channels."
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "044df167-5de6-4589-a1a8-41a66bcc4f51",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Sweep output channel counts: (16,32), (6,16) baseline, (2,8).\n",
"convs_list = [[[16,5,2],[32,5,0]],[[6,5,2],[16,5,0]],[[2,5,2],[8,5,0]]]\n",
"acc_list = []\n",
"for convs in convs_list:\n",
"    hparams = {'convs':convs, 'linears':[120,84]}\n",
"    model = ParamLeNet(**hparams)\n",
"    model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)\n",
"    trainer = d2l.Trainer(max_epochs=10)\n",
"    trainer.fit(model, data)\n",
"    # FIX: rescale raw uint8 pixels to [0, 1] to match ToTensor() training inputs.\n",
"    y_hat = model(data.val.data.type(torch.float32).unsqueeze(dim=1) / 255)\n",
"    acc_list.append(model.accuracy(y_hat,data.val.targets).item())"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "6e35f830-173e-4364-8e4e-fa5c5ec7faee",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"d2l.plot(list(range(len(acc_list))),acc_list,'channels','acc')"
]
},
{
"cell_type": "markdown",
"id": "c75292ab-a053-4b3c-9107-ee81204e2672",
"metadata": {},
"source": [
"## 2.3 Adjust the number of convolution layers."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f08d6057-40ea-4f8f-ad2f-025128f16c5c",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Sweep conv depth: three blocks, two blocks (baseline), one block.\n",
"convs_list = [[[6,5,2],[16,5,2],[32,5,0]],[[6,5,2],[16,5,0]],[[64,5,0]]]\n",
"acc_list = []\n",
"for convs in convs_list:\n",
"    hparams = {'convs':convs, 'linears':[120,84]}\n",
"    model = ParamLeNet(**hparams)\n",
"    model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)\n",
"    trainer = d2l.Trainer(max_epochs=10)\n",
"    trainer.fit(model, data)\n",
"    # FIX: rescale raw uint8 pixels to [0, 1] to match ToTensor() training inputs.\n",
"    y_hat = model(data.val.data.type(torch.float32).unsqueeze(dim=1) / 255)\n",
"    acc_list.append(model.accuracy(y_hat,data.val.targets).item())"
]
},
{
"cell_type": "markdown",
"id": "72f35a95-2600-4eb8-9b96-4c7b532ca2da",
"metadata": {},
"source": [
"## 2.4 Adjust the number of fully connected layers."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "049cf21c-ed01-4388-ab97-e1c9dc38fb37",
"metadata": {},
"outputs": [],
"source": [
"# Sweep fully connected head: 5 layers, 2 wide, 2 classic (baseline), 2 narrow.\n",
"linears_list = [[256,128,64,32,16],[256,128],[120,84],[64,32]]\n",
"acc_list = []\n",
"for linears in linears_list:\n",
"    hparams = {'convs':[[6,5,2],[16,5,0]], 'linears':linears}\n",
"    model = ParamLeNet(**hparams)\n",
"    model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)\n",
"    trainer = d2l.Trainer(max_epochs=10)\n",
"    trainer.fit(model, data)\n",
"    # FIX: rescale raw uint8 pixels to [0, 1] to match ToTensor() training inputs.\n",
"    y_hat = model(data.val.data.type(torch.float32).unsqueeze(dim=1) / 255)\n",
"    acc_list.append(model.accuracy(y_hat,data.val.targets).item())"
]
},
{
"cell_type": "markdown",
"id": "7583e801-6811-43ce-85f5-ba296bf86b48",
"metadata": {},
"source": [
"## 2.5 Adjust the learning rates and other training details (e.g., initialization and number of epochs)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12be3085-5177-41f3-b4cf-7db6f7846736",
"metadata": {},
"outputs": [],
"source": [
"# Sweep learning rates on the baseline architecture.\n",
"lr_list = [0.001,0.003,0.01,0.03,0.1,0.3]\n",
"acc_list = []\n",
"for lr in lr_list:\n",
"    hparams = {'convs':[[6,5,2],[16,5,0]], 'linears':[120,84],'lr':lr}\n",
"    model = ParamLeNet(**hparams)\n",
"    model.apply_init([next(iter(data.get_dataloader(True)))[0]], init_cnn)\n",
"    trainer = d2l.Trainer(max_epochs=10)\n",
"    trainer.fit(model, data)\n",
"    # FIX: rescale raw uint8 pixels to [0, 1] to match ToTensor() training inputs.\n",
"    y_hat = model(data.val.data.type(torch.float32).unsqueeze(dim=1) / 255)\n",
"    acc_list.append(model.accuracy(y_hat,data.val.targets).item())"
]
},
{
"cell_type": "markdown",
"id": "984e0140-0731-4b66-914f-d22107cc8e43",
"metadata": {},
"source": [
"# 3. Try out the improved network on the original MNIST dataset."
]
},
{
"cell_type": "markdown",
"id": "78893ff7-32dc-432f-8a38-9f7c405b09e0",
"metadata": {},
"source": [
"# 4. Display the activations of the first and second layer of LeNet for different inputs (e.g., sweaters and coats)."
]
},
{
"cell_type": "markdown",
"id": "ed94585d-fc96-4842-a99a-658a2699bd47",
"metadata": {},
"source": [
"# 5. What happens to the activations when you feed significantly different images into the network (e.g., cats, cars, or even random noise)?"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:d2l]",
"language": "python",
"name": "conda-env-d2l-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}