{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "<!--NAVIGATION-->\n", "# < [Modules](4-Modules.ipynb) | CNN & LSTM | [Transfer Learning](6-Transfer-Learning.ipynb) >" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Convolutional Neural Networks & Recurrent Neural Networks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Google Colab only!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# execute only if you're using Google Colab\n", "!wget -q https://raw.githubusercontent.com/ahug/amld-pytorch-workshop/master/binder/requirements.txt -O requirements.txt\n", "!pip install -qr requirements.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "____" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "import numpy as np\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torchvision import datasets, transforms\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "from collections import OrderedDict\n", "import colorama" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ipywidgets as widgets\n", "from ipywidgets import interact, interactive, fixed, interact_manual" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's define the LeNet-5 architecture" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner. \"Gradient-based learning applied to document recognition.\" Proceedings of the IEEE, 86(11):2278-2324, November 1998.*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: The *Gaussian connections* in the last layer are euclidean radial basis functions for each class to estimate the lack of fit. In our implementation, we use a cross-entropy loss function as it's common nowadays." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class LeNet5(nn.Module):\n", " def __init__(self):\n", " super(LeNet5, self).__init__()\n", " \n", " self.conv_net = nn.Sequential(OrderedDict([\n", " ('C1', nn.Conv2d(1, 6, kernel_size=(5, 5))),\n", " ('Relu1', nn.ReLU()),\n", " \n", " ('S2', nn.MaxPool2d(kernel_size=(2, 2), stride=2)),\n", " ('C3', nn.Conv2d(6, 16, kernel_size=(5, 5))),\n", " ('Relu3', nn.ReLU()),\n", " \n", " ('S4', nn.MaxPool2d(kernel_size=(2, 2), stride=2)),\n", " ('C5', nn.Conv2d(16, 120, kernel_size=(5, 5))),\n", " ('Relu5', nn.ReLU()),\n", " ]))\n", " \n", " self.fully_connected = nn.Sequential(OrderedDict([\n", " ('F6', nn.Linear(120, 84)),\n", " ('Relu6', nn.ReLU()),\n", " ('F7', nn.Linear(84, 10)),\n", " ('LogSoftmax', nn.LogSoftmax(dim=-1))\n", " ]))\n", " \n", " \n", " def forward(self, imgs):\n", " output = self.conv_net(imgs)\n", " output = output.view(imgs.shape[0], -1) # imgs.shape[0] == batch_size\n", " output = self.fully_connected(output)\n", " return output " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An extensive list of all available layer types can be found on https://pytorch.org/docs/stable/nn.html." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Print a network summary" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "conv_net = LeNet5()\n", "print(conv_net)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Retrieve trainable parameters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "named_params = list(conv_net.named_parameters())\n", "print(\"len(params): %s\\n\" % len(named_params))\n", "for name, param in named_params:\n", " print(\"%s:\\t%s\" % (name, param.shape))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Feed network with a random input" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "input = torch.randn(1, 1, 32, 32) # batch_size, num_channels, height, width\n", "out = conv_net(input)\n", "print(\"Log-Probabilities: \\n%s\\n\" % out)\n", "print(\"Probabilities: \\n%s\\n\" % torch.exp(out))\n", "print(\"out.shape: \\n%s\" % (out.shape,))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How can we now actually train our CNN?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def train_cnn(model, train_loader, test_loader, device, num_epochs=3, lr=0.1, use_scheduler=False):\n", " model.train() # not necessary in our example, but still good practice since modules\n", " # like nn.Dropout, nn.BatchNorm require it\n", " \n", " # define an optimizer\n", " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", " criterion = torch.nn.CrossEntropyLoss()\n", " \n", " if use_scheduler:\n", " scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, 0.85)\n", "\n", " for epoch in range(num_epochs):\n", " print(\"=\"*40, \"Starting epoch %d\" % (epoch + 1), \"=\"*40)\n", " \n", " model.train() # reset to train mode after accuracy computation\n", " \n", " if use_scheduler:\n", " scheduler.step()\n", " \n", " # dataloader returns batches of images for 'data' and a tensor with their respective labels in 'labels'\n", " for batch_idx, (data, labels) in enumerate(train_loader):\n", " data, labels = data.to(device), labels.to(device)\n", "\n", " optimizer.zero_grad()\n", " \n", " output = model(data)\n", " loss = criterion(output, labels)\n", " loss.backward()\n", " optimizer.step()\n", " \n", " \n", " if batch_idx % 40 == 0:\n", " print(\"Batch %d/%d, Loss=%.4f\" % (batch_idx, len(train_loader), loss.item()))\n", "\n", " train_acc = accuracy(model, train_loader, device)\n", " test_acc = accuracy(model, test_loader, device)\n", " print(colorama.Fore.GREEN, \"\\nAccuracy on training: %.2f%%\" % (100*train_acc))\n", " print(\"Accuracy on test: %.2f%%\" % (100*test_acc), colorama.Fore.RESET)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluate model's accuracy on train/test data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def accuracy(model, dataloader, device):\n", " \"\"\" Computes the model's accuracy on the data provided by 'dataloader'\n", " \"\"\"\n", " model.eval()\n", " \n", " num_correct = 0\n", " num_samples = 0\n", " with torch.no_grad(): # deactivates autograd, reduces memory usage and speeds up computations\n", " for data, labels in dataloader:\n", " data, labels = data.to(device), labels.to(device)\n", "\n", " predictions = model(data).max(1)[1] # indices of the maxima along the second dimension\n", " num_correct += (predictions == labels).sum().item()\n", " num_samples += predictions.shape[0]\n", " \n", " return num_correct / num_samples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How to load the training/test data: *dataloaders*" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_data = datasets.MNIST('./data', \n", " train = True, \n", " download = True,\n", " transform = transforms.Compose([\n", " transforms.Resize((32, 32)),\n", " transforms.ToTensor()\n", " ]))\n", "\n", "test_data = datasets.MNIST('./data', \n", " train = False, \n", " download = True,\n", " transform = transforms.Compose([\n", " transforms.Resize((32, 32)),\n", " transforms.ToTensor()\n", " ]))\n", "\n", "train_loader = torch.utils.data.DataLoader(train_data, batch_size=256, shuffle=True)\n", "test_loader = torch.utils.data.DataLoader(test_data, batch_size=1024, shuffle=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's visualize some of the training samples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(16,9))\n", "data, target = next(iter(train_loader))\n", "for i in range(10):\n", " img = data.squeeze(1)[i]\n", " plt.subplot(1, 10, i+1)\n", " plt.imshow(img, cmap=\"gray\", interpolation=\"none\")\n", " plt.xlabel(target[i].item(), fontsize=18)\n", " plt.xticks([])\n", " plt.yticks([]) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Start the training!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "conv_net = conv_net.to(device)\n", "\n", "train_cnn(conv_net, train_loader, test_loader, device, lr=2e-3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's look at some of the model's predictions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def visualize_predictions(model, dataloader, device):\n", " data, labels = next(iter(dataloader))\n", " data, labels = data[:10].to(device), labels[:10]\n", " predictions = model(data).max(1)[1]\n", " \n", " predictions, data = predictions.cpu(), data.cpu()\n", " \n", " plt.figure(figsize=(16,9))\n", " for i in range(10):\n", " img = data.squeeze(1)[i]\n", " plt.subplot(1, 10, i+1)\n", " plt.imshow(img, cmap=\"gray\", interpolation=\"none\")\n", " plt.xlabel(predictions[i].item(), fontsize=18)\n", " plt.xticks([])\n", " plt.yticks([]) \n", " \n", "visualize_predictions(conv_net, test_loader, device)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# We might look at the LSTM example later if we still have some time left" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Long-Short Term Memory (LSTM)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Long short-term memory (LSTM) are units of a recurrent neural network. They were proposed by Hochreiter et al. in 1997. The LSTM was designed to overcome the [vanishing gradient problem](https://en.wikipedia.org/wiki/Vanishing_gradient_problem) which was inherent to most recurrent neural networks in these days. The vanishing gradient problem becomes especially problematic for longer sequences (such as text) where they significantly slow down learning or in the worst case even prevent convergence. \n", "\n", "_Note: They still don't solve the exploding gradient - a commonly used heuristic is to clip the gradients at a certain threshold._" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Illustration by Christopher Olah: http://colah.github.io/posts/2015-08-Understanding-LSTMs/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How to use the torch.nn.LSTM module" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can setup a simple LSTM using the 'torch.nn.LSTM' class" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lstm = torch.nn.LSTM(input_size=10, hidden_size=20, num_layers=2)\n", "dummy_input = torch.randn(5, 3, 10) # (seq_length, batch_size, num_features)\n", "h0 = torch.randn(2, 3, 20)\n", "c0 = torch.randn(2, 3, 20)\n", "output, (hn, cn) = lstm(dummy_input, (h0, c0))\n", "\n", "print(\"output.shape: \\n%s\\n\" % (output.shape,))\n", "print(\"hn.shape: \\n%s\\n\" % (hn.shape,))\n", "print(\"cn.shape: \\n%s\" % (cn.shape,))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`output` contains the hidden states of the last layer for all the timesteps.\n", "_hn_ and _cn_ contain only the hidden/cell state of the last timestep.\n", "Therefore, the last slice of _output_ is actually identical to the hidden state of the last layer." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"output[-1,:,:]: \\n%s\\n\" % output[-1, :, :])\n", "print(\"hn[1:,:,:]: \\n%s\" % hn[1, :, :])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Toy Example - Image classification using an LSTM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can again define our model by subclassing 'nn.Module':" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class ImageLSTM(nn.Module):\n", " def __init__(self, num_features, seq_length, hidden_size, num_layers, num_classes):\n", " super(ImageLSTM, self).__init__() \n", " self.num_features = num_features\n", " self.seq_length = seq_length\n", " self.num_layers = num_layers\n", " self.hidden_size = hidden_size\n", " \n", " self.lstm = nn.LSTM(num_features, hidden_size, num_layers, batch_first=True) \n", " # input.shape = (batch_size, seq_length, num_features)\n", " # if batch_first is 'False' (default) it requires the input to be of\n", " # shape = (seq_len, batch_size, num_features)\n", " self.linear = nn.Linear(hidden_size, num_classes)\n", " \n", " \n", " def forward(self, x):\n", " x = x.squeeze(1).permute(0, 2, 1).view(-1, self.seq_length, self.num_features) # read from left-to-right\n", " \n", " # Set initial hidden and cell states\n", " h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device) \n", " c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)\n", " \n", " # Forward propagate LSTM\n", " out, _ = self.lstm(x, (h0, c0)) # out: tensor of shape (batch_size, seq_length, hidden_size)\n", " # --> output of last layer\n", " \n", " # Decode the hidden state of the last time step\n", " out = self.linear(out[:, -1, :])\n", " return F.log_softmax(out, dim=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "lstm_model = ImageLSTM(num_features=32, seq_length=32, hidden_size=10, num_layers=3, num_classes=10)\n", "lstm_model = lstm_model.to(device)\n", "\n", "train_cnn(lstm_model, train_loader, test_loader, device, use_scheduler=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## See the model in action!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can look at the outputs of the model which gives us probability estimates for each class" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "data, target = next(iter(train_loader)) # get a sample from the dataloader\n", "\n", "output = lstm_model(data.to(device))\n", "output = output.cpu()\n", "\n", "plt.figure(figsize=(15,5))\n", "plt.subplot(1, 2, 1)\n", "plt.imshow(data[0, 0, :, :], cmap=\"gray\", interpolation=\"None\")\n", "plt.subplot(1, 2, 2)\n", "plt.title(\"Predicted probabilities\")\n", "plt.ylim([0, 1])\n", "plt.bar(torch.arange(10), torch.exp(output[0]).data, tick_label=np.arange(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our LSTM reads the image from left-to-right. So how does the prediction change while reading the image? Use the slider below to explore it yourself!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "img = data[0, 0, :, :].view(32, 32)\n", "\n", "probs = []\n", "for ix in range(1, 33):\n", " lstm_model.seq_length = ix\n", " input = img[:, :ix].view(1, 32, -1).contiguous()\n", " output = lstm_model(input.to(device)).view(-1)\n", " probs.append(torch.exp(output[target[0].item()]).item())\n", "\n", " \n", "def draw(width):\n", " img = data[0, :, :, :].clone().view(32, 32) # get first image from batch\n", " plt.figure(figsize=(16,9))\n", " \n", " mask = torch.zeros(32, 32)\n", " mask[:, :width] = 1\n", " \n", " # draw image with mask\n", " plt.subplot(221)\n", " plt.imshow(img, cmap=\"gray\", interpolation=\"none\")\n", " plt.imshow(mask, cmap=\"gray\", alpha=0.6, interpolation=\"none\")\n", "\n", " plt.subplot(222)\n", " plt.title(\"$P(X=%d)$\" % target[0])\n", " plt.ylim([0, 1])\n", " plt.plot(np.arange(1, 33), probs)\n", " plt.plot(width, probs[width-1], 'or') \n", " \n", " \n", " lstm_input = img[:, :width].view(1, 32, -1).contiguous().to(device)\n", " lstm_model.seq_length = width\n", " output = lstm_model(lstm_input).cpu()\n", " plt.subplot(212)\n", " plt.title(\"Predicted probabilities\")\n", " plt.ylim([0, 1])\n", " plt.bar(torch.arange(10), torch.exp(output[0]).data, tick_label=np.arange(10))\n", " \n", "\n", "interactive_plot = interact(draw, width=widgets.IntSlider(min=1, max=31, step=1))\n", "interactive_plot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Don't forget to download the notebook, otherwise your changes will be lost!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "<!--NAVIGATION-->\n", "# < [Modules](4-Modules.ipynb) | CNN & LSTM | [Transfer Learning](6-Transfer-Learning.ipynb) >" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }