{ "cells": [ { "cell_type": "markdown", "id": "07abee6f", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "# Convolutions for Images\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "07444b49", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:01.908966Z", "iopub.status.busy": "2023-08-18T19:41:01.908168Z", "iopub.status.idle": "2023-08-18T19:41:05.382412Z", "shell.execute_reply": "2023-08-18T19:41:05.381355Z" }, "origin_pos": 3, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "import torch\n", "from torch import nn\n", "from d2l import torch as d2l" ] }, { "cell_type": "markdown", "id": "cd1878b1", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "Cross-correlation operation" ] }, { "cell_type": "code", "execution_count": 2, "id": "5e550b4f", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.386933Z", "iopub.status.busy": "2023-08-18T19:41:05.386003Z", "iopub.status.idle": "2023-08-18T19:41:05.393401Z", "shell.execute_reply": "2023-08-18T19:41:05.392355Z" }, "origin_pos": 8, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "def corr2d(X, K):\n", "    \"\"\"Compute the 2D cross-correlation of input `X` with kernel `K`.\n", "\n", "    `K` is slid over every position where it fits entirely inside `X`;\n", "    each output element is the sum of the elementwise product of `K`\n", "    and the window of `X` beneath it.\n", "\n", "    Args:\n", "        X: 2D input tensor of shape (rows, cols).\n", "        K: 2D kernel tensor of shape (h, w).\n", "\n", "    Returns:\n", "        Tensor of shape (rows - h + 1, cols - w + 1), allocated on the\n", "        same device as `X`.\n", "    \"\"\"\n", "    h, w = K.shape\n", "    out_shape = (X.shape[0] - h + 1, X.shape[1] - w + 1)\n", "    # Allocate on X's device so the result can be combined with\n", "    # parameters that live on the same device as the input.\n", "    Y = torch.zeros(out_shape, device=X.device)\n", "    for i in range(Y.shape[0]):\n", "        for j in range(Y.shape[1]):\n", "            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()\n", "    return Y" ] }, { "cell_type": "markdown", "id": "2613d6a9", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Validate the output of the above implementation" ] }, { "cell_type": "code", "execution_count": 3, "id": "7845059c", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.397828Z", "iopub.status.busy": "2023-08-18T19:41:05.397122Z", "iopub.status.idle": "2023-08-18T19:41:05.427898Z", "shell.execute_reply": "2023-08-18T19:41:05.426544Z" }, "origin_pos": 12, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "tensor([[19., 
25.],\n", " [37., 43.]])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])\n", "K = torch.tensor([[0.0, 1.0], [2.0, 3.0]])\n", "corr2d(X, K)" ] }, { "cell_type": "markdown", "id": "00c12716", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Implement a two-dimensional convolutional layer" ] }, { "cell_type": "code", "execution_count": 4, "id": "74d09a7c", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.432542Z", "iopub.status.busy": "2023-08-18T19:41:05.431776Z", "iopub.status.idle": "2023-08-18T19:41:05.444669Z", "shell.execute_reply": "2023-08-18T19:41:05.443731Z" }, "origin_pos": 15, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "class Conv2D(nn.Module):\n", "    \"\"\"Cross-correlation layer with a learnable kernel and scalar bias.\"\"\"\n", "\n", "    def __init__(self, kernel_size):\n", "        \"\"\"Create the layer.\n", "\n", "        Args:\n", "            kernel_size: Shape of the kernel, passed to `torch.rand`.\n", "        \"\"\"\n", "        super().__init__()\n", "        # Kernel entries start as uniform samples from [0, 1).\n", "        self.weight = nn.Parameter(torch.rand(kernel_size))\n", "        self.bias = nn.Parameter(torch.zeros(1))\n", "\n", "    def forward(self, x):\n", "        \"\"\"Cross-correlate `x` with the kernel, then add the bias.\"\"\"\n", "        return corr2d(x, self.weight) + self.bias" ] }, { "cell_type": "markdown", "id": "24c71c37", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "A simple application of a convolutional layer:\n", "detecting the edge of an object in an image" ] }, { "cell_type": "code", "execution_count": 5, "id": "3b5cdda0", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.448555Z", "iopub.status.busy": "2023-08-18T19:41:05.447775Z", "iopub.status.idle": "2023-08-18T19:41:05.456977Z", "shell.execute_reply": "2023-08-18T19:41:05.455824Z" }, "origin_pos": 19, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "tensor([[1., 1., 0., 0., 0., 0., 1., 1.],\n", " [1., 1., 0., 0., 0., 0., 1., 1.],\n", " [1., 1., 0., 0., 0., 0., 1., 1.],\n", " [1., 1., 0., 0., 0., 0., 1., 1.],\n", " [1., 1., 0., 0., 0., 0., 1., 1.],\n", " [1., 1., 0., 0., 0., 0., 1., 1.]])" ] }, "execution_count": 5, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "X = torch.ones((6, 8))\n", "X[:, 2:6] = 0\n", "X" ] }, { "cell_type": "code", "execution_count": 6, "id": "c64588b6", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.463518Z", "iopub.status.busy": "2023-08-18T19:41:05.462855Z", "iopub.status.idle": "2023-08-18T19:41:05.467300Z", "shell.execute_reply": "2023-08-18T19:41:05.466485Z" }, "origin_pos": 23, "tab": [ "pytorch" ] }, "outputs": [], "source": [ "K = torch.tensor([[1.0, -1.0]])" ] }, { "cell_type": "markdown", "id": "4e1d625a", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "We detect $1$ for the edge from white to black\n", "and $-1$ for the edge from black to white" ] }, { "cell_type": "code", "execution_count": 7, "id": "7287547f", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.472001Z", "iopub.status.busy": "2023-08-18T19:41:05.471414Z", "iopub.status.idle": "2023-08-18T19:41:05.478644Z", "shell.execute_reply": "2023-08-18T19:41:05.477751Z" }, "origin_pos": 25, "tab": [ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "tensor([[ 0., 1., 0., 0., 0., -1., 0.],\n", " [ 0., 1., 0., 0., 0., -1., 0.],\n", " [ 0., 1., 0., 0., 0., -1., 0.],\n", " [ 0., 1., 0., 0., 0., -1., 0.],\n", " [ 0., 1., 0., 0., 0., -1., 0.],\n", " [ 0., 1., 0., 0., 0., -1., 0.]])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Y = corr2d(X, K)\n", "Y" ] }, { "cell_type": "markdown", "id": "a6e055bb", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "The kernel `K` only detects vertical edges" ] }, { "cell_type": "code", "execution_count": 8, "id": "c5803e8e", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.482194Z", "iopub.status.busy": "2023-08-18T19:41:05.481611Z", "iopub.status.idle": "2023-08-18T19:41:05.489355Z", "shell.execute_reply": "2023-08-18T19:41:05.488493Z" }, "origin_pos": 27, "tab": [ "pytorch" ] }, "outputs": [ { "data": { 
"text/plain": [ "tensor([[0., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 0.],\n", " [0., 0., 0., 0., 0.]])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corr2d(X.t(), K)" ] }, { "cell_type": "markdown", "id": "0b90666d", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Learn the kernel that generated `Y` from `X`" ] }, { "cell_type": "code", "execution_count": 9, "id": "cc5935f0", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.493184Z", "iopub.status.busy": "2023-08-18T19:41:05.492373Z", "iopub.status.idle": "2023-08-18T19:41:05.588165Z", "shell.execute_reply": "2023-08-18T19:41:05.586875Z" }, "origin_pos": 30, "tab": [ "pytorch" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch 2, loss 16.481\n", "epoch 4, loss 5.069\n", "epoch 6, loss 1.794\n", "epoch 8, loss 0.688\n", "epoch 10, loss 0.274\n" ] } ], "source": [ "conv2d = nn.LazyConv2d(1, kernel_size=(1, 2), bias=False)\n", "\n", "X = X.reshape((1, 1, 6, 8))\n", "Y = Y.reshape((1, 1, 6, 7))\n", "lr = 3e-2\n", "\n", "for i in range(10):\n", " Y_hat = conv2d(X)\n", " l = (Y_hat - Y) ** 2\n", " conv2d.zero_grad()\n", " l.sum().backward()\n", " conv2d.weight.data[:] -= lr * conv2d.weight.grad\n", " if (i + 1) % 2 == 0:\n", " print(f'epoch {i + 1}, loss {l.sum():.3f}')" ] }, { "cell_type": "markdown", "id": "16459fcb", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Take a look at the kernel tensor we learned" ] }, { "cell_type": "code", "execution_count": 10, "id": "4ab76f3e", "metadata": { "execution": { "iopub.execute_input": "2023-08-18T19:41:05.593720Z", "iopub.status.busy": "2023-08-18T19:41:05.592926Z", "iopub.status.idle": "2023-08-18T19:41:05.601680Z", "shell.execute_reply": "2023-08-18T19:41:05.600494Z" }, "origin_pos": 35, "tab": 
[ "pytorch" ] }, "outputs": [ { "data": { "text/plain": [ "tensor([[ 1.0398, -0.9328]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# detach() gives a gradient-free view of the learned kernel for display.\n", "conv2d.weight.detach().reshape((1, 2))" ] } ], "metadata": { "celltoolbar": "Slideshow", "language_info": { "name": "python" }, "required_libs": [], "rise": { "autolaunch": true, "enable_chalkboard": true, "overlay": "
", "scroll": true } }, "nbformat": 4, "nbformat_minor": 5 }