{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Convolutions\n", "\n", "Import packages for:\n", "1. building a CNN from scratch;\n", "2. using built-in architectures." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:12:43.185492Z", "start_time": "2019-07-03T22:12:41.569269Z" } }, "outputs": [], "source": [ "from mxnet import np, npx\n", "from mxnet.gluon import nn\n", "npx.set_np()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "The 2D cross-correlation operator:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def corr2d(X, K):\n", " h, w = K.shape\n", " Y = np.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))\n", " for i in range(Y.shape[0]):\n", " for j in range(Y.shape[1]):\n", " Y[i, j] = (X[i: i + h, j: j + w] * K).sum()\n", " return Y" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For example, a two-dimensional cross-correlation operation. 
The shaded portions are the first output element and the input and kernel array elements used in its computation: \n", "\n", "\\begin{align*}\n", "0\\times0+1\\times1+3\\times2+4\\times3=19,\\\\\n", "1\\times0+2\\times1+4\\times2+5\\times3=25,\\\\\n", "3\\times0+4\\times1+6\\times2+7\\times3=37,\\\\\n", "4\\times0+5\\times1+7\\times2+8\\times3=43.\n", "\\end{align*}\n", "\n", "\n", "![A simple cross-correlation example.](https://d2l.ai/_images/correlation.svg)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:12:43.312247Z", "start_time": "2019-07-03T22:12:43.188173Z" }, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "array([[19., 25.],\n", " [37., 43.]])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])\n", "K = np.array([[0, 1], [2, 3]])\n", "corr2d(X, K)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![A simple cross-correlation example.](https://d2l.ai/_images/correlation.svg)\n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "The convolutional layers\n", "\n", "$\\mathbf Y = \\mathbf X \\star \\mathbf W + b$" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:12:43.320777Z", "start_time": "2019-07-03T22:12:43.314285Z" }, "attributes": { "classes": [], "id": "", "n": "70" } }, "outputs": [], "source": [ "class Conv2D(nn.Block):\n", " def __init__(self, kernel_size, **kwargs):\n", " super(Conv2D, self).__init__(**kwargs)\n", " self.weight = self.params.get('weight', shape=kernel_size)\n", " self.bias = self.params.get('bias', shape=(1,))\n", "\n", " def forward(self, x):\n", " return corr2d(x, self.weight.data()) + self.bias.data()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Check the 
output from the convolution layers. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:12:43.343747Z", "start_time": "2019-07-03T22:12:43.325742Z" } }, "outputs": [], "source": [ "def comp_conv2d(conv2d, X):\n", " conv2d.initialize()\n", " # Add batch and channel dimension.\n", " X = X.reshape((1, 1) + X.shape)\n", " Y = conv2d(X)\n", " # Exclude the first two dimensions\n", " return Y.reshape(Y.shape[2:])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "Padding & Stride" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:12:43.364745Z", "start_time": "2019-07-03T22:12:43.345529Z" } }, "outputs": [ { "data": { "text/plain": [ "(4, 4)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = np.random.uniform(size=(8, 8))\n", "conv2d = nn.Conv2D(channels=1, kernel_size=3, padding=1, strides=2)\n", "comp_conv2d(conv2d, X).shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here $p_h$ and $p_w$ denote the total number of padded rows and columns; `padding=1` pads one row/column on each side, so $p_h = p_w = 2$.\n", "\n", "\\begin{align}\n", "\\text{ Output shape} & = \\lfloor(n_h-k_h+p_h+s_h)/s_h\\rfloor \\times \\lfloor(n_w-k_w+p_w+s_w)/s_w\\rfloor \\\\\n", " & = \\lfloor(8 - 3 + 2 + 2) / 2\\rfloor \\times \\lfloor(8 - 3 + 2 + 2) / 2\\rfloor \\\\\n", " & = \\lfloor 4.5 \\rfloor \\times \\lfloor 4.5 \\rfloor \\\\\n", " & = (4, 4)\n", "\\end{align}" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "A slightly more complicated example." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-07-03T22:12:43.382376Z", "start_time": "2019-07-03T22:12:43.368194Z" } }, "outputs": [ { "data": { "text/plain": [ "(2, 2)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = np.random.uniform(size=(8, 8))\n", "conv2d = nn.Conv2D(1, kernel_size=(3, 5), padding=(0, 1), strides=(3, 4))\n", "comp_conv2d(conv2d, X).shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here $p_h$ and $p_w$ denote the total number of padded rows and columns; `padding=(0, 1)` pads no rows and one column on each side, so $p_h = 0$ and $p_w = 2$.\n", "\n", "\\begin{align}\n", "\\text{ Output shape} & = \\lfloor(n_h-k_h+p_h+s_h)/s_h\\rfloor \\times \\lfloor(n_w-k_w+p_w+s_w)/s_w\\rfloor \\\\\n", " & = \\lfloor(8 - 3 + 0 + 3)/3\\rfloor \\times \\lfloor(8 - 5 + 2 + 4)/4\\rfloor \\\\\n", " & = \\lfloor 8/3 \\rfloor \\times \\lfloor 9/4 \\rfloor \\\\\n", " & = (2, 2)\n", "\\end{align}" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Pooling" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A 2D pooling operator" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[4., 5.],\n", " [7., 8.]])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def pool2d(X, pool_size, mode='max'):\n", " p_h, p_w = pool_size\n", " Y = np.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))\n", " for i in range(Y.shape[0]):\n", " for j in range(Y.shape[1]):\n", " if mode == 'max':\n", " Y[i, j] = np.max(X[i: i + p_h, j: j + p_w])\n", " elif mode == 'avg':\n", " Y[i, j] = X[i: i + p_h, j: j + p_w].mean()\n", " return Y\n", "\n", "X = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])\n", "pool2d(X, (2, 2))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Pooling with Padding and Stride " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[[[ 0. 1. 2. 3.]\n", " [ 4. 5. 6. 7.]\n", " [ 8. 9. 10. 
11.]\n", " [12. 13. 14. 15.]]]]\n" ] }, { "data": { "text/plain": [ "array([[[[ 5., 7.],\n", " [13., 15.]]]])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = np.arange(16).reshape((1, 1, 4, 4))\n", "print(X)\n", "pool2d = nn.MaxPool2D(pool_size=3, padding=1, strides=2)\n", "pool2d(X)\n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Multiple channels pooling" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[[[ 0. 1. 2. 3.]\n", " [ 4. 5. 6. 7.]\n", " [ 8. 9. 10. 11.]\n", " [12. 13. 14. 15.]]\n", "\n", " [[ 1. 2. 3. 4.]\n", " [ 5. 6. 7. 8.]\n", " [ 9. 10. 11. 12.]\n", " [13. 14. 15. 16.]]]]\n", "shape : (1, 2, 4, 4)\n" ] }, { "data": { "text/plain": [ "array([[[[ 5., 7.],\n", " [13., 15.]],\n", "\n", " [[ 6., 8.],\n", " [14., 16.]]]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = np.concatenate((X, X + 1), axis=1)\n", "print(X)\n", "print(\"Input shape :\", X.shape)\n", "\n", "pool2d = nn.MaxPool2D(pool_size=3, padding=1, strides=2)\n", "pool2d(X)" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "base_numbering": 1, "nav_menu": 
{}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }