{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "# Configuration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "<!---\n",
    "Latex Macros\n",
    "-->\n",
    "$$\n",
    "\\newcommand{\\Xs}{\\mathcal{X}}\n",
    "\\newcommand{\\Ys}{\\mathcal{Y}}\n",
    "\\newcommand{\\y}{\\mathbf{y}}\n",
    "\\newcommand{\\repr}{\\mathbf{f}}\n",
    "\\newcommand{\\repry}{\\mathbf{g}}\n",
    "\\newcommand{\\x}{\\mathbf{x}}\n",
    "\\newcommand{\\vocab}{V}\n",
    "\\newcommand{\\params}{\\boldsymbol{\\theta}}\n",
    "\\newcommand{\\param}{\\theta}\n",
    "\\DeclareMathOperator{\\perplexity}{PP}\n",
    "\\DeclareMathOperator{\\argmax}{argmax}\n",
    "\\DeclareMathOperator{\\argmin}{argmin}\n",
    "\\newcommand{\\train}{\\mathcal{D}}\n",
    "\\newcommand{\\counts}[2]{\\#_{#1}(#2) }\n",
    "\\newcommand{\\indi}{\\mathbb{I}}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "# Deck-wide setup; %%capture swallows anything this cell would display.\n",
    "%load_ext autoreload\n",
    "%load_ext tikzmagic\n",
    "%autoreload 2\n",
    "import sys\n",
    "sys.path.append(\"..\")  # make the parent directory importable\n",
    "import numpy as np\n",
    "\n",
    "# reveal (RISE) configuration, stored under the 'livereveal' section\n",
    "from notebook.services.config import ConfigManager\n",
    "cm = ConfigManager()\n",
    "cm.update('livereveal', {\n",
    "    'theme': 'white',\n",
    "    'transition': 'none',\n",
    "    # Real booleans: the strings 'false'/'true' are both non-empty and\n",
    "    # therefore truthy in JavaScript, so 'false' would not switch controls off.\n",
    "    'controls': False,\n",
    "    'progress': True,\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "hide_input": false,
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>\n",
       ".red { color: #E41A1C; }\n",
       ".orange { color: #FF7F00 }\n",
       ".yellow { color: #FFC020 }         \n",
       ".green { color: #4DAF4A }                  \n",
       ".blue { color: #377EB8; }\n",
       ".purple { color: #984EA3 }       \n",
       "       \n",
       "h1 {\n",
       "    color: #377EB8;\n",
       "}\n",
       "       \n",
       "ctb_global_show div.ctb_hideshow.ctb_show {\n",
       "    display: inline;\n",
       "} \n",
       "         \n",
       "div.tabContent {\n",
       "    padding: 0px;\n",
       "    background: #ffffff;     \n",
       "    border: 0px;                        \n",
       "}  \n",
       "         \n",
       ".left {\n",
       "    float: left;\n",
       "    width: 50%;\n",
       "    vertical-align: text-top;\n",
       "}\n",
       "\n",
       ".right {\n",
       "    margin-left: 50%;\n",
       "    vertical-align: text-top;                            \n",
       "}    \n",
       "               \n",
       ".small {         \n",
       "    zoom: 0.9;\n",
       "    -ms-zoom: 0.9;\n",
       "    -webkit-zoom: 0.9;\n",
       "    -moz-transform:  scale(0.9,0.9);\n",
       "    -moz-transform-origin: left center;  \n",
       "}          \n",
       "         \n",
       ".verysmall {         \n",
       "    zoom: 0.75;\n",
       "    -ms-zoom: 0.75;\n",
       "    -webkit-zoom: 0.75;\n",
       "    -moz-transform:  scale(0.75,0.75);\n",
       "    -moz-transform-origin: left center;  \n",
       "}         \n",
       "   \n",
       "        \n",
       ".tiny {         \n",
       "    zoom: 0.6;\n",
       "    -ms-zoom: 0.6;\n",
       "    -webkit-zoom: 0.6;\n",
       "    -moz-transform:  scale(0.6,0.6);\n",
       "    -moz-transform-origin: left center;  \n",
       "}         \n",
       "         \n",
       "         \n",
       ".rendered_html blockquote {\n",
       "    border-left-width: 0px;\n",
       "    padding: 15px;\n",
       "    margin: 0px;    \n",
       "    width: 100%;                            \n",
       "}         \n",
       "         \n",
       ".rendered_html th {\n",
       "    padding: 0.5em;  \n",
       "    border: 0px;                            \n",
       "}         \n",
       "         \n",
       ".rendered_html td {\n",
       "    padding: 0.25em;\n",
       "    border: 0px;                                                        \n",
       "}    \n",
       "     \n",
       "#for reveal         \n",
       ".aside .controls, .reveal .controls {\n",
       "    display: none !important;                            \n",
       "    width: 0px !important;\n",
       "    height: 0px !important;\n",
       "}\n",
       "    \n",
       ".rise-enabled .reveal .slide-number {\n",
       "    right: 25px;\n",
       "    bottom: 25px;                        \n",
       "    font-size: 200%;     \n",
       "    color: #377EB8;                        \n",
       "}         \n",
       "         \n",
       ".rise-enabled .reveal .progress span {\n",
       "    background: #377EB8;\n",
       "}     \n",
       "         \n",
       ".present .top {\n",
       "    position: fixed !important;\n",
       "    top: 0 !important;                                   \n",
       "}                  \n",
       "    \n",
       ".present .rendered_html * + p, .present .rendered_html p, .present .rendered_html * + br, .present .rendered_html br {\n",
       "    margin: 0.5em 0;                            \n",
       "}  \n",
       "         \n",
       ".present tr, .present td {\n",
       "    border: 0px;\n",
       "    padding: 0.35em;                            \n",
       "}      \n",
       "         \n",
       ".present th {\n",
       "    border: 1px;\n",
       "}\n",
       "         \n",
       "present .prompt {\n",
       "    min-width: 0px !important;\n",
       "    transition-duration: 0s !important;\n",
       "}     \n",
       "         \n",
       ".prompt {\n",
       "    min-width: 0px !important;\n",
       "    transition-duration: 0s !important;                            \n",
       "}         \n",
       "         \n",
       ".rise-enabled .cell li {\n",
       "    line-height: 135%;\n",
       "}\n",
       "         \n",
       "</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%html\n",
    "<style>\n",
    "/* Colour helper classes applied to span elements in the slides */\n",
    ".red { color: #E41A1C; }\n",
    ".orange { color: #FF7F00; }\n",
    ".yellow { color: #FFC020; }\n",
    ".green { color: #4DAF4A; }\n",
    ".blue { color: #377EB8; }\n",
    ".purple { color: #984EA3; }\n",
    "\n",
    "h1 {\n",
    "    color: #377EB8;\n",
    "}\n",
    "\n",
    "/* NOTE(review): leading dot added. Without it the name is an element selector\n",
    "   and the rule never matches. Confirm that ctb_global_show is a class. */\n",
    ".ctb_global_show div.ctb_hideshow.ctb_show {\n",
    "    display: inline;\n",
    "}\n",
    "\n",
    "div.tabContent {\n",
    "    padding: 0px;\n",
    "    background: #ffffff;\n",
    "    border: 0px;\n",
    "}\n",
    "\n",
    "/* Two-column layout: .left floats, .right fills the other half */\n",
    ".left {\n",
    "    float: left;\n",
    "    width: 50%;\n",
    "    vertical-align: text-top;\n",
    "}\n",
    "\n",
    ".right {\n",
    "    margin-left: 50%;\n",
    "    vertical-align: text-top;\n",
    "}\n",
    "\n",
    "/* Scaled-down content: zoom plus a -moz-transform variant */\n",
    ".small {\n",
    "    zoom: 0.9;\n",
    "    -ms-zoom: 0.9;\n",
    "    -webkit-zoom: 0.9;\n",
    "    -moz-transform: scale(0.9,0.9);\n",
    "    -moz-transform-origin: left center;\n",
    "}\n",
    "\n",
    ".verysmall {\n",
    "    zoom: 0.75;\n",
    "    -ms-zoom: 0.75;\n",
    "    -webkit-zoom: 0.75;\n",
    "    -moz-transform: scale(0.75,0.75);\n",
    "    -moz-transform-origin: left center;\n",
    "}\n",
    "\n",
    ".tiny {\n",
    "    zoom: 0.6;\n",
    "    -ms-zoom: 0.6;\n",
    "    -webkit-zoom: 0.6;\n",
    "    -moz-transform: scale(0.6,0.6);\n",
    "    -moz-transform-origin: left center;\n",
    "}\n",
    "\n",
    "/* Borderless quotes and tables in rendered markdown */\n",
    ".rendered_html blockquote {\n",
    "    border-left-width: 0px;\n",
    "    padding: 15px;\n",
    "    margin: 0px;\n",
    "    width: 100%;\n",
    "}\n",
    "\n",
    ".rendered_html th {\n",
    "    padding: 0.5em;\n",
    "    border: 0px;\n",
    "}\n",
    "\n",
    ".rendered_html td {\n",
    "    padding: 0.25em;\n",
    "    border: 0px;\n",
    "}\n",
    "\n",
    "/* for reveal: hide the navigation controls.\n",
    "   This used to be written as '#for reveal', which is not a CSS comment and\n",
    "   was parsed as part of the first selector below. */\n",
    ".aside .controls, .reveal .controls {\n",
    "    display: none !important;\n",
    "    width: 0px !important;\n",
    "    height: 0px !important;\n",
    "}\n",
    "\n",
    ".rise-enabled .reveal .slide-number {\n",
    "    right: 25px;\n",
    "    bottom: 25px;\n",
    "    font-size: 200%;\n",
    "    color: #377EB8;\n",
    "}\n",
    "\n",
    ".rise-enabled .reveal .progress span {\n",
    "    background: #377EB8;\n",
    "}\n",
    "\n",
    ".present .top {\n",
    "    position: fixed !important;\n",
    "    top: 0 !important;\n",
    "}\n",
    "\n",
    ".present .rendered_html * + p, .present .rendered_html p, .present .rendered_html * + br, .present .rendered_html br {\n",
    "    margin: 0.5em 0;\n",
    "}\n",
    "\n",
    ".present tr, .present td {\n",
    "    border: 0px;\n",
    "    padding: 0.35em;\n",
    "}\n",
    "\n",
    ".present th {\n",
    "    border: 1px;\n",
    "}\n",
    "\n",
    "/* leading dot added: 'present' alone is an element selector */\n",
    ".present .prompt {\n",
    "    min-width: 0px !important;\n",
    "    transition-duration: 0s !important;\n",
    "}\n",
    "\n",
    ".prompt {\n",
    "    min-width: 0px !important;\n",
    "    transition-duration: 0s !important;\n",
    "}\n",
    "\n",
    ".rise-enabled .cell li {\n",
    "    line-height: 135%;\n",
    "}\n",
    "\n",
    "</style>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<center><h1>An Introduction to Deep Learning for Natural Language Processing</h1></center><br>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# A Subjective History of Deep Learning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Disclaimer\n",
    "- The field of Deep Learning is young but fast-changing and diverse due to very active research\n",
    "- I can only give you a small overview on Deep Learning\n",
    "- I won't talk about vision, convolutional networks etc.\n",
    "- Many things that I explain today will be outdated next year/month"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<center><h2>A More or Less Objective View</h2></center>\n",
    "<br><br><br>\n",
    "<img  src=\"../img/schmidthuber.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<center><h2>A Personal View</h2></center>\n",
    "<br>\n",
    "<span class=red>Feature Engineering</span>, Classification, Support Vector Machines\n",
    "\n",
    "<img  src=\"../img/personal_2011.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/features.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/margin.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<center><h2>Machine Learning</h2></center>\n",
    "<img  src=\"../img/ml.png\" width=1000/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "Graphical Models, Structured Prediction, Probabilistic Inference, <span class=red>Feature Engineering</span>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "<img  src=\"../img/sequence.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "<img  src=\"../img/factor.png\" width=700/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "<img  src=\"../img/feature_classes.png\" width=700/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "Relation Extraction, Matrix Factorization, <span class=green>Representation Learning</span>\n",
    "<img  src=\"../img/factorisation.png\" width=800/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "<span class=green>Representation Learning</span>, **<span class=blue>Deep Learning</span>**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": false,
    "scrolled": false,
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "<img src=\"../img/lstm.svg\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Deep Learning in a Nutshell"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img  src=\"../img/alexnet.png\" width=900/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img  src=\"../img/filters.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## The Success Story of Deep Learning\n",
    "- State-of-the-art performance for countless real-world tasks (too many to list)\n",
    "- Huge investments from industry (Google, Facebook, Apple etc.)\n",
    "- Many new Deep Learning start-ups\n",
    "- Very active and open research community\n",
    "- \"There's something magical about Recurrent Neural Networks\" -- Andrej Karpathy "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/aihires.png\" width=\"1000\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/countries_capitals.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/w2v.png\" width=1000/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/emojis.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/atari.gif\" width=1000/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/kubrick.jpg\" width=1000/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/style_transfer_1.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/style_transfer_2.png\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/alphago.jpg\" width=\"600\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Continuous Optimization, Modularity and Backpropagation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Preliminaries: Model\n",
    "\n",
    "Change of notation: \n",
    "$$\n",
    "s_\\params(\\x,y) \\in \\mathbb{R}\n",
    "$$\n",
    "becomes\n",
    "$$\n",
    "f_\\params(\\x)_y \\in  \\mathbb{R}\n",
    "$$\n",
    "\n",
    "where $f_\\params(\\x) \\in \\mathbb{R}^{|\\Ys|}$ represents the scores for each possible solution $y$\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Preliminaries: Model\n",
    "\n",
    "- Model: some function $f_\\theta$ parameterized by $\\theta$ that we want to learn from data $\\mathcal{D}=\\{(x_i,y_i)\\}$, for example\n",
    "  - Linear Regression\n",
    "$$\n",
    "f_\\theta(\\mathbf{x}) = \\mathbf{Wx} + \\mathbf{b} \\quad\\text{with }\\theta = \\{\\mathbf{W}, \\mathbf{b}\\}\n",
    "$$\n",
    "\n",
    "  - Logistic Regression\n",
    "$$\n",
    "f_\\theta(\\mathbf{x}) = \\frac{1}{1 + e^{-(\\mathbf{Wx} + \\mathbf{b})}} \\quad\\text{with }\\theta = \\{\\mathbf{W}, \\mathbf{b}\\}\n",
    "$$\n",
    "  - 3-layer Perceptron\n",
    "$$\n",
    "f_\\theta(\\mathbf{x}) = \\text{tanh}(\\mathbf{W}_3\\text{tanh}(\\mathbf{W}_2\\text{tanh}(\\mathbf{W}_1\\mathbf{x} + \\mathbf{b}_1)+\\mathbf{b}_2)+\\mathbf{b}_3)\\\\ \\quad\\text{with }\\theta = \\{\\mathbf{W}_1, \\mathbf{W}_2, \\mathbf{W}_3, \\mathbf{b}_1, \\mathbf{b}_2, \\mathbf{b}_3\\}\n",
    "$$  \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Preliminaries: Loss Functions\n",
    "A function $\\mathcal{L}$ that given a model $f_\\theta$, input $x$ and gold output $y$ measures how far we are away from the truth, for example \n",
    "  - Squared distance\n",
    "$$\n",
    "\\mathcal{L}(f_\\theta, x, y) = ||f_\\theta(x) - y||^2\n",
    "$$\n",
    "  - Logistic\n",
    "$$\n",
    "\\mathcal{L}(f_\\theta, x, y) = \\log(1 + e^{-yf_\\theta(x)})\n",
    "$$  \n",
    "  - Hinge\n",
    "$$\n",
    "\\mathcal{L}(f_\\theta, x, y) = \\max(0,1-yf_\\theta(x))\n",
    "$$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Stochastic Gradient Descent\n",
    "\n",
    "Goal: find parameters $\\theta$ of model $f_\\theta$ that minimize loss function $\\mathcal{L}$\n",
    "\n",
    "1. Initialize parameters $\\theta$\n",
    "2. Shuffle training data $\\mathcal{D}$\n",
    "  - For every example $(x_i,y_i) \\in \\mathcal{D}$ \n",
    "    1. Find direction of parameters that improves loss \n",
    "      - Calculate gradient of loss w.r.t. parameters $\\frac{\\partial \\mathcal{L}(f_\\theta, x_i, y_i)}{\\partial \\theta}$\n",
    "    2. Update parameters with learning rate $\\alpha$  \n",
    "      - $\\theta := \\theta - \\alpha*\\frac{\\partial \\mathcal{L}(f_\\theta, x_i, y_i)}{\\partial \\theta}$\n",
    "  - Go to 2.    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/surface.png\" width=1000/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/momentum.gif\" width=800/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Perceptron: A Single Neuron"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": true
   },
   "source": [
    "<img src=\"../img/single_neuron.svg\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\\begin{align}\n",
    "z &= \\text{sigmoid}(x_1*w_1 + x_2*w_2 + x_3*w_3 + x_4*w_4 + b)\\\\\n",
    "  &= \\text{sigmoid}(\\mathbf{x}\\cdot\\mathbf{w} + b) \\quad\\text{with }\\mathbf{x},\\mathbf{w}\\in\\mathbb{R}^4\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": true,
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Multiple Neurons"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "<img src=\"../img/multiple_neurons.svg\" width=800/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": true
   },
   "source": [
    "\\begin{align}\n",
    "z_1 &= \\text{sigmoid}(\\mathbf{x}\\cdot\\mathbf{w_1} + b_1)\\\\\n",
    "z_2 &= \\text{sigmoid}(\\mathbf{x}\\cdot\\mathbf{w_2} + b_2)\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Multiple Neurons\n",
    "\n",
    "$f_\\theta: \\mathbb{R}^4 \\to \\mathbb{R}^2$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": true
   },
   "source": [
    "<img src=\"../img/multiple_neurons_2.svg\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": true
   },
   "source": [
    "\\begin{align}\n",
    "\\mathbf{z} &= \\text{sigmoid}(\\mathbf{W}\\mathbf{x} + \\mathbf{b}) \\quad\\text{ with } \\mathbf{W}\\in\\mathbb{R}^{2\\times4}, \\mathbf{b}\\in\\mathbb{R}^{2}\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Modularity: Multi-layer Perceptron"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": false
   },
   "source": [
    "<img src=\"../img/mlp.svg\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=right><div class=top><div class=small>\n",
    "<div style=\"margin-bottom: 60%;\"></div>\n",
    "\\begin{align}\n",
    "f_{1,\\theta} &: \\mathbb{R}^5 \\to \\mathbb{R}^3\\\\\n",
    "f_{2,\\theta} &: \\mathbb{R}^3 \\to \\mathbb{R}^3\\\\\n",
    "f_{3,\\theta} &: \\mathbb{R}^3 \\to \\mathbb{R}^1\\\\\n",
    "g_\\theta &= f_{3,\\theta} \\circ f_{2,\\theta} \\circ f_{1,\\theta}\\\\\n",
    "g_\\theta(\\mathbf{x}) &= f_{3,\\theta}(f_{2,\\theta}(f_{1,\\theta}(\\mathbf{x})))\\\\\n",
    "g_\\theta &: \\mathbb{R}^5 \\to \\mathbb{R}^1\n",
    "\\end{align}\n",
    "</div>\n",
    "</div>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Calculation of Gradients\n",
    "<br>\n",
    "<div class=verysmall>\n",
    "\\begin{align}\n",
    "g_\\theta(\\mathbf{x}) &= \\text{sigmoid}(\\mathbf{W}^{1\\times 3}_3\\text{sigmoid}(\\mathbf{W}^{3\\times 3}_2\\text{sigmoid}(\\mathbf{W}^{3\\times 5}_1\\mathbf{x}+\\mathbf{b}_1)+\\mathbf{b}_2)+\\mathbf{b}_3)\\\\\n",
    "\\frac{\\partial \\mathcal{L}(f_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{W}^{1\\times 3}_3} &= \\text{ ?}\\\\\n",
    "\\frac{\\partial \\mathcal{L}(f_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{b}_3} &= \\text{ ?}\\\\\n",
    "\\frac{\\partial \\mathcal{L}(f_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{W}^{3\\times 3}_2} &= \\text{ ?}\\\\\n",
    "\\frac{\\partial \\mathcal{L}(f_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{b}_2} &= \\text{ ?}\\\\\n",
    "\\frac{\\partial \\mathcal{L}(f_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{W}^{3\\times 5}_1} &= \\text{ ?}\\\\\n",
    "\\frac{\\partial \\mathcal{L}(f_\\theta, \\mathbf{x}, \\mathbf{y})}{\\partial \\mathbf{b}_1} &= \\text{ ?}\n",
    "\\end{align}\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Chain Rule\n",
    "\n",
    "\\begin{align}\n",
    "\\frac{\\partial f \\circ g}{\\partial \\theta} &= \\frac{\\partial f \\circ g}{\\partial g} \\frac{\\partial g}{\\partial \\theta}\\\\\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "<div class=small>\n",
    "\\begin{align}\n",
    "\\frac{\\partial \\mathcal{L}(\\text{sigmoid}(\\mathbf{W}\\mathbf{x}),\\mathbf{y})}{\\partial \\mathbf{W}} &= \\frac{\\partial \\mathcal{L}(\\text{sigmoid}(\\mathbf{W}\\mathbf{x}),\\mathbf{y})}{\\partial \\text{ sigmoid}(\\mathbf{W}\\mathbf{x})} \\frac{\\partial \\text{ sigmoid}(\\mathbf{W}\\mathbf{x})}{\\partial \\mathbf{Wx}} \\frac{\\partial{\\mathbf{Wx}}}{\\partial\\mathbf{W}}\n",
    "\\end{align}\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "\\begin{align}\n",
    "\\mathbf{h} &= \\mathbf{W}\\mathbf{x}\\\\\n",
    "\\mathbf{z} &= \\text{sigmoid}(\\mathbf{h})\\\\\n",
    "\\mathcal{L}(\\mathbf{z},\\mathbf{y}) &= \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "\\begin{align}\n",
    "\\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial \\mathbf{W}} &= \\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial\\mathbf{z}} \\frac{\\partial\\mathbf{z}}{\\partial \\mathbf{h}} \\frac{\\partial \\mathbf{h}}{\\partial \\mathbf{W}}\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Example cont."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "\\begin{align}\n",
    "\\mathbf{h} &= \\mathbf{W}\\mathbf{x}\\\\\n",
    "\\mathbf{z} &= \\text{sigmoid}(\\mathbf{h})\\\\\n",
    "\\mathcal{L}(\\mathbf{z},\\mathbf{y}) &= \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "\\begin{align}\n",
    "\\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial \\mathbf{W}} &= \\frac{\\partial \\frac{1}{2}||\\mathbf{z} - \\mathbf{y}||^2}{\\partial\\mathbf{z}} \\frac{\\partial\\mathbf{z}}{\\partial \\mathbf{h}} \\frac{\\partial \\mathbf{h}}{\\partial \\mathbf{W}}\\\\\n",
    "\\partial \\mathbf{z} &= \\mathbf{z}-\\mathbf{y}\\\\\n",
    "\\partial \\mathbf{h} &= \\partial \\mathbf{z}\\,\\text{sigmoid}(\\mathbf{h})\\,(1 - \\text{sigmoid}(\\mathbf{h}))\\\\\n",
    "\\partial \\mathbf{W} &= \\partial\\mathbf{h}\\otimes\\mathbf{x}\n",
    "\\end{align}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Module"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": false
   },
   "source": [
    "<img src=\"../img/dl_module.svg\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Backpropagation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hide_input": false
   },
   "source": [
    "<img src=\"../img/backprop.svg\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Deep Learning Libraries\n",
    "- pytorch\n",
    "- dynet\n",
    "- Theano\n",
    "- DeepLearning4J\n",
    "- autograd\n",
    "- **TensorFlow**\n",
    "- ..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "<img  src=\"../img/tensorflow.jpg\"/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "seed = 0  # op-level seed so the initial weights are reproducible\n",
    "#input: one column vector of input_sz features\n",
    "input_sz = 3\n",
    "output_sz = 1\n",
    "# explicit dtype and shape, in the same style as the MLP placeholder later on;\n",
    "# the string \"float\" used before denotes the same dtype as tf.float32\n",
    "x = tf.placeholder(tf.float32, shape=[input_sz, 1])\n",
    "#parameters\n",
    "W = tf.Variable(tf.random_uniform([output_sz, input_sz], -0.1, 0.1, seed=seed))\n",
    "b = tf.Variable(tf.zeros([output_sz]))\n",
    "#f_theta\n",
    "z = tf.nn.sigmoid(tf.matmul(W, x) + b)  # sigmoid(Wx + b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-0.07982747,  0.09403337,  0.06975283]], dtype=float32)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sess = tf.Session()\n",
    "# initialize W and b, then fetch the freshly initialized weights\n",
    "sess.run(fetches=tf.global_variables_initializer())\n",
    "sess.run(fetches=W)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.], dtype=float32)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# b was created with tf.zeros, so it starts out at zero\n",
    "sess.run(fetches=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Logistic Regression cont."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Forward: $\\mathbf{z} = f_\\theta(\\mathbf{x})$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.64387923]], dtype=float32)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sess.run(\n",
    "    z,\n",
    "    feed_dict={x: [[-5.5], [2.0], [-0.5]]},  # a single 3x1 column vector\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "Backward: $\\partial\\mathbf{W},\\partial\\mathbf{b},\\partial\\mathbf{x}$ given upstream gradient $\\partial\\mathbf{z}$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([[-0.13708647,  0.04984963, -0.01246241]], dtype=float32),\n",
       " array([ 0.02492481], dtype=float32),\n",
       " array([[ 0.00034354],\n",
       "        [-0.00093437],\n",
       "        [-0.00204336]], dtype=float32)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sess.run(tf.global_variables_initializer())\n",
    "gradz = [[0.1]]  # upstream gradient passed in for z\n",
    "# gradients of z with respect to W, b and x, in that order\n",
    "grad = tf.gradients(ys=z, xs=[W, b, x], grad_ys=gradz)\n",
    "sess.run(grad, feed_dict={x: [[-5.5], [2.0], [-0.5]]})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Multi-layer Perceptron"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.35592151]], dtype=float32)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#input: a 5x1 column vector\n",
    "x = tf.placeholder(dtype=tf.float32, shape=[5, 1])\n",
    "#parameters of the three layers (5 -> 3 -> 3 -> 1)\n",
    "W1 = tf.Variable(tf.random_uniform(shape=[3, 5], seed=seed))\n",
    "b1 = tf.Variable(tf.zeros(shape=[3, 1]))\n",
    "W2 = tf.Variable(tf.random_uniform(shape=[3, 3], seed=seed))\n",
    "b2 = tf.Variable(tf.zeros(shape=[3, 1]))\n",
    "W3 = tf.Variable(tf.random_uniform(shape=[1, 3], seed=seed))\n",
    "b3 = tf.Variable(tf.zeros(shape=[1, 1]))\n",
    "#model: two sigmoid hidden layers followed by a linear output\n",
    "h1 = tf.sigmoid(tf.add(tf.matmul(W1, x), b1))\n",
    "h2 = tf.sigmoid(tf.add(tf.matmul(W2, h1), b2))\n",
    "mlp_z = tf.add(tf.matmul(W3, h2), b3)\n",
    "\n",
    "#initialize the variables, then evaluate the network on one example\n",
    "sess.run(tf.global_variables_initializer())\n",
    "x_value = [[-5.5], [2.0], [-0.5], [2.0], [4.0]]\n",
    "sess.run(mlp_z, feed_dict={x: x_value})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "hideOutput": true,
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.07483862]]\n",
      "[[ 0.00129254]]\n",
      "[[  7.06945139e-06]]\n",
      "[[  3.01882075e-08]]\n",
      "[[  1.25567112e-10]]\n"
     ]
    }
   ],
   "source": [
    "n_epochs = 5   # number of optimisation steps on the single training example\n",
    "log_every = 1  # print the loss every `log_every` steps\n",
    "\n",
    "target_z = tf.constant([[1.0]]) # what the output should be\n",
    "loss = tf.square(target_z - mlp_z) # the loss function\n",
    "optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)\n",
    "opt_op = optimizer.minimize(loss) # the TF operation that performs optimisation steps\n",
    "# minimize() adds the optimizer's own state variables, so the initializer has to\n",
    "# run again; this also resets the MLP parameters to their initial values\n",
    "sess.run(tf.global_variables_initializer())\n",
    "for epoch in range(n_epochs):\n",
    "    _, loss_value = sess.run([opt_op, loss], feed_dict={x: x_value})\n",
    "    if epoch % log_every == 0:\n",
    "        print(loss_value)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "It learned!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.99999923]], dtype=float32)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# evaluate the network on the training input once more\n",
    "sess.run(fetches=mlp_z, feed_dict={x: x_value})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Next\n",
    "\n",
    "Inputs are always (continuous) **vectors**. \n",
    "\n",
    "What vectors to use in NLP?  "
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "latex_envs": {
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 0
  },
  "livereveal": {
   "theme": "white",
   "transition": "concave"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}