{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%html\n", "\n", "
" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "import random\n", "from IPython.display import Image" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "# Configuration" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "hide_input": false, "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "application/javascript": [ "require(['base/js/utils'],\n", "function(utils) {\n", " utils.load_extensions('calico-spell-check', 'calico-document-tools', 'calico-cell-tools');\n", "});\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%javascript\n", "require(['base/js/utils'],\n", "function(utils) {\n", " utils.load_extensions('calico-spell-check', 'calico-document-tools', 'calico-cell-tools');\n", "});" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "hide_input": false, "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "text/plain": [ "{'theme': 'white',\n", " 'transition': 'none',\n", " 'controls': 'true',\n", " 'progress': 'true'}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reveal.js\n", "from notebook.services.config import ConfigManager\n", "cm = ConfigManager()\n", "cm.update('livereveal', {\n", " 'theme': 'white',\n", " 'transition': 'none',\n", " 'controls': 'true',\n", " 'progress': 'true',\n", "})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "hide_input": true, "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "%load_ext tikzmagic" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "hide_input": true, "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%html\n", "" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Recurrent Neural Networks\n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "* Recurrent Neural Network (RNN) Language Models\n", "* Training Problems and Solutions\n", " - Vanishing and Exploding Gradients\n", " - Long Short-Term Memory (LSTM) Networks\n", "* RNN applications / Flavours of RNNs\n", "* Variations (characters, bi-directional RNNs)\n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "# Reminder: Language Models (LM)\n", "\n", "A LM computes a **probability** for a **sequence of words**\n", "\n", "$$p(\\langle w_{1}, \\ldots, w_{d} \\rangle)$$\n", "\n", "Useful in a myriad of NLP tasks involving text generation, e.g.\n", "- Machine Translation,\n", "- Speech Recognition, \n", "- Summarisation.. 
\n", "\n", "$$\n", "\\begin{aligned}\n", "p(\\langle \\text{Statistical}, \\text{Natural}, \\text{Language}, \\text{Processing} \\rangle) > \\\\\n", "p(\\langle \\text{Statistical}, \\text{Language}, \\text{Natural}, \\text{Processing} \\rangle)\n", "\\end{aligned}\n", "$$" ] }, { "cell_type": "markdown", "metadata": { "hide_input": false, "slideshow": { "slide_type": "subslide" } }, "source": [ "# Reminder: $n$-Gram Language Models\n", "\n", "In *$n$-gram language models*, the probability $p(w_{1}, \\ldots, w_{d}) = \\prod_{i=1}^{d} p(w_{i} \\mid w_{1}, \\ldots, w_{i - 1})$ is **approximated** as:\n", "\n", "$$\n", "\\begin{aligned}\n", "p(w_{1}, \\ldots, w_{d}) & \\approx \\prod_{i=1}^{d} p(w_{i} \\mid w_{i - (n - 1)}, \\ldots, w_{i - 1}) \\\\\n", "& \\approx \\prod_{i=1}^{d} \\frac{\\text{count}(w_{i - (n - 1)}, \\ldots, w_{i})}{\\text{count}(w_{i - (n - 1)}, \\ldots, w_{i - 1})}\n", "\\end{aligned}\n", "$$\n", "\n", "Example with a **bigram** ($n = 2$) **language model**:\n", "\n", "$$\n", "\\begin{aligned}\n", " p(\\langle \\text{Natural}, & \\text{Language}, \\text{Processing} \\rangle) \\approx \\\\\n", " & p(\\text{Natural}){}\\cdot{}p(\\text{Language} \\mid \\text{Natural}) \\cdot p(\\text{Processing} \\mid \\text{Language})\n", "\\end{aligned}\n", "$$" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Disadvantages of $n$-Gram Language Models\n", "\n", "* Limited and small context window\n", "* No generalization to similar words and contexts (essentially one-hot featurisation)\n", "\n", "How can we address these shortcomings?" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Reminder: Feed-forward Neural Networks\n", "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "But how do we use a neural network to encode a **sequence**?" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Recurrent Neural Networks\n", "\n", "* RNNs share the weights at each time step\n", "* The output $y_{t}$ at time $t$ depends on all previous words: $w_{t}, w_{t - 1}, \\ldots, w_{1}$\n", "* Size scales with **number of words**, not **sequence length**!" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hide_input": false, "slideshow": { "slide_type": "fragment" } }, "source": [ "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "
\n", "
\n", "\\begin{align}\n", "\\mathbf{h}_t &= f_{\\theta}(\\mathbf{x}_{t}, \\mathbf{h}_{t - 1}) \\\\\n", " f_{\\theta} \\; & \\text{is a } \\textbf{transition function} \\text { with parameters } \\theta\\\\\n", " \\theta \\; & \\text{can be } \\textbf{learned from data}\\\\\n", "\\end{align}\n", "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Recurrent Neural Network Language Model\n", "\n", "
" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "slide" } }, "source": [ "# A Recurrent Neural Network LM\n", "\n", "Consider the following sentence:\n", "\n", "$$\\langle w_{1}, \\ldots, w_{t - 1}, w_{t}, w_{t + 1}, \\ldots, w_{d})$$\n", "\n", "First apply word embedding (e.g., [word2vec](dl-representations_simple.ipynb)):\n", "$$\n", "\\begin{aligned}\n", " \\mathbf{x}_{1} & = \\text{encode}(w_{t}) \\in \\mathbb{R}^{d_{e}}\\\\\n", "\\end{aligned}\n", "$$" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "fragment" } }, "source": [ "At each single time step $t$, the hidden state $\\mathbf{h}_t$ is given by:\n", "\n", "$$\n", "\\begin{aligned}\n", "\\mathbf{h}_t & = \\sigma(\\mathbf{W}^h \\mathbf{h}_{t-1}+ \\mathbf{W}^x \\mathbf{x}_t) \\in \\mathbb{R}^{d_{h}}\\\\\n", "\\end{aligned}\n", "$$\n", "Input matrix: $\\mathbf{W}^x \\in \\mathbb{R}^{d_{h} \\times d_{x}}$\n", "\n", "Transition matrix: $\\mathbf{W}^h \\in \\mathbb{R}^{d_{h} \\times d_{h}}$" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "subslide" } }, "source": [ "The output $\\hat{\\mathbf{y}}_t \\in [0, 1]^{|V|}$, a **probability distribution** over words in $V$, is given by:\n", "\n", "$$\n", "\\begin{aligned}\n", "\\hat{\\mathbf{y}}_{t} & = \\text{softmax}(\\mathbf{W}^o \\mathbf{h}_{t}) \\\\\n", "\\end{aligned}\n", "$$\n", "\n", "Output matrix: $\\mathbf{W}^o \\in \\mathbb{R}^{|V| \\times d_{h}}$\n", "\n", "and softmax is defined as:\n", "$$\n", "\\text{softmax}(z) = \\frac{1}{\\sum_{i=1}^{d_z} e^{z_i}} \\left(e^{z_1}, e^{z_2},\\ldots,e^{z_{d_z}}\\right)\n", "$$" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "fragment" } }, "source": [ "The probability that the $t$-th word in the sequence is $w_{j}$ is given by:\n", "\n", "$$p(w_{j} \\mid w_{t}, \\ldots, w_{1}) = \\hat{\\mathbf{y}}_{t, j}$$" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "subslide" } }, "source": [ "# Example\n", "\n", "Consider the word sequence $\\text{encode}(\\text{Natural}, \\text{Language}, \\text{Processing}) \\rightarrow (\\mathbf{x}_{1}, \\mathbf{x}_{2}, \\mathbf{x}_{3})$\n", "\n", "Reminder: $\\mathbf{h}_t = \\sigma(\\mathbf{W}^h \\mathbf{h}_{t-1}+ \\mathbf{W}^x \\mathbf{x}_t + \\mathbf{b})$\n", "\n", "$$\n", "\\begin{aligned}\n", " \\mathbf{h}_1 = \\sigma(\\mathbf{W}^h \\mathbf{h}_{0} + \\mathbf{W}^x \\mathbf{x}_1) &\\;& \\hat{\\mathbf{y}}_{1} = \\text{softmax}(\\mathbf{W}^o \\mathbf{h}_{1}) \\\\\n", " \\mathbf{h}_2 = \\sigma(\\mathbf{W}^h \\mathbf{h}_{1} + \\mathbf{W}^x \\mathbf{x}_2) &\\;& \\hat{\\mathbf{y}}_{2} = \\text{softmax}(\\mathbf{W}^o \\mathbf{h}_{2}) \\\\\n", " \\mathbf{h}_3 = \\sigma(\\mathbf{W}^h \\mathbf{h}_{2} + \\mathbf{W}^x \\mathbf{x}_3) &\\;& \\hat{\\mathbf{y}}_{3} = \\text{softmax}(\\mathbf{W}^o \\mathbf{h}_{3}) \\\\\n", "\\end{aligned}\n", "$$\n", "\n", "$$p(\\text{Natural}, \\text{Language}, \\text{Processing}) = \\hat{\\mathbf{y}}_{1, [\\text{Natural}]} \\; \\hat{\\mathbf{y}}_{2, [\\text{Language}]} \\; \\hat{\\mathbf{y}}_{3, [\\text{Processing}]}$$\n", "\n", "- Initial state: $\\mathbf{h}_{0} \\in \\mathbb{R}^{d_{h}}$\n", "- Input matrix: $\\mathbf{W}^x \\in \\mathbb{R}^{d_{h} \\times d_{x}}$, Transition matrix: $\\mathbf{W}^h \\in \\mathbb{R}^{d_{h} \\times d_{h}}$, Output matrix: $\\mathbf{W}^o \\in \\mathbb{R}^{|V| \\times d_{h}}$" ] }, { "cell_type": "markdown", "metadata": { 
"slideshow": { "slide_type": "subslide" } }, "source": [ "
\n", "\n", "# [tinyurl.com/diku-nlp-unk](https://tinyurl.com/diku-nlp-unk)\n", "([Responses](https://docs.google.com/forms/d/1Ze1pUQkraElKVcMU8zsNNe7GFmRQluffFi-J_aIvvCk/edit#responses))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Objective Function\n", "\n", "Recall that $\\hat{\\mathbf{y}}_{t} \\in \\mathbb{R}^{|V|}$ is a probability distribution over the vocabulary $V$.\n", "\n", "We can train a RNN by minimizing the **cross-entropy loss**, predicting **words** instead of classes:\n", "\n", "$$\n", "\\begin{aligned}\n", "J_{t} = - \\sum_{i = 1}^{|V|} \\mathbf{y}_{t, i} \\log \\hat{\\mathbf{y}}_{t, i}, \\quad \\text{where} \\quad \\mathbf{y}_{t, i} = \\left\\{\\begin{array}{ll}1 \\; \\text{if the $t$-th word is $w_{i}$,}\\\\0 \\, \\text{otherwise.}\\end{array} \\right.\n", "\\end{aligned}\n", "$$" ] }, { "cell_type": "markdown", "metadata": { "hide_input": false, "slideshow": { "slide_type": "slide" } }, "source": [ "# Problem - Training RNNs is Hard\n", "\n", "- **Vanishing** and **exploding** gradients [Pascanu et al. 2013].\n", "\n", "Why? Multiply the same matrix $\\mathbf{W}^{h}$ at each time step during forward propagation. The norm of the gradient might either tend to 0 (**vanish**) or be too large (**explode**).\n", "\n", "
\n", "\n", "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Related Problem - Long-Term Dependencies\n", "\n", "Words from time steps far away are hardly considered when training to predict the next word.\n", "\n", "Example:\n", "- John walked to the hallway.\n", "- Mary walked in too.\n", "- Daniel moved to the garden.\n", "- John said \"Hi\" to \\_\\_\\_\\_." ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "or\n", "\n", "- When I moved to France, I quickly ran into difficulties communicating, because I don't speak any \\_\\_\\_\\_." ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "subslide" } }, "source": [ "# Vanishing/Exploding Gradients - Solutions\n", "\n", "Several solutions in the literature:\n", "\n", "- Bound the gradient to a threshold (**Gradient Clipping**)
[Pascanu et al. 2013]\n", "\n", "- Use $\\text{ReLU}(x) = \\max(0, x)$ (**Re**ctified **L**inear **U**nits) or similar non-linearities instead of $\\text{sigmoid}(x)$ or $\\text{tanh}(x)$
[Glorot et al. 2011].\n", "\n", "- Clever initialization of the transition matrix ($\\mathbf{W}^h = \\mathbf{I}$)
[Socher et al. 2013, Le et al. 2015].\n", "\n", "- Use different recurrent models that favour backpropagation
LSTM [Hochreiter &amp; Schmidhuber 1997], GRU [Chung et al. 2014]." ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Long Short-Term Memory (LSTM) Networks\n", "\n", "- Can adaptively learn what to **keep** (store) in memory (gate $\\mathbf{i}_{t}$), **forget** (gate $\\mathbf{f}_{t}$) and **output** (gate $\\mathbf{o}_{t}$)" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hide_input": true, "slideshow": { "slide_type": "fragment" } }, "source": [ "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Long Short-Term Memory (LSTM) Networks\n", "\n", "- Can adaptively learn what to **keep** (store) into memory (gate $\\mathbf{i}_{t}$), **forget** (gate $\\mathbf{f}_{t}$) and **output** (gate $\\mathbf{o}_{t}$)\n", "- Memory from current time step: $\\mathbf{c}_{t}$; hidden state from previous time step: $h_{t-1}$\n", "- Element-wise multiplication: x; element-wise summation: +\n", "
" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "fragment" } }, "source": [ "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Three Flavours of RNNs in NLP ##\n", "\n", "Encoder-only:\n", "* Many-to-one (sequence classification)\n", "* One-to-one (sequence tagging/labeling)\n", "\n", "Encoder-decoder:\n", "* Many-to-many (sequence-to-sequence or seq2seq)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Many-to-one ##\n", "\n", "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## One-to-one ##\n", "
\n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Many-to-many ##\n", "
" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "slide" } }, "source": [ "# Sequence-to-Sequence Models\n", "\n", "Recurrent Neural Networks are extremely powerful and flexible\n", "- They can also learn to **generate** sequences" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "fragment" } }, "source": [ "Seq2Seq models are composed by:\n", "- **Encoder** - Gets the input and outputs $\\mathbf{v} \\in \\mathbb{R}^{d}$\n", "- **Decoder** - Gets $\\mathbf{v}$ and generates the output sequence" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "fragment" } }, "source": [ "Seq2Seq models are widely popular in e.g.:\n", "- *Neural Machine Translation*\n", "- *Text Summarisation*\n", "- *Learning to Execute*" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "Problem - for sequence labeling, you may need to incorporate information from both the **preceding** and **following** words. Can RNNs do that?" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Bidirectional RNNs\n", "
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "- $\\overleftarrow{\\mathbf{h}}_t$ and $\\overrightarrow{\\mathbf{h}}_t$ represent (summarise) both the **past** and the **future** around a given sequence element." ] }, { "cell_type": "markdown", "metadata": { "hide_input": false, "slideshow": { "slide_type": "subslide" } }, "source": [ "# Stacking (Deep LSTMs)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Image(url='../img/deep_bilstm.png'+'?'+str(random.random()), width=800)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Levels of Granularity\n", "\n", "RNNs with tokenization at the character level for:\n", "\n", "* Language modeling ([Karpathy, 2015](http://karpathy.github.io/2015/05/21/rnn-effectiveness/))\n", "* Word representation (e.g., [Pinter et al. 2017](https://aclanthology.org/D17-1010/))\n", "\n", "RNNs for sentence representation:\n", "* Skip-Thought Vectors ([Kiros et al., 2015](https://proceedings.neurips.cc/paper/2015/file/f442d33fa06832082290ad8544a8da27-Paper.pdf))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Image(url='../img/pinter_char_rnn.png'+'?'+str(random.random()), width=400)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Summary\n", "\n", "* RNNs are *sequence models* that take word embeddings and encode whole sequences\n", "* They can be used as language models with unbounded (but decaying) history window\n", "* Many architectural variations:\n", " * LSTM\n", " * BiLSTM\n", " * Stacked (multi-layer) (Bi)LSTM\n", "* Many possible use cases:\n", " * One-to-one\n", " * One-to-many\n", " * Many-to-many (sequence-to-sequence)" ] }, { "cell_type": "markdown", "metadata": { "hide_input": true, "slideshow": { "slide_type": "slide" } }, "source": [ " # Additional Reading #\n", "* [*The Unreasonable Effectiveness of Recurrent Neural Networks.* Blog post by Andrej Karpathy](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)\n", "* [Jurafsky & Martin (see Chapter 9)](https://web.stanford.edu/~jurafsky/slp3/9.pdf)" ] } ], "metadata": { "celltoolbar": "Slideshow", "hide_input": true, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" }, "latex_envs": { "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 0 }, "livereveal": { "theme": "white", "transition": "concave" } }, "nbformat": 4, "nbformat_minor": 1 }