{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "rnn-language-model.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": { "id": "53ZwO7dzC8kj" },
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pantelis-nlp/tutorial-nlp-notebooks/blob/main/rnn_language_model.ipynb)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": { "id": "qaJ0WRAhG0cA" },
      "source": [
        "data = 'Chios island is crescent or kidney shaped, 50 km (31 mi) long from north to south, and 29 km (18 mi) at its widest, covering an area of 842.289 km2 (325.210 sq mi).[2] The terrain is mountainous and arid, with a ridge of mountains running the length of the island. The two largest of these mountains, Pelineon (1,297 m (4,255 ft)) and Epos (1,188 m (3,898 ft)), are situated in the north of the island. The center of the island is divided between east and west by a range of smaller peaks, known as Provatas.'\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "7fAPuTnEF8Xj" },
      "source": [
        "# see here for notation http://cs231n.stanford.edu/slides/2018/cs231n_2018_lecture10.pdf\n",
        "\"\"\"\n",
        "Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)\n",
        "BSD License\n",
        "\"\"\"\n",
        "import numpy as np\n",
        "\n",
        "# data I/O\n",
        "#data = open('input.txt', 'r').read() # should be simple plain text file - you can use any (small) file in txt format from the web or type your own.\n",
        "\n",
        "chars = list(set(data))\n",
        "data_size, vocab_size = len(data), len(chars)\n",
        "print('data has %d characters, %d unique.' % (data_size, vocab_size))\n",
        "char_to_ix = { ch:i for i,ch in enumerate(chars) }\n",
        "ix_to_char = { i:ch for i,ch in enumerate(chars) }\n",
        "\n",
        "# hyperparameters\n",
        "hidden_size = 100 # size of hidden layer of neurons\n",
        "seq_length = 25 # number of steps to unroll the RNN for\n",
        "learning_rate = 1e-1\n",
        "\n",
        "# model parameters\n",
        "Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden\n",
        "Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden\n",
        "Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output\n",
        "bh = np.zeros((hidden_size, 1)) # hidden bias\n",
        "by = np.zeros((vocab_size, 1)) # output bias\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "IJM73XLoY0q3" },
      "source": [
        "def sample(h, seed_ix, n):\n",
        "  \"\"\"\n",
        "  sample a sequence of integers from the model\n",
        "  h is memory state, seed_ix is seed letter for first time step\n",
        "  \"\"\"\n",
        "  x = np.zeros((vocab_size, 1))\n",
        "  x[seed_ix] = 1\n",
        "  ixes = []\n",
        "  for t in range(n):\n",
        "    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)\n",
        "    y = np.dot(Why, h) + by\n",
        "    p = np.exp(y) / np.sum(np.exp(y))\n",
        "    ix = np.random.choice(range(vocab_size), p=p.ravel())\n",
        "    x = np.zeros((vocab_size, 1))\n",
        "    x[ix] = 1\n",
        "    ixes.append(ix)\n",
        "  return ixes"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "XU236iBMYtWS" },
      "source": [
        "def lossFun(inputs, targets, hprev):\n",
        "  \"\"\"\n",
        "  inputs,targets are both list of integers.\n",
        "  hprev is Hx1 array of initial hidden state\n",
        "  returns the loss, gradients on model parameters, and last hidden state\n",
        "  \"\"\"\n",
        "\n",
        "  xs, hs, ys, ps = {}, {}, {}, {}\n",
        "  hs[-1] = np.copy(hprev)\n",
        "  loss = 0\n",
        "  # forward pass\n",
        "  for t in range(len(inputs)):\n",
        "    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation\n",
        "    xs[t][inputs[t]] = 1\n",
        "    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state\n",
        "    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars\n",
        "    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars\n",
        "    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)\n",
        "  # backward pass: compute gradients going backwards\n",
        "  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)\n",
        "  dbh, dby = np.zeros_like(bh), np.zeros_like(by)\n",
        "  dhnext = np.zeros_like(hs[0])\n",
        "  for t in reversed(range(len(inputs))):\n",
        "    dy = np.copy(ps[t])\n",
        "    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here\n",
        "    dWhy += np.dot(dy, hs[t].T)\n",
        "    dby += dy\n",
        "    dh = np.dot(Why.T, dy) + dhnext # backprop into h\n",
        "    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity\n",
        "    dbh += dhraw\n",
        "    dWxh += np.dot(dhraw, xs[t].T)\n",
        "    dWhh += np.dot(dhraw, hs[t-1].T)\n",
        "    dhnext = np.dot(Whh.T, dhraw)\n",
        "  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:\n",
        "    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients\n",
        "  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "qkOdfAJPY81L" },
      "source": [
        "n, p = 0, 0\n",
        "mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)\n",
        "mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad\n",
        "smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0\n",
        "\n",
        "while True:\n",
        "  # prepare inputs (we're sweeping from left to right in steps seq_length long)\n",
        "  if p+seq_length+1 >= len(data) or n == 0:\n",
        "    hprev = np.zeros((hidden_size,1)) # reset RNN memory\n",
        "    p = 0 # go from start of data\n",
        "  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
        "  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
        "\n",
        "  # sample from the model now and then\n",
        "  if n % 1000 == 0:\n",
        "    sample_ix = sample(hprev, inputs[0], 200)\n",
        "    txt = ''.join(ix_to_char[ix] for ix in sample_ix)\n",
        "    print('----\\n %s \\n----' % (txt, ))\n",
        "\n",
        "  # forward seq_length characters through the net and fetch gradient\n",
        "  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)\n",
        "  smooth_loss = smooth_loss * 0.999 + loss * 0.001\n",
        "  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress\n",
        "\n",
        "  # perform parameter update with Adagrad\n",
        "  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],\n",
        "                                [dWxh, dWhh, dWhy, dbh, dby],\n",
        "                                [mWxh, mWhh, mWhy, mbh, mby]):\n",
        "    mem += dparam * dparam\n",
        "    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update\n",
        "\n",
        "  p += seq_length # move data pointer\n",
        "  n += 1 # iteration counter"
      ],
      "execution_count": null,
      "outputs": []
    },
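    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A small usage sketch, not part of the original script: the training cell above loops indefinitely, so after interrupting it (Runtime > Interrupt execution) the learned weights `Wxh`, `Whh`, `Why`, `bh`, `by` and the last hidden state `hprev` are still in memory, and the `sample` function defined earlier can generate a longer piece of text. The seed character `'C'` and the 500-character length below are arbitrary choices."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Usage sketch (assumes the training cell above was run for a while and then\n",
        "# interrupted, so the weight matrices and hprev hold trained values).\n",
        "seed_char = 'C'  # arbitrary seed; must be a character that occurs in the data\n",
        "sample_ix = sample(hprev, char_to_ix[seed_char], 500)\n",
        "print(seed_char + ''.join(ix_to_char[ix] for ix in sample_ix))"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}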