{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Installing packages:\n",
      "\t.package(path: \"/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_01_matmul\")\n",
      "\t\tFastaiNotebook_01_matmul\n",
      "With SwiftPM flags: []\n",
      "Working in: /tmp/tmpp03bd5c9\n",
      "Fetching https://github.com/mxcl/Path.swift\n",
      "Fetching https://github.com/JustHTTP/Just\n",
      "Completed resolution in 1.40s\n",
      "Cloning https://github.com/mxcl/Path.swift\n",
      "Resolving https://github.com/mxcl/Path.swift at 0.16.2\n",
      "Cloning https://github.com/JustHTTP/Just\n",
      "Resolving https://github.com/JustHTTP/Just at 0.7.1\n",
      "Compile Swift Module 'Just' (1 sources)\n",
      "Compile Swift Module 'Path' (9 sources)\n",
      "Compile Swift Module 'FastaiNotebook_01_matmul' (2 sources)\n",
      "Compile Swift Module 'jupyterInstalledPackages' (1 sources)\n",
      "Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so\n",
      "Initializing Swift...\n",
      "Loading library...\n",
      "Installation complete!\n"
     ]
    }
   ],
   "source": [
    "%install '.package(path: \"$cwd/FastaiNotebook_01_matmul\")' FastaiNotebook_01_matmul"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import FastaiNotebook_01_matmul"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "import Path\n",
    "import TensorFlow"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The forward and backward passes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "public typealias TF=Tensor<Float>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "public func normalize(_ x:TF, mean:TF, std:TF) -> TF {\n",
    "    return (x-mean)/std\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Normalize the training and validation sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let trainMean = xTrain.mean()\n",
    "let trainStd  = xTrain.standardDeviation()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xTrain = normalize(xTrain, mean: trainMean, std: trainStd)\n",
    "xValid = normalize(xValid, mean: trainMean, std: trainStd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "//export\n",
    "public func testNearZero(_ a:TF, tolerance:Float=1e-3) {\n",
    "    assert(abs(a)<tolerance, \"Near zero: \\(a)\")\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "testNearZero(xTrain.mean())\n",
    "testNearZero(xTrain.standardDeviation() - 1.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "60000 784 10\r\n"
     ]
    }
   ],
   "source": [
    "let (n,m) = (xTrain.shape[0],xTrain.shape[1])\n",
    "let c = yTrain.max()+1\n",
    "print(n,m,c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Foundations version"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Basic architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "//num hidden\n",
    "let nh = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// simplified kaiming init / he init\n",
    "let w1:TF = TF(randomNormal: [m,nh]) / sqrt(Float(m))\n",
    "let b1:TF = TF(repeating: 0.0, shape: [nh])\n",
    "let w2:TF = TF(randomNormal: [nh,1]) / sqrt(Float(nh))\n",
    "let b2:TF = TF(repeating: 0.0, shape: [1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "testNearZero(w1.mean())\n",
    "testNearZero(w1.standardDeviation()-1/sqrt(Float(m)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "▿ 2 elements\n",
       "  - .0 : 0.006017743\n",
       "  - .1 : 1.0076997\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// This should be ~ (0,1) (mean,std)...\n",
    "(xValid.mean(),xValid.standardDeviation())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func lin(_ x:TF, _ w:TF, _ b:TF) -> TF {return matmul(x, w) + b}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let t = lin(xValid, w1, b1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "▿ 2 elements\n",
       "  - .0 : -0.08092515\n",
       "  - .1 : 0.9930747\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "//...so should this, because we used kaiming init, which is designed to do this\n",
    "(t.mean(),t.standardDeviation())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func myRelu(_ x:TF) -> TF {return max(x, 0)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let t = myRelu(lin(xValid, w1, b1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "▿ 2 elements\n",
       "  - .0 : 0.35371977\n",
       "  - .1 : 0.54694015\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "//...actually it really should be this!\n",
    "(t.mean(),t.standardDeviation())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// kaiming init / he init for relu\n",
    "let w1:TF = TF(randomNormal: [m,nh]) * sqrt(2.0/Float(m))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "▿ 2 elements\n",
       "  - .0 : -0.0002575889\n",
       "  - .1 : 0.050299395\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(w1.mean(),w1.standardDeviation())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "▿ 2 elements\n",
       "  - .0 : 0.5851609\n",
       "  - .1 : 0.8190965\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "let t = myRelu(lin(xValid, w1, b1))\n",
    "(t.mean(),t.standardDeviation())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func model(_ xb: TF) -> TF{\n",
    "    let l1 = lin(xb, w1, b1)\n",
    "    let l2 = myRelu(l1)\n",
    "    let l3 = lin(l2, w2, b2)\n",
    "    return l3\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9236246999999999 ms\r\n"
     ]
    }
   ],
   "source": [
    "time(repeating: 10) {let _ = model(xValid)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loss function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let preds = model(xTrain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "public func mse(_ out:TF, _ targ:TF) -> TF {\n",
    "    return (out.squeezingShape(at: -1) - targ).squared().mean()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "var yTrainF = TF(yTrain)\n",
    "var yValidF = TF(yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "35.50708\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mse(preds, yTrainF)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Gradients and backward pass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To store the gradients a bit like in PyTorch we introduce a Tensor with grad class that has two attributes: the original tensor and the gradient. We choose a class to easily replicate the python notebook: classes are reference types (which means they are mutable) while structures are value types."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TensorWithGrad {\n",
    "    var inner: TF\n",
    "    var grad:  TF\n",
    "    \n",
    "    init(_ x: TF) {\n",
    "        inner = x\n",
    "        grad = TF(repeating: 0.0, shape:x.shape)\n",
    "    } \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func lin(_ x:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad) -> TensorWithGrad {\n",
    "    return TensorWithGrad(matmul(x.inner, w.inner) + b.inner)\n",
    "}\n",
    "func myRelu(_ x:TensorWithGrad) -> TensorWithGrad {return TensorWithGrad(max(x.inner, 0))}\n",
    "func mse(_ inp: TensorWithGrad, _ targ : TF) -> TF{\n",
    "    //grad of loss with respect to output of previous layer\n",
    "    return (inp.inner.squeezingShape(at: -1) - targ).squared().mean()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func mseGrad(_ inp: TensorWithGrad, _ targ : TF){\n",
    "    //grad of loss with respect to output of previous layer\n",
    "    inp.grad = 2.0 * (inp.inner.squeezingShape(at: -1) - targ).expandingShape(at: -1) / Float(inp.inner.shape[0])\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func reluGrad(_ inp: TensorWithGrad, _ out:TensorWithGrad){\n",
    "    //grad of relu with respect to input activations\n",
    "    inp.grad = (inp.inner .> 0).selecting(out.grad, TF(repeating:0.0, shape:inp.inner.shape))\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is our python version (we've renamed the python `g` to `grad` for consistency):\n",
    "\n",
    "```python\n",
    "def lin_grad(inp, out, w, b):\n",
    "    inp.grad = out.grad @ w.t()\n",
    "    w.grad = (inp.unsqueeze(-1) * out.grad.unsqueeze(1)).sum(0)\n",
    "    b.grad = out.grad.sum(0)\n",
    "```\n",
    "\n",
    "In Swift `@` is spelled `•`, which is <kbd>option</kbd>-<kbd>8</kbd> on Mac or <kbd>compose</kbd>-<kbd>.</kbd>-<kbd>=</kbd> elsewhere. Or just use the `matmul()` function we've seen already."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func linGrad(_ inp:TensorWithGrad, _ out:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad){\n",
    "    //grad of relu with respect to input activations\n",
    "    inp.grad = out.grad • w.inner.transposed()\n",
    "    w.grad = inp.inner.transposed() • out.grad\n",
    "    b.grad = out.grad.sum(squeezingAxes: 0)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let w1a = TensorWithGrad(w1)\n",
    "let b1a = TensorWithGrad(b1)\n",
    "let w2a = TensorWithGrad(w2)\n",
    "let b2a = TensorWithGrad(b2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func forwardAndBackward(_ inp:TensorWithGrad, _ targ:TF){\n",
    "    //forward pass:\n",
    "    let l1 = lin(inp, w1a, b1a)\n",
    "    let l2 = myRelu(l1)\n",
    "    let out = lin(l2, w2a, b2a)\n",
    "    //we don't actually need the loss in backward!\n",
    "    let loss = mse(out, targ)\n",
    "    \n",
    "    //backward pass:\n",
    "    mseGrad(out, targ)\n",
    "    linGrad(l2, out, w2a, b2a)\n",
    "    reluGrad(l1, l2)\n",
    "    linGrad(inp, l1, w1a, b1a)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let inp = TensorWithGrad(xTrain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forwardAndBackward(inp, yTrainF)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's compare to swift autodiff now. We have to mark the function as @differentiable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@differentiable\n",
    "func forward(_ inp:TF, _ targ:TF, w1:TF, b1:TF, \n",
    "            w2:TF, b2:TF) -> TF{\n",
    "    let l1 = matmul(inp, w1) + b1\n",
    "    let l2 = relu(l1)\n",
    "    let l3 = matmul(l2, w2) + b2\n",
    "    return (l3.squeezingShape(at: -1) - targ).squared().mean()\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then we can ask for the gradients of anything like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let xGrad = gradient(at: xTrain) {xTrain in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}\n",
    "let w1Grad = gradient(at: w1) {w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}\n",
    "let b1Grad = gradient(at: b1) {b1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}\n",
    "let w2Grad = gradient(at: w2) {w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}\n",
    "let b2Grad = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note the big difference with PyTorch: in PyTorch the tensors remember how they were created when you have `requires_grad=True` so that when you arrive at a final number a call the `backward` pass, they can compute their respective gradients.\n",
    "\n",
    "In swift for TensorFlow, the Tensor don't store anything, so you have to specify the whole function you want executed when computing the gradients."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "testNearZero(xGrad - inp.grad)\n",
    "testNearZero(w1Grad - w1a.grad)\n",
    "testNearZero(b1Grad - b1a.grad)\n",
    "testNearZero(w2Grad - w2a.grad)\n",
    "testNearZero(b2Grad - b2a.grad)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "23.6698357 ms\r\n"
     ]
    }
   ],
   "source": [
    "time(repeating: 10) { forwardAndBackward(inp, yTrainF) }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It's a bit inefficient to have to ask for the gradients of every parameter in a different function call. The swifty way of doing this is to regroup all our parameters in a structure (which will be our model later on). As long as they all conform to the protocol `Differentiable`, we can make this structure conform to `Differentiable` without having to implement anything and it will just work."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "public struct myParams: Differentiable {\n",
    "    public var x, w1, b1, w2, b2: TF\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let allParams = myParams(x: xTrain, w1: w1, b1: b1, w2: w2, b2: b2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let grads = gradient(at: allParams) { \n",
    "     allParams in forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "testNearZero(xGrad  - grads.x)\n",
    "testNearZero(w1Grad - grads.w1)\n",
    "testNearZero(b1Grad - grads.b1)\n",
    "testNearZero(w2Grad - grads.w2)\n",
    "testNearZero(b2Grad - grads.b2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you wanted the value for your loss as well as the gradients, you just have to use `valueWithGradient`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let (loss,grads) = valueWithGradient(at: allParams) { \n",
    "    allParams in forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "testNearZero(xGrad  - grads.x)\n",
    "testNearZero(w1Grad - grads.w1)\n",
    "testNearZero(b1Grad - grads.b1)\n",
    "testNearZero(w2Grad - grads.w2)\n",
    "testNearZero(b2Grad - grads.b2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In terms of timing our implementaiton gives:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24.5198722 ms\r\n"
     ]
    }
   ],
   "source": [
    "time(repeating: 10) { forwardAndBackward(inp, yTrainF) }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24.631195299999998 ms\r\n"
     ]
    }
   ],
   "source": [
    "time(repeating: 10) { let _ = valueWithGradient(at: allParams) { \n",
    "    allParams in forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "notebookToScript(fname: (Path.cwd / \"02_fully_connected.ipynb\").string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Swift",
   "language": "swift",
   "name": "swift"
  },
  "language_info": {
   "file_extension": ".swift",
   "mimetype": "text/x-swift",
   "name": "swift",
   "version": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}