{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sequential Layer\n", "\n", "This notebook walks through how to build a sequential layer type, allowing you to chain an arbitrary number of layers of the same type together." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Installing packages:\n", "\t.package(path: \"/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_00_load_data\")\n", "\t\tFastaiNotebook_00_load_data\n", "With SwiftPM flags: []\n", "Working in: /tmp/tmp2npmmpfo\n", "Fetching https://github.com/mxcl/Path.swift\n", "Fetching https://github.com/JustHTTP/Just\n", "Completed resolution in 1.21s\n", "Cloning https://github.com/JustHTTP/Just\n", "Resolving https://github.com/JustHTTP/Just at 0.7.1\n", "Cloning https://github.com/mxcl/Path.swift\n", "Resolving https://github.com/mxcl/Path.swift at 0.16.2\n", "Compile Swift Module 'Path' (9 sources)\n", "Compile Swift Module 'Just' (1 sources)\n", "Compile Swift Module 'FastaiNotebook_00_load_data' (1 sources)\n", "Compile Swift Module 'jupyterInstalledPackages' (1 sources)\n", "Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so\n", "Initializing Swift...\n", "Loading library...\n", "Installation complete!\n" ] } ], "source": [ "%install '.package(path: \"$cwd/FastaiNotebook_00_load_data\")' FastaiNotebook_00_load_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import FastaiNotebook_00_load_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "\n", "/// Define a new Differentiable data type that will be the AllDifferentiableVariables, Cotangent-, and Tangent vectors\n", "/// for our sequential layer type.\n", "public struct DiffList: KeyPathIterable {\n", " public var u: [U] = []\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "extension DiffList: Equatable {\n", " public static func == (lhs: DiffList, rhs: DiffList) -> Bool {\n", " if lhs.u.count != rhs.u.count { return false }\n", " for i in 0.. DiffList {\n", " precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n", " \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n", " if lhs.u.count == 0 { return rhs }\n", " if rhs.u.count == 0 { return lhs }\n", " var output: [U] = []\n", " for i in 0.. 
    "\n",
    "    @differentiable(vjp: _vjpAdd(lhs:rhs:))\n",
    "    public static func + (lhs: DiffList, rhs: DiffList) -> DiffList {\n",
    "        precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n",
    "                     \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n",
    "        if lhs.u.count == 0 { return rhs }\n",
    "        if rhs.u.count == 0 { return lhs }\n",
    "        var output: [U] = []\n",
    "        for i in 0..<lhs.u.count { output.append(lhs.u[i] + rhs.u[i]) }\n",
    "        return DiffList(u: output)\n",
    "    }\n",
    "\n",
    "    public static func _vjpAdd(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {\n",
    "        return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n",
    "            precondition(v.u.count == lhsCount || lhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n",
    "            precondition(v.u.count == rhsCount || rhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n",
    "            var lhsOutput: [U] = []\n",
    "            var rhsOutput: [U] = []\n",
    "            // Unbroadcasting\n",
    "            if lhsCount != 0 { lhsOutput = v.u }\n",
    "            if rhsCount != 0 { rhsOutput = v.u }\n",
    "            return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n",
    "        })\n",
    "    }\n",
    "\n",
    "    @differentiable(vjp: _vjpSubtract(lhs:rhs:))\n",
    "    public static func - (lhs: DiffList, rhs: DiffList) -> DiffList {\n",
    "        precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n",
    "                     \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n",
    "        if lhs.u.count == 0 { return DiffList(u: rhs.u.map({ U.zero - $0 })) }  // 0 - rhs\n",
    "        if rhs.u.count == 0 { return lhs }\n",
    "        var output: [U] = []\n",
    "        for i in 0..<lhs.u.count { output.append(lhs.u[i] - rhs.u[i]) }\n",
    "        return DiffList(u: output)\n",
    "    }\n",
    "\n",
    "    public static func _vjpSubtract(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {\n",
    "        return (lhs - rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n",
    "            precondition(v.u.count == lhsCount || lhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n",
    "            precondition(v.u.count == rhsCount || rhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n",
    "\n",
    "            var lhsOutput: [U] = []\n",
    "            var rhsOutput: [U] = []\n",
    "            // Unbroadcasting\n",
    "            if lhsCount != 0 { lhsOutput = v.u }\n",
    "            if rhsCount != 0 { rhsOutput = v.u.map({ U.zero - $0 }) }\n",
    "            return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n",
    "        })\n",
    "    }\n",
    "}\n",
    "\n",
    "extension DiffList: VectorNumeric {\n",
    "    public typealias Scalar = U.Scalar\n",
    "    \n",
    "    public static func * (lhs: Scalar, rhs: DiffList) -> DiffList {\n",
    "        return DiffList(u: rhs.u.map( { $0 * lhs } ))\n",
    "    }\n",
    "}\n",
    "\n",
    "extension DiffList: Differentiable {\n",
    "    public typealias TangentVector = DiffList\n",
    "    public typealias CotangentVector = DiffList\n",
    "    public typealias AllDifferentiableVariables = DiffList\n",
    "\n",
    "    public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n",
    "        return cotangent\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "import TensorFlow // Defines Layer."
   ]
  },
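  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick, illustrative check of the `DiffList` arithmetic above. This cell is not exported; it assumes that `Tensor<Float>` satisfies `DiffList`'s generic constraints and that tensors can be built from array literals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let ones: Tensor<Float> = [1, 1]\n",
    "let twos: Tensor<Float> = [2, 2]\n",
    "let a = DiffList<Tensor<Float>>(u: [ones, twos])\n",
    "print((a + a).u)          // elementwise: [[2, 2], [4, 4]]\n",
    "print((a + DiffList()).u) // the empty `zero` broadcasts: `a` unchanged\n",
    "print((3 * a).u)          // scalar multiplication from VectorNumeric"
   ]
  },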
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "\n",
    "/// A struct that contains a number of layers within it.\n",
    "public struct SequentialLayer<U: Layer>: KeyPathIterable where\n",
    "    U.Input == U.Output,\n",
    "    U.AllDifferentiableVariables: VectorNumeric,\n",
    "    U.AllDifferentiableVariables == U.CotangentVector {\n",
    "\n",
    "    public var layers: [U]\n",
    "\n",
    "    public init(layers: [U]) {\n",
    "        self.layers = layers\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "// Not strictly necessary, but nice to have.\n",
    "extension SequentialLayer: Equatable where U: Equatable {\n",
    "    public static func ==(lhs: SequentialLayer, rhs: SequentialLayer) -> Bool {\n",
    "        return lhs.layers == rhs.layers\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "// Mark SequentialLayer as Differentiable\n",
    "extension SequentialLayer: Differentiable {\n",
    "    public typealias AllDifferentiableVariables = DiffList<U.AllDifferentiableVariables>\n",
    "    public typealias TangentVector = DiffList<U.TangentVector>\n",
    "    public typealias CotangentVector = DiffList<U.CotangentVector>\n",
    "\n",
    "    public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n",
    "        precondition(cotangent.u.count == layers.count, \"Differing # of layers: \\(cotangent.u.count) and \\(layers.count)\")\n",
    "        return DiffList(u: zip(layers, cotangent.u).map({ $0.0.tangentVector(from: $0.1) }))\n",
    "    }\n",
    "\n",
    "    public func moved(along direction: TangentVector) -> SequentialLayer {\n",
    "        precondition(direction.u.count == layers.count, \"Differing # of layers: \\(direction.u.count) and \\(layers.count)\")\n",
    "        return SequentialLayer(layers: zip(layers, direction.u).map({ $0.0.moved(along: $0.1) }))\n",
    "    }\n",
    "\n",
    "    public var allDifferentiableVariables: AllDifferentiableVariables {\n",
    "        get { return DiffList(u: layers.map({ $0.allDifferentiableVariables })) }\n",
    "        set {\n",
    "            precondition(newValue.u.count == layers.count, \"Differing # of layers: \\(newValue.u.count) and \\(layers.count)\")\n",
    "            for i in 0..<layers.count {\n",
    "                layers[i].allDifferentiableVariables = newValue.u[i]\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "}\n"
   ]
  },
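  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick, illustrative check (not exported): build a two-layer `SequentialLayer` of `Dense<Float>` layers, the same pairing `MyModel` uses below, and confirm that its differentiable variables form one `DiffList` entry per layer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let tiny = SequentialLayer(layers: [Dense<Float>(inputSize: 4, outputSize: 4),\n",
    "                                    Dense<Float>(inputSize: 4, outputSize: 4)])\n",
    "print(tiny.allDifferentiableVariables.u.count) // 2: one entry per layer"
   ]
  },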
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "extension SequentialLayer: Layer {\n",
    "    public typealias Input = U.Input\n",
    "    public typealias Output = U.Output\n",
    "\n",
    "    @differentiable(vjp: _appliedDifferentiating(to:in:))\n",
    "    public func applied(to input: Input, in context: Context) -> Output {\n",
    "        var tmp = input\n",
    "        for layer in layers { tmp = layer.applied(to: tmp, in: context) }\n",
    "        return tmp\n",
    "    }\n",
    "    \n",
    "    public func _appliedDifferentiating(to input: Input, in context: Context) -> (\n",
    "        Output, (Output.CotangentVector) -> (CotangentVector, Input.CotangentVector)) {\n",
    "        \n",
    "        var pullbacks: [(U.Output.CotangentVector) -> (U.AllDifferentiableVariables, U.Input.CotangentVector)] = []\n",
    "        var tmp = input\n",
    "        for layer in layers {\n",
    "            let (output, pullback) = Swift.valueWithPullback(at: layer, tmp) { layer, input in\n",
    "                return layer.applied(to: input, in: context)\n",
    "            }\n",
    "            tmp = output\n",
    "            pullbacks.append(pullback)\n",
    "        }\n",
    "        \n",
    "        return (tmp, { input in\n",
    "            var allDiffVars: [U.AllDifferentiableVariables] = []\n",
    "            var tmp = input\n",
    "            \n",
    "            for pb in pullbacks.reversed() {\n",
    "                let (diffVars, input) = pb(tmp)\n",
    "                tmp = input\n",
    "                allDiffVars.append(diffVars)\n",
    "            }\n",
    "            \n",
    "            return (DiffList(u: allDiffVars.reversed()), tmp)\n",
    "        })\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct MyModel: Layer {\n",
    "    var layers: SequentialLayer<Dense<Float>>\n",
    "    \n",
    "    init(inputSize: Int, hiddenUnits: [Int], outputSize: Int) {\n",
    "        // Make the dense layers.\n",
    "        \n",
    "        // TODO(saeta): Clean up this code.\n",
    "        var input = inputSize\n",
    "        var output = outputSize\n",
    "        if hiddenUnits.count > 0 { output = hiddenUnits[0] }\n",
    "        var layers: [Dense<Float>] = []\n",
    "        for i in 0..<hiddenUnits.count {\n",
    "            output = hiddenUnits[i]\n",
    "            print(\"Making Dense<Float>(inputSize: \\(input), outputSize: \\(output))\")\n",
    "            layers.append(Dense<Float>(inputSize: input, outputSize: output))\n",
    "            input = output\n",
    "        }\n",
    "        print(\"Making Dense<Float>(inputSize: \\(output), outputSize: \\(outputSize))\")\n",
    "        layers.append(Dense<Float>(inputSize: output, outputSize: outputSize))\n",
    "        \n",
    "        self.layers = SequentialLayer(layers: layers)\n",
    "    }\n",
    "    \n",
    "    @differentiable\n",
    "    func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {\n",
    "        return layers.applied(to: input, in: context)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct FixedModel: Layer {\n",
    "    var d1 = Dense<Float>(inputSize: 784, outputSize: 30)\n",
    "    var d2 = Dense<Float>(inputSize: 30, outputSize: 10)\n",
    "    \n",
    "    @differentiable\n",
    "    func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {\n",
    "        return input.sequenced(in: context, through: d1, d2)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz...\n",
      "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz...\n",
      "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz...\n",
      "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz...\n"
     ]
    }
   ],
   "source": [
    "var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {\n",
    "    let m = x.max(alongAxes: -1)\n",
    "    return m + log(exp(x - m).sum(alongAxes: -1))\n",
    "}\n",
    "func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {\n",
    "    return activations - logSumExp(activations)\n",
    "}"
   ]
  },
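  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick numerical sanity check of `logSoftmax` (illustrative; it assumes `Tensor` is expressible by array literals): each row of `exp(logSoftmax(x))` should sum to one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let logits: Tensor<Float> = [[1, 2, 3], [0, 0, 0]]\n",
    "print(exp(logSoftmax(logits)).sum(alongAxes: -1)) // each row sums to ~1"
   ]
  },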
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let trainingContext = Context(learningPhase: .training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let lr: Float = 0.5        // learning rate\n",
    "let epochs = 1             // how many epochs to train for\n",
    "let bs: Int32 = 64         // batch size\n",
    "let (n, m) = (60000, 784)  // MNIST dataset size\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "var modelFixed = FixedModel()\n",
    "let modelFixedStart = modelFixed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Making Dense<Float>(inputSize: 784, outputSize: 30)\n",
      "Making Dense<Float>(inputSize: 30, outputSize: 10)\n"
     ]
    }
   ],
   "source": [
    "var modelFlex = MyModel(inputSize: 784, hiddenUnits: [30], outputSize: 10)\n",
    "let modelFlexStart = modelFlex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "public func accuracy(_ output: Tensor<Float>, _ target: Tensor<Int32>) -> Tensor<Float> {\n",
    "    let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)\n",
    "    return corrects.mean()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let inferenceContext = Context(learningPhase: .inference)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.193\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1426\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The training loops below are copied from 03_minibatch. As written, they don't actually train either model: each step updates a local copy of `allDifferentiableVariables` that is never written back to the model, so the weights never change. :-("
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        var parameters = modelFixed.allDifferentiableVariables\n",
    "        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){ \n",
    "            parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}"
   ]
  },
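  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Why nothing moves: `allDifferentiableVariables` has value semantics, so `parameters` above is a copy, and the subtraction mutates only that copy. A corrected step would write the copy back. Below is a sketch (shown on `FixedModel` for illustration only; it isn't run against the models here):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Illustrative only: an SGD step with the missing write-back.\n",
    "func sgdStep(_ model: inout FixedModel, grads: FixedModel.CotangentVector, lr: Float) {\n",
    "    var parameters = model.allDifferentiableVariables\n",
    "    for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {\n",
    "        parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n",
    "    }\n",
    "    model.allDifferentiableVariables = parameters  // write the update back\n",
    "}"
   ]
  },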
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        var parameters = modelFlex.allDifferentiableVariables\n",
    "        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){ \n",
    "            parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.193\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1426\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let optimizerFixed = SGD<FixedModel, Float>(learningRate: lr)\n",
    "// let optimizerFlex = SGD<MyModel, Float>(learningRate: lr) // SGD doesn't work for the new flex style models, due to the interaction between how .zero is defined, and keypathing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "/// Stochastic gradient descent (SGD) optimizer.\n",
    "///\n",
    "/// A minimal optimizer that implements plain stochastic gradient descent, without momentum,\n",
    "/// learning rate decay, or Nesterov momentum.\n",
    "public class SimpleSGD<Model: Layer, Scalar: TensorFlowFloatingPoint>: Optimizer\n",
    "    where Model.AllDifferentiableVariables == Model.CotangentVector {\n",
    "    /// The learning rate.\n",
    "    public var learningRate: Scalar\n",
    "\n",
    "    public init(\n",
    "        learningRate: Scalar = 0.01\n",
    "    ) {\n",
    "        precondition(learningRate >= 0, \"Learning rate must be non-negative\")\n",
    "\n",
    "        self.learningRate = learningRate\n",
    "    }\n",
    "\n",
    "    public func update(_ model: inout Model.AllDifferentiableVariables,\n",
    "                       along direction: Model.CotangentVector) {\n",
    "        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Scalar>.self) {\n",
    "            // Descend along the gradient: theta <- theta - lr * dtheta.\n",
    "            model[keyPath: kp] -= learningRate * direction[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let simpleOptFlex = SimpleSGD<MyModel, Float>(learningRate: lr)"
   ]
  },
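  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An illustrative look at why the stock `SGD` can't drive the flex-style model: optimizer state is seeded from `.zero`, and `DiffList`'s `zero` is the *empty* list, so recursive key-path iteration over that state finds no tensors to update."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let z = SequentialLayer<Dense<Float>>.AllDifferentiableVariables.zero\n",
    "print(z.u.count)  // 0: an empty DiffList...\n",
    "print(z.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self).count)  // ...exposes no writable tensor key paths"
   ]
  },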
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        optimizerFixed.update(&modelFixed.allDifferentiableVariables, along: grads)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.098\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "//         optimizerFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n",
    "        simpleOptFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.098\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Swift",
   "language": "swift",
   "name": "swift"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}