{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sequential Layer\n",
    "\n",
    "This notebook walks through how to build a sequential layer type, allowing you to chain an arbitrary number of layers of the same type together."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%install '.package(path: \"$cwd/FastaiNotebook_00_load_data\")' FastaiNotebook_00_load_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import FastaiNotebook_00_load_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "\n",
    "/// Define a new Differentiable data type that will be the AllDifferentiableVariables, Cotangent-, and Tangent vectors\n",
    "/// for our sequential layer type.\n",
    "public struct DiffList<U: Differentiable & AdditiveArithmetic & Equatable & VectorNumeric>: KeyPathIterable {\n",
    "    public var u: [U] = []\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "extension DiffList: Equatable {\n",
    "    public static func == (lhs: DiffList, rhs: DiffList) -> Bool {\n",
    "       if lhs.u.count != rhs.u.count { return false }\n",
    "       for i in 0..<lhs.u.count {\n",
    "          if lhs.u[i] != rhs.u[i] { return false }\n",
    "       }\n",
    "       return true\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "\n",
    "extension DiffList: AdditiveArithmetic {\n",
    "    public static var zero: DiffList {\n",
    "        get { return DiffList() }\n",
    "    }\n",
    "    \n",
    "    @differentiable(vjp: _vjpAdd(lhs:rhs:))\n",
    "    public static func + (lhs: DiffList, rhs: DiffList) -> DiffList {\n",
    "        precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n",
    "                     \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n",
    "        if lhs.u.count == 0 { return rhs }\n",
    "        if rhs.u.count == 0 { return lhs }\n",
    "        var output: [U] = []\n",
    "        for i in 0..<lhs.u.count { output.append(lhs.u[i] + rhs.u[i]) }\n",
    "        return DiffList(u: output)\n",
    "    }\n",
    "    \n",
    "    public static func _vjpAdd(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {\n",
    "        return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n",
    "            precondition(v.u.count == lhsCount || lhsCount == 0, \n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n",
    "            precondition(v.u.count == rhsCount || rhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n",
    "            var lhsOutput: [U]  = []\n",
    "            var rhsOutput: [U]  = []\n",
    "            // Unbroadcasting\n",
    "            if lhsCount != 0 { lhsOutput = v.u }\n",
    "            if rhsCount != 0 { rhsOutput = v.u }\n",
    "            return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n",
    "        })\n",
    "    }\n",
    "\n",
    "    @differentiable(vjp: _vjpSubtract(lhs:rhs:))\n",
    "    public static func - (lhs: DiffList, rhs: DiffList) -> DiffList {\n",
    "        precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n",
    "                     \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n",
    "        if lhs.u.count == 0 { return rhs }\n",
    "        if rhs.u.count == 0 { return lhs }\n",
    "        var output: [U] = []\n",
    "        for i in 0..<lhs.u.count { output.append(lhs.u[i] + rhs.u[i]) }\n",
    "        return DiffList(u: output)\n",
    "    }\n",
    "\n",
    "    public static func _vjpSubtract(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {\n",
    "        return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n",
    "            precondition(v.u.count == lhsCount || lhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n",
    "            precondition(v.u.count == rhsCount || rhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n",
    "\n",
    "            var lhsOutput: [U]  = []\n",
    "            var rhsOutput: [U]  = []\n",
    "            // Unbroadcasting\n",
    "            if lhsCount != 0 { lhsOutput = v.u }\n",
    "            if rhsCount != 0 { rhsOutput = v.u.map({ U.zero - $0 }) }\n",
    "            return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n",
    "        })\n",
    "    }\n",
    "}\n",
    "\n",
    "extension DiffList: VectorNumeric {\n",
    "    public typealias Scalar = U.Scalar\n",
    "    \n",
    "    public static func * (lhs: Scalar, rhs: DiffList) -> DiffList {\n",
    "        return DiffList(u: rhs.u.map( { $0 * lhs } ))\n",
    "    }\n",
    "}\n",
    "\n",
    "extension DiffList: Differentiable {\n",
    "    public typealias TangentVector = DiffList\n",
    "    public typealias CotangentVector = DiffList\n",
    "    public typealias AllDifferentiableVariables = DiffList\n",
    "\n",
    "    public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n",
    "        return cotangent\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "import TensorFlow  // Defines Layer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "\n",
    "/// A struct that contains a number of layers within it.\n",
    "public struct SequentialLayer<U: Layer>: KeyPathIterable where \n",
    "    U.Input == U.Output,\n",
    "    U.AllDifferentiableVariables: VectorNumeric,\n",
    "    U.AllDifferentiableVariables == U.CotangentVector {\n",
    "\n",
    "    public var layers: [U]\n",
    "\n",
    "    public init(layers: [U]) {\n",
    "        self.layers = layers\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "// Mark SequentialLayer as Differentiable\n",
    "extension SequentialLayer: Differentiable {\n",
    "    public typealias AllDifferentiableVariables = DiffList<U.AllDifferentiableVariables>\n",
    "    public typealias TangentVector = DiffList<U.TangentVector>\n",
    "    public typealias CotangentVector = DiffList<U.CotangentVector>\n",
    "\n",
    "    public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n",
    "        precondition(cotangent.u.count == layers.count, \"Differing # of layers: \\(cotangent.u.count) and \\(layers.count)\")\n",
    "        return DiffList(u: zip(layers, cotangent.u).map({ $0.0.tangentVector(from: $0.1) }))\n",
    "    }\n",
    "\n",
    "    public func moved(along direction: TangentVector) -> SequentialLayer {\n",
    "        precondition(direction.u.count == layers.count, \"Differing # of layers: \\(direction.u.count) and \\(layers.count)\")\n",
    "        return SequentialLayer(layers: zip(layers, direction.u).map({ $0.0.moved(along: $0.1) }))\n",
    "    }\n",
    "\n",
    "    public var allDifferentiableVariables: AllDifferentiableVariables {\n",
    "        get { return DiffList(u: layers.map({ $0.allDifferentiableVariables })) }\n",
    "        set {\n",
    "            precondition(newValue.u.count == layers.count, \"Differing # of layers: \\(newValue.u.count) and \\(layers.count)\")\n",
    "            for i in 0..<layers.count { layers[i].allDifferentiableVariables = newValue.u[i] }\n",
    "        }\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "\n",
    "extension SequentialLayer: Layer {\n",
    "\n",
    "    public typealias Input = U.Input\n",
    "    public typealias Output = U.Output\n",
    "  \n",
    "    @differentiable(vjp: _appliedDifferentiating(to:))\n",
    "    public func applied(to input: Input) -> Output {\n",
    "        var tmp = input\n",
    "        for layer in layers { tmp = layer.applied(to: tmp) }\n",
    "        return tmp\n",
    "    }\n",
    "   \n",
    "    public func _appliedDifferentiating(to input: Input) -> (\n",
    "        Output, (Output.CotangentVector) -> (CotangentVector, Input.CotangentVector)) {\n",
    "        \n",
    "        var pullbacks: [(U.Output.CotangentVector) -> (U.AllDifferentiableVariables, U.Input.CotangentVector)] = []\n",
    "        var tmp = input\n",
    "        for layer in layers {\n",
    "            let (output, pullback) = Swift.valueWithPullback(at: layer, tmp) { layer, input in\n",
    "                return layer.applied(to: input)\n",
    "            }\n",
    "            tmp = output\n",
    "            pullbacks.append(pullback)\n",
    "        }\n",
    "        \n",
    "        return (tmp, { input in\n",
    "            var allDiffVars: [U.AllDifferentiableVariables] = []\n",
    "            var tmp = input\n",
    "                    \n",
    "            for pb in pullbacks.reversed() {\n",
    "                let (diffVars, input) = pb(tmp)\n",
    "                tmp = input\n",
    "                allDiffVars.append(diffVars)\n",
    "            }\n",
    "                      \n",
    "            return (DiffList(u: allDiffVars.reversed()), tmp)\n",
    "        })\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct MyModel: Layer {\n",
    "    var layers: SequentialLayer<Dense<Float>>\n",
    "    \n",
    "    init(inputSize: Int, hiddenUnits: [Int], outputSize: Int) {\n",
    "        // Make the dense layers.\n",
    "        \n",
    "        // TODO(saeta): Clean up this code.\n",
    "        var input = inputSize\n",
    "        var output = outputSize\n",
    "        if hiddenUnits.count > 0 { output = hiddenUnits[0] }\n",
    "        var layers: [Dense<Float>] = []\n",
    "        for i in 0..<hiddenUnits.count {\n",
    "            output = hiddenUnits[i]\n",
    "            print(\"Making Dense<Float>(inputSize: \\(input), outputSize: \\(output))\")\n",
    "            layers.append(Dense<Float>(inputSize: input, outputSize: output))\n",
    "            input = output\n",
    "        }\n",
    "        print(\"Making Dense<Float>(inputSize: \\(output), outputSize: \\(outputSize))\")\n",
    "        layers.append(Dense<Float>(inputSize: output, outputSize: outputSize))\n",
    "        \n",
    "        self.layers = SequentialLayer(layers: layers)\n",
    "    }\n",
    "    \n",
    "    @differentiable\n",
    "    func applied(to input: Tensor<Float>) -> Tensor<Float> {\n",
    "        return layers.applied(to: input)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct FixedModel: Layer {\n",
    "    var d1 = Dense<Float>(inputSize: 784, outputSize: 30)\n",
    "    var d2 = Dense<Float>(inputSize: 30, outputSize: 10)\n",
    "    \n",
    "    @differentiable\n",
    "    func applied(to input: Tensor<Float>) -> Tensor<Float> {\n",
    "        return input.sequenced(through: d1, d2)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{\n",
    "    let m = x.max(alongAxes: -1)\n",
    "    return m + log(exp(x-m).sum(alongAxes: -1))\n",
    "}\n",
    "func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{\n",
    "    return activations - logSumExp(activations)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Context.local.learningPhase = .training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let lr:Float = 0.5   // learning rate\n",
    "let epochs = 1      // how many epochs to train for\n",
    "let bs=64                         // batch size\n",
    "let (n,m) = (60000,784)  // MNIST dataset size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "var modelFixed = FixedModel()\n",
    "let modelFixedStart = modelFixed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "var modelFlex = MyModel(inputSize: 784, hiddenUnits: [30], outputSize: 10)\n",
    "let modelFlexStart = modelFlex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "public func accuracy<Model: Layer>(_ model: Model, inputs: Tensor<Float>, target: Tensor<Int32>) -> Tensor<Float> where Model.Input == Tensor<Float>, Model.Output == Tensor<Float> {\n",
    "    return withLearningPhase(.inference) {\n",
    "        let output = model.applied(to: inputs)\n",
    "        let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)\n",
    "        return corrects.mean()\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(modelFlex, inputs: xValid, target: yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(modelFixed, inputs: xValid, target: yValid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The training loops below are copied from 03_minibatch. They don't appear to actually train either model. :-("
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/bs){\n",
    "        let startIdx = i * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        var parameters = modelFixed.allDifferentiableVariables\n",
    "        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){ \n",
    "            parameters[keyPath: kp] -= lr * grads[keyPath:kp]\n",
    "        }\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/bs){\n",
    "        let startIdx = i * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        var parameters = modelFlex.allDifferentiableVariables\n",
    "        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){ \n",
    "            parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(modelFlex, inputs: xValid, target: yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(modelFixed, inputs: xValid, target: yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let optimizerFixed = SGD<FixedModel, Float>(learningRate: lr)\n",
    "// let optimizerFlex = SGD<MyModel, Float>(learningRate: lr)  // SGD doesn't work for the new flex style models, due to the interaction between how .zero is defined, and keypathing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "/// Stochastic gradient descent (SGD) optimizer.\n",
    "///\n",
    "/// An optimizer that implements stochastic gradient descent, with support for momentum, learning\n",
    "/// rate decay, and Nesterov momentum.\n",
    "public class SimpleSGD<Model: Layer, Scalar: TensorFlowFloatingPoint>: Optimizer\n",
    "    where Model.AllDifferentiableVariables == Model.CotangentVector {\n",
    "    /// The learning rate.\n",
    "    public var learningRate: Scalar\n",
    "\n",
    "    public init(\n",
    "        learningRate: Scalar = 0.01\n",
    "    ) {\n",
    "        precondition(learningRate >= 0, \"Learning rate must be non-negative\")\n",
    "\n",
    "        self.learningRate = learningRate\n",
    "    }\n",
    "\n",
    "    public func update(_ model: inout Model.AllDifferentiableVariables,\n",
    "                       along direction: Model.CotangentVector) {\n",
    "        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Scalar>.self) {\n",
    "            model[keyPath: kp] -= -learningRate * direction[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let simpleOptFlex = SimpleSGD<MyModel, Float>(learningRate: lr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/bs){\n",
    "        let startIdx = i * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        optimizerFixed.update(&modelFixed.allDifferentiableVariables, along: grads)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(modelFixed, inputs: xValid, target: yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/bs){\n",
    "        let startIdx = i * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "//        optimizerFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n",
    "        simpleOptFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(modelFlex, inputs: xValid, target: yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Swift",
   "language": "swift",
   "name": "swift"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}