{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sequential Layer\n", "\n", "This notebook walks through how to build a sequential layer type, allowing you to chain an arbitrary number of layers of the same type together." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%install '.package(path: \"$cwd/FastaiNotebook_00_load_data\")' FastaiNotebook_00_load_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import FastaiNotebook_00_load_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "\n", "/// Define a new Differentiable data type that will be the AllDifferentiableVariables, Cotangent-, and Tangent vectors\n", "/// for our sequential layer type.\n", "public struct DiffList: KeyPathIterable {\n", " public var u: [U] = []\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "extension DiffList: Equatable {\n", " public static func == (lhs: DiffList, rhs: DiffList) -> Bool {\n", " if lhs.u.count != rhs.u.count { return false }\n", " for i in 0.. DiffList {\n", " precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n", " \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n", " if lhs.u.count == 0 { return rhs }\n", " if rhs.u.count == 0 { return lhs }\n", " var output: [U] = []\n", " for i in 0.. (DiffList, (DiffList) -> (DiffList, DiffList)) {\n", " return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n", " precondition(v.u.count == lhsCount || lhsCount == 0, \n", " \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n", " precondition(v.u.count == rhsCount || rhsCount == 0,\n", " \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n", " var lhsOutput: [U] = []\n", " var rhsOutput: [U] = []\n", " // Unbroadcasting\n", " if lhsCount != 0 { lhsOutput = v.u }\n", " if rhsCount != 0 { rhsOutput = v.u }\n", " return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n", " })\n", " }\n", "\n", " @differentiable(vjp: _vjpSubtract(lhs:rhs:))\n", " public static func - (lhs: DiffList, rhs: DiffList) -> DiffList {\n", " precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n", " \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n", " if lhs.u.count == 0 { return rhs }\n", " if rhs.u.count == 0 { return lhs }\n", " var output: [U] = []\n", " for i in 0.. (DiffList, (DiffList) -> (DiffList, DiffList)) {\n", " return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n", " precondition(v.u.count == lhsCount || lhsCount == 0,\n", " \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n", " precondition(v.u.count == rhsCount || rhsCount == 0,\n", " \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n", "\n", " var lhsOutput: [U] = []\n", " var rhsOutput: [U] = []\n", " // Unbroadcasting\n", " if lhsCount != 0 { lhsOutput = v.u }\n", " if rhsCount != 0 { rhsOutput = v.u.map({ U.zero - $0 }) }\n", " return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n", " })\n", " }\n", "}\n", "\n", "extension DiffList: VectorNumeric {\n", " public typealias Scalar = U.Scalar\n", " \n", " public static func * (lhs: Scalar, rhs: DiffList) -> DiffList {\n", " return DiffList(u: rhs.u.map( { $0 * lhs } ))\n", " }\n", "}\n", "\n", "extension DiffList: Differentiable {\n", " public typealias TangentVector = DiffList\n", " public typealias CotangentVector = DiffList\n", " public typealias AllDifferentiableVariables = DiffList\n", "\n", " public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n", " return cotangent\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "import TensorFlow // Defines Layer." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "\n", "/// A struct that contains a number of layers within it.\n", "public struct SequentialLayer: KeyPathIterable where \n", " U.Input == U.Output,\n", " U.AllDifferentiableVariables: VectorNumeric,\n", " U.AllDifferentiableVariables == U.CotangentVector {\n", "\n", " public var layers: [U]\n", "\n", " public init(layers: [U]) {\n", " self.layers = layers\n", " }\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "// Mark SequentialLayer as Differentiable\n", "extension SequentialLayer: Differentiable {\n", " public typealias AllDifferentiableVariables = DiffList\n", " public typealias TangentVector = DiffList\n", " public typealias CotangentVector = DiffList\n", "\n", " public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n", " precondition(cotangent.u.count == layers.count, \"Differing # of layers: \\(cotangent.u.count) and \\(layers.count)\")\n", " return DiffList(u: zip(layers, cotangent.u).map({ $0.0.tangentVector(from: $0.1) }))\n", " }\n", "\n", " public func moved(along direction: TangentVector) -> SequentialLayer {\n", " precondition(direction.u.count == layers.count, \"Differing # of layers: \\(direction.u.count) and \\(layers.count)\")\n", " return SequentialLayer(layers: zip(layers, direction.u).map({ $0.0.moved(along: $0.1) }))\n", " }\n", "\n", " public var allDifferentiableVariables: AllDifferentiableVariables {\n", " get { return DiffList(u: layers.map({ $0.allDifferentiableVariables })) }\n", " set {\n", " precondition(newValue.u.count == layers.count, \"Differing # of layers: \\(newValue.u.count) and \\(layers.count)\")\n", " for i in 0.. Output {\n", " var tmp = input\n", " for layer in layers { tmp = layer.applied(to: tmp) }\n", " return tmp\n", " }\n", " \n", " public func _appliedDifferentiating(to input: Input) -> (\n", " Output, (Output.CotangentVector) -> (CotangentVector, Input.CotangentVector)) {\n", " \n", " var pullbacks: [(U.Output.CotangentVector) -> (U.AllDifferentiableVariables, U.Input.CotangentVector)] = []\n", " var tmp = input\n", " for layer in layers {\n", " let (output, pullback) = Swift.valueWithPullback(at: layer, tmp) { layer, input in\n", " return layer.applied(to: input)\n", " }\n", " tmp = output\n", " pullbacks.append(pullback)\n", " }\n", " \n", " return (tmp, { input in\n", " var allDiffVars: [U.AllDifferentiableVariables] = []\n", " var tmp = input\n", " \n", " for pb in pullbacks.reversed() {\n", " let (diffVars, input) = pb(tmp)\n", " tmp = input\n", " allDiffVars.append(diffVars)\n", " }\n", " \n", " return (DiffList(u: allDiffVars.reversed()), tmp)\n", " })\n", " }\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "struct MyModel: Layer {\n", " var layers: SequentialLayer>\n", " \n", " init(inputSize: Int, hiddenUnits: [Int], outputSize: Int) {\n", " // Make the dense layers.\n", " \n", " // TODO(saeta): Clean up this code.\n", " var input = inputSize\n", " var output = outputSize\n", " if hiddenUnits.count > 0 { output = hiddenUnits[0] }\n", " var layers: [Dense] = []\n", " for i in 0..(inputSize: \\(input), outputSize: \\(output))\")\n", " layers.append(Dense(inputSize: input, outputSize: output))\n", " input = output\n", " }\n", " print(\"Making Dense(inputSize: \\(output), outputSize: \\(outputSize))\")\n", " layers.append(Dense(inputSize: output, outputSize: outputSize))\n", " \n", " self.layers = SequentialLayer(layers: layers)\n", " }\n", " \n", " @differentiable\n", " func applied(to input: Tensor) -> Tensor {\n", " return layers.applied(to: input)\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "struct FixedModel: Layer {\n", " var d1 = Dense(inputSize: 784, outputSize: 30)\n", " var d2 = Dense(inputSize: 30, outputSize: 10)\n", " \n", " @differentiable\n", " func applied(to input: Tensor) -> Tensor {\n", " return input.sequenced(through: d1, d2)\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func logSumExp(_ x: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{\n", " let m = x.max(alongAxes: -1)\n", " return m + log(exp(x-m).sum(alongAxes: -1))\n", "}\n", "func logSoftmax(_ activations: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{\n", " return activations - logSumExp(activations)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Context.local.learningPhase = .training" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let lr:Float = 0.5 // learning rate\n", "let epochs = 1 // how many epochs to train for\n", "let bs=64 // batch size\n", "let (n,m) = (60000,784) // MNIST dataset size" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var modelFixed = FixedModel()\n", "let modelFixedStart = modelFixed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modelFixedStart.d1.weight == modelFixed.d1.weight" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var modelFlex = MyModel(inputSize: 784, hiddenUnits: [30], outputSize: 10)\n", "let modelFlexStart = modelFlex" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "public func accuracy(_ model: Model, inputs: Tensor, target: Tensor) -> Tensor where Model.Input == Tensor, Model.Output == Tensor {\n", " return withLearningPhase(.inference) {\n", " let output = model.applied(to: inputs)\n", " let corrects = Tensor(output.argmax(squeezingAxis: 1) .== target)\n", " return corrects.mean()\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accuracy(modelFlex, inputs: xValid, target: yValid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accuracy(modelFixed, inputs: xValid, target: yValid)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The training loops below are copied from 03_minibatch. They don't appear to actually train either model. :-(" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", " for i in 0..<((n-1)/bs){\n", " let startIdx = i * bs\n", " let endIdx = startIdx + bs\n", " let xb = xTrain[startIdx.. Tensor in\n", " let preds = model.applied(to: xb)\n", " return softmaxCrossEntropy(logits: preds, labels: yb)\n", " }\n", " var parameters = modelFixed.allDifferentiableVariables\n", " for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor.self){ \n", " parameters[keyPath: kp] -= lr * grads[keyPath:kp]\n", " }\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", " for i in 0..<((n-1)/bs){\n", " let startIdx = i * bs\n", " let endIdx = startIdx + bs\n", " let xb = xTrain[startIdx.. Tensor in\n", " let preds = model.applied(to: xb)\n", " return softmaxCrossEntropy(logits: preds, labels: yb)\n", " }\n", " var parameters = modelFlex.allDifferentiableVariables\n", " for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor.self){ \n", " parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n", " }\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accuracy(modelFlex, inputs: xValid, target: yValid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accuracy(modelFixed, inputs: xValid, target: yValid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modelFixedStart.d1.weight == modelFixed.d1.weight" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let optimizerFixed = SGD(learningRate: lr)\n", "// let optimizerFlex = SGD(learningRate: lr) // SGD doesn't work for the new flex style models, due to the interaction between how .zero is defined, and keypathing." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "/// Stochastic gradient descent (SGD) optimizer.\n", "///\n", "/// An optimizer that implements stochastic gradient descent, with support for momentum, learning\n", "/// rate decay, and Nesterov momentum.\n", "public class SimpleSGD: Optimizer\n", " where Model.AllDifferentiableVariables == Model.CotangentVector {\n", " /// The learning rate.\n", " public var learningRate: Scalar\n", "\n", " public init(\n", " learningRate: Scalar = 0.01\n", " ) {\n", " precondition(learningRate >= 0, \"Learning rate must be non-negative\")\n", "\n", " self.learningRate = learningRate\n", " }\n", "\n", " public func update(_ model: inout Model.AllDifferentiableVariables,\n", " along direction: Model.CotangentVector) {\n", " for kp in model.recursivelyAllWritableKeyPaths(to: Tensor.self) {\n", " model[keyPath: kp] -= -learningRate * direction[keyPath: kp]\n", " }\n", " }\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let simpleOptFlex = SimpleSGD(learningRate: lr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", " for i in 0..<((n-1)/bs){\n", " let startIdx = i * bs\n", " let endIdx = startIdx + bs\n", " let xb = xTrain[startIdx.. Tensor in\n", " let preds = model.applied(to: xb)\n", " return softmaxCrossEntropy(logits: preds, labels: yb)\n", " }\n", " optimizerFixed.update(&modelFixed.allDifferentiableVariables, along: grads)\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accuracy(modelFixed, inputs: xValid, target: yValid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modelFixedStart.d1.weight == modelFixed.d1.weight" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", " for i in 0..<((n-1)/bs){\n", " let startIdx = i * bs\n", " let endIdx = startIdx + bs\n", " let xb = xTrain[startIdx.. Tensor in\n", " let preds = model.applied(to: xb)\n", " return softmaxCrossEntropy(logits: preds, labels: yb)\n", " }\n", "// optimizerFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n", " simpleOptFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accuracy(modelFlex, inputs: xValid, target: yValid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Swift", "language": "swift", "name": "swift" } }, "nbformat": 4, "nbformat_minor": 2 }