{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sequential Layer\n", "\n", "This notebook walks through how to build a sequential layer type, allowing you to chain an arbitrary number of layers of the same type together." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Installing packages:\n", "\t.package(path: \"/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_00_load_data\")\n", "\t\tFastaiNotebook_00_load_data\n", "With SwiftPM flags: []\n", "Working in: /tmp/tmp2npmmpfo\n", "Fetching https://github.com/mxcl/Path.swift\n", "Fetching https://github.com/JustHTTP/Just\n", "Completed resolution in 1.21s\n", "Cloning https://github.com/JustHTTP/Just\n", "Resolving https://github.com/JustHTTP/Just at 0.7.1\n", "Cloning https://github.com/mxcl/Path.swift\n", "Resolving https://github.com/mxcl/Path.swift at 0.16.2\n", "Compile Swift Module 'Path' (9 sources)\n", "Compile Swift Module 'Just' (1 sources)\n", "Compile Swift Module 'FastaiNotebook_00_load_data' (1 sources)\n", "Compile Swift Module 'jupyterInstalledPackages' (1 sources)\n", "Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so\n", "Initializing Swift...\n", "Loading library...\n", "Installation complete!\n" ] } ], "source": [ "%install '.package(path: \"$cwd/FastaiNotebook_00_load_data\")' FastaiNotebook_00_load_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import FastaiNotebook_00_load_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "\n", "/// Define a new Differentiable data type that will be the AllDifferentiableVariables, Cotangent-, and Tangent vectors\n", "/// for our sequential layer type.\n", "public struct DiffList: KeyPathIterable {\n", " public var u: [U] = []\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "extension DiffList: Equatable {\n", " public static func == (lhs: DiffList, rhs: DiffList) -> Bool {\n", " if lhs.u.count != rhs.u.count { return false }\n", " for i in 0.. DiffList {\n", " precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n", " \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n", " if lhs.u.count == 0 { return rhs }\n", " if rhs.u.count == 0 { return lhs }\n", " var output: [U] = []\n", " for i in 0.. 
    "\n",
    "    @differentiable(vjp: _vjpAdd(lhs:rhs:))\n",
    "    public static func + (lhs: DiffList, rhs: DiffList) -> DiffList {\n",
    "        precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n",
    "                     \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n",
    "        if lhs.u.count == 0 { return rhs }\n",
    "        if rhs.u.count == 0 { return lhs }\n",
    "        var output: [U] = []\n",
    "        for i in 0..<lhs.u.count { output.append(lhs.u[i] + rhs.u[i]) }\n",
    "        return DiffList(u: output)\n",
    "    }\n",
    "\n",
    "    public static func _vjpAdd(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {\n",
    "        return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n",
    "            precondition(v.u.count == lhsCount || lhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n",
    "            precondition(v.u.count == rhsCount || rhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n",
    "            var lhsOutput: [U] = []\n",
    "            var rhsOutput: [U] = []\n",
    "            // Unbroadcasting\n",
    "            if lhsCount != 0 { lhsOutput = v.u }\n",
    "            if rhsCount != 0 { rhsOutput = v.u }\n",
    "            return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n",
    "        })\n",
    "    }\n",
    "\n",
    "    @differentiable(vjp: _vjpSubtract(lhs:rhs:))\n",
    "    public static func - (lhs: DiffList, rhs: DiffList) -> DiffList {\n",
    "        precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,\n",
    "                     \"DiffList size mis-match: lhs: \\(lhs.u.count), rhs: \\(rhs.u.count)\")\n",
    "        if lhs.u.count == 0 { return DiffList(u: rhs.u.map({ U.zero - $0 })) }  // 0 - rhs\n",
    "        if rhs.u.count == 0 { return lhs }\n",
    "        var output: [U] = []\n",
    "        for i in 0..<lhs.u.count { output.append(lhs.u[i] - rhs.u[i]) }\n",
    "        return DiffList(u: output)\n",
    "    }\n",
    "\n",
    "    public static func _vjpSubtract(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {\n",
    "        return (lhs - rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in\n",
    "            precondition(v.u.count == lhsCount || lhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), lhs: \\(lhsCount)\")\n",
    "            precondition(v.u.count == rhsCount || rhsCount == 0,\n",
    "                         \"DiffList gradient size mis-match: v: \\(v.u.count), rhs: \\(rhsCount)\")\n",
    "\n",
    "            var lhsOutput: [U] = []\n",
    "            var rhsOutput: [U] = []\n",
    "            // Unbroadcasting\n",
    "            if lhsCount != 0 { lhsOutput = v.u }\n",
    "            if rhsCount != 0 { rhsOutput = v.u.map({ U.zero - $0 }) }\n",
    "            return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))\n",
    "        })\n",
    "    }\n",
    "}\n",
    "\n",
    "extension DiffList: VectorNumeric {\n",
    "    public typealias Scalar = U.Scalar\n",
    "    \n",
    "    public static func * (lhs: Scalar, rhs: DiffList) -> DiffList {\n",
    "        return DiffList(u: rhs.u.map( { $0 * lhs } ))\n",
    "    }\n",
    "}\n",
    "\n",
    "extension DiffList: Differentiable {\n",
    "    public typealias TangentVector = DiffList\n",
    "    public typealias CotangentVector = DiffList\n",
    "    public typealias AllDifferentiableVariables = DiffList\n",
    "\n",
    "    public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n",
    "        return cotangent\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "import TensorFlow // Defines Layer."
   ]
  },
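  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick, illustrative check of the `DiffList` arithmetic above. This cell is not exported; it assumes that `Tensor<Float>` satisfies `DiffList`'s generic constraints and that tensors can be built from array literals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let ones: Tensor<Float> = [1, 1]\n",
    "let twos: Tensor<Float> = [2, 2]\n",
    "let a = DiffList<Tensor<Float>>(u: [ones, twos])\n",
    "print((a + a).u)          // elementwise: [[2, 2], [4, 4]]\n",
    "print((a + DiffList()).u) // the empty `zero` broadcasts: `a` unchanged\n",
    "print((3 * a).u)          // scalar multiplication from VectorNumeric"
   ]
  },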
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "\n",
    "/// A struct that contains a number of layers within it.\n",
    "public struct SequentialLayer<U: Layer>: KeyPathIterable where\n",
    "    U.Input == U.Output,\n",
    "    U.AllDifferentiableVariables: VectorNumeric,\n",
    "    U.AllDifferentiableVariables == U.CotangentVector {\n",
    "\n",
    "    public var layers: [U]\n",
    "\n",
    "    public init(layers: [U]) {\n",
    "        self.layers = layers\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "// Not strictly necessary, but nice to have.\n",
    "extension SequentialLayer: Equatable where U: Equatable {\n",
    "    public static func ==(lhs: SequentialLayer, rhs: SequentialLayer) -> Bool {\n",
    "        return lhs.layers == rhs.layers\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "// Mark SequentialLayer as Differentiable\n",
    "extension SequentialLayer: Differentiable {\n",
    "    public typealias AllDifferentiableVariables = DiffList<U.AllDifferentiableVariables>\n",
    "    public typealias TangentVector = DiffList<U.TangentVector>\n",
    "    public typealias CotangentVector = DiffList<U.CotangentVector>\n",
    "\n",
    "    public func tangentVector(from cotangent: CotangentVector) -> TangentVector {\n",
    "        precondition(cotangent.u.count == layers.count, \"Differing # of layers: \\(cotangent.u.count) and \\(layers.count)\")\n",
    "        return DiffList(u: zip(layers, cotangent.u).map({ $0.0.tangentVector(from: $0.1) }))\n",
    "    }\n",
    "\n",
    "    public func moved(along direction: TangentVector) -> SequentialLayer {\n",
    "        precondition(direction.u.count == layers.count, \"Differing # of layers: \\(direction.u.count) and \\(layers.count)\")\n",
    "        return SequentialLayer(layers: zip(layers, direction.u).map({ $0.0.moved(along: $0.1) }))\n",
    "    }\n",
    "\n",
    "    public var allDifferentiableVariables: AllDifferentiableVariables {\n",
    "        get { return DiffList(u: layers.map({ $0.allDifferentiableVariables })) }\n",
    "        set {\n",
    "            precondition(newValue.u.count == layers.count, \"Differing # of layers: \\(newValue.u.count) and \\(layers.count)\")\n",
    "            for i in 0..<layers.count {\n",
    "                layers[i].allDifferentiableVariables = newValue.u[i]\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "}\n"
   ]
  },
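  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick, illustrative check (not exported): build a two-layer `SequentialLayer` of `Dense<Float>` layers, the same pairing `MyModel` uses below, and confirm that its differentiable variables form one `DiffList` entry per layer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let tiny = SequentialLayer(layers: [Dense<Float>(inputSize: 4, outputSize: 4),\n",
    "                                    Dense<Float>(inputSize: 4, outputSize: 4)])\n",
    "print(tiny.allDifferentiableVariables.u.count) // 2: one entry per layer"
   ]
  },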
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// export\n",
    "extension SequentialLayer: Layer {\n",
    "    public typealias Input = U.Input\n",
    "    public typealias Output = U.Output\n",
    "\n",
    "    @differentiable(vjp: _appliedDifferentiating(to:in:))\n",
    "    public func applied(to input: Input, in context: Context) -> Output {\n",
    "        var tmp = input\n",
    "        for layer in layers { tmp = layer.applied(to: tmp, in: context) }\n",
    "        return tmp\n",
    "    }\n",
    "    \n",
    "    public func _appliedDifferentiating(to input: Input, in context: Context) -> (\n",
    "        Output, (Output.CotangentVector) -> (CotangentVector, Input.CotangentVector)) {\n",
    "        \n",
    "        var pullbacks: [(U.Output.CotangentVector) -> (U.AllDifferentiableVariables, U.Input.CotangentVector)] = []\n",
    "        var tmp = input\n",
    "        for layer in layers {\n",
    "            let (output, pullback) = Swift.valueWithPullback(at: layer, tmp) { layer, input in\n",
    "                return layer.applied(to: input, in: context)\n",
    "            }\n",
    "            tmp = output\n",
    "            pullbacks.append(pullback)\n",
    "        }\n",
    "        \n",
    "        return (tmp, { input in\n",
    "            var allDiffVars: [U.AllDifferentiableVariables] = []\n",
    "            var tmp = input\n",
    "            \n",
    "            for pb in pullbacks.reversed() {\n",
    "                let (diffVars, input) = pb(tmp)\n",
    "                tmp = input\n",
    "                allDiffVars.append(diffVars)\n",
    "            }\n",
    "            \n",
    "            return (DiffList(u: allDiffVars.reversed()), tmp)\n",
    "        })\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct MyModel: Layer {\n",
    "    var layers: SequentialLayer<Dense<Float>>\n",
    "    \n",
    "    init(inputSize: Int, hiddenUnits: [Int], outputSize: Int) {\n",
    "        // Make the dense layers.\n",
    "        \n",
    "        // TODO(saeta): Clean up this code.\n",
    "        var input = inputSize\n",
    "        var output = outputSize\n",
    "        if hiddenUnits.count > 0 { output = hiddenUnits[0] }\n",
    "        var layers: [Dense<Float>] = []\n",
    "        for i in 0..<hiddenUnits.count {\n",
    "            output = hiddenUnits[i]\n",
    "            print(\"Making Dense<Float>(inputSize: \\(input), outputSize: \\(output))\")\n",
    "            layers.append(Dense<Float>(inputSize: input, outputSize: output))\n",
    "            input = output\n",
    "        }\n",
    "        print(\"Making Dense<Float>(inputSize: \\(output), outputSize: \\(outputSize))\")\n",
    "        layers.append(Dense<Float>(inputSize: output, outputSize: outputSize))\n",
    "        \n",
    "        self.layers = SequentialLayer(layers: layers)\n",
    "    }\n",
    "    \n",
    "    @differentiable\n",
    "    func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {\n",
    "        return layers.applied(to: input, in: context)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct FixedModel: Layer {\n",
    "    var d1 = Dense<Float>(inputSize: 784, outputSize: 30)\n",
    "    var d2 = Dense<Float>(inputSize: 30, outputSize: 10)\n",
    "    \n",
    "    @differentiable\n",
    "    func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {\n",
    "        return input.sequenced(in: context, through: d1, d2)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz...\n",
      "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz...\n",
      "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz...\n",
      "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz...\n"
     ]
    }
   ],
   "source": [
    "var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {\n",
    "    let m = x.max(alongAxes: -1)\n",
    "    return m + log(exp(x - m).sum(alongAxes: -1))\n",
    "}\n",
    "func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {\n",
    "    return activations - logSumExp(activations)\n",
    "}"
   ]
  },
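  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick numerical sanity check of `logSoftmax` (illustrative; it assumes `Tensor` is expressible by array literals): each row of `exp(logSoftmax(x))` should sum to one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let logits: Tensor<Float> = [[1, 2, 3], [0, 0, 0]]\n",
    "print(exp(logSoftmax(logits)).sum(alongAxes: -1)) // each row sums to ~1"
   ]
  },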
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let trainingContext = Context(learningPhase: .training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let lr: Float = 0.5        // learning rate\n",
    "let epochs = 1             // how many epochs to train for\n",
    "let bs: Int32 = 64         // batch size\n",
    "let (n, m) = (60000, 784)  // MNIST dataset size\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "var modelFixed = FixedModel()\n",
    "let modelFixedStart = modelFixed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Making Dense<Float>(inputSize: 784, outputSize: 30)\n",
      "Making Dense<Float>(inputSize: 30, outputSize: 10)\n"
     ]
    }
   ],
   "source": [
    "var modelFlex = MyModel(inputSize: 784, hiddenUnits: [30], outputSize: 10)\n",
    "let modelFlexStart = modelFlex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "public func accuracy(_ output: Tensor<Float>, _ target: Tensor<Int32>) -> Tensor<Float> {\n",
    "    let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)\n",
    "    return corrects.mean()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let inferenceContext = Context(learningPhase: .inference)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.193\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1426\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The training loops below are copied from 03_minibatch. As written, they don't actually train either model: each step updates a local copy of `allDifferentiableVariables` that is never written back to the model, so the weights never change. :-("
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        var parameters = modelFixed.allDifferentiableVariables\n",
    "        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){ \n",
    "            parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}"
   ]
  },
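  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Why nothing moves: `allDifferentiableVariables` has value semantics, so `parameters` above is a copy, and the subtraction mutates only that copy. A corrected step would write the copy back. Below is a sketch (shown on `FixedModel` for illustration only; it isn't run against the models here):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "// Illustrative only: an SGD step with the missing write-back.\n",
    "func sgdStep(_ model: inout FixedModel, grads: FixedModel.CotangentVector, lr: Float) {\n",
    "    var parameters = model.allDifferentiableVariables\n",
    "    for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {\n",
    "        parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n",
    "    }\n",
    "    model.allDifferentiableVariables = parameters  // write the update back\n",
    "}"
   ]
  },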
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        var parameters = modelFlex.allDifferentiableVariables\n",
    "        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){ \n",
    "            parameters[keyPath: kp] -= lr * grads[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.193\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1426\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let optimizerFixed = SGD<FixedModel, Float>(learningRate: lr)\n",
    "// let optimizerFlex = SGD<MyModel, Float>(learningRate: lr) // SGD doesn't work for the new flex style models, due to the interaction between how .zero is defined, and keypathing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "/// Stochastic gradient descent (SGD) optimizer.\n",
    "///\n",
    "/// A minimal optimizer that implements plain stochastic gradient descent, without momentum,\n",
    "/// learning rate decay, or Nesterov momentum.\n",
    "public class SimpleSGD<Model: Layer, Scalar: TensorFlowFloatingPoint>: Optimizer\n",
    "    where Model.AllDifferentiableVariables == Model.CotangentVector {\n",
    "    /// The learning rate.\n",
    "    public var learningRate: Scalar\n",
    "\n",
    "    public init(\n",
    "        learningRate: Scalar = 0.01\n",
    "    ) {\n",
    "        precondition(learningRate >= 0, \"Learning rate must be non-negative\")\n",
    "\n",
    "        self.learningRate = learningRate\n",
    "    }\n",
    "\n",
    "    public func update(_ model: inout Model.AllDifferentiableVariables,\n",
    "                       along direction: Model.CotangentVector) {\n",
    "        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Scalar>.self) {\n",
    "            // Descend along the gradient: theta <- theta - lr * dtheta.\n",
    "            model[keyPath: kp] -= learningRate * direction[keyPath: kp]\n",
    "        }\n",
    "    }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let simpleOptFlex = SimpleSGD<MyModel, Float>(learningRate: lr)"
   ]
  },
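  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An illustrative look at why the stock `SGD` can't drive the flex-style model: optimizer state is seeded from `.zero`, and `DiffList`'s `zero` is the *empty* list, so recursive key-path iteration over that state finds no tensors to update."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "let z = SequentialLayer<Dense<Float>>.AllDifferentiableVariables.zero\n",
    "print(z.u.count)  // 0: an empty DiffList...\n",
    "print(z.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self).count)  // ...exposes no writable tensor key paths"
   ]
  },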
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "        optimizerFixed.update(&modelFixed.allDifferentiableVariables, along: grads)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.098\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFixedStart.d1.weight == modelFixed.d1.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in 1...epochs{\n",
    "    for i in 0..<((n-1)/Int(bs)){\n",
    "        let startIdx = Int32(i) * bs\n",
    "        let endIdx = startIdx + bs\n",
    "        let xb = xTrain[startIdx..<endIdx]\n",
    "        let yb = yTrain[startIdx..<endIdx]\n",
    "        let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in\n",
    "            let preds = model.applied(to: xb, in: trainingContext)\n",
    "            return softmaxCrossEntropy(logits: preds, labels: yb)\n",
    "        }\n",
    "//         optimizerFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n",
    "        simpleOptFlex.update(&modelFlex.allDifferentiableVariables, along: grads)\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.098\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false\n"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Swift",
   "language": "swift",
   "name": "swift"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}