{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Installing packages:\n", "\t.package(path: \"/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_02a_why_sqrt5\")\n", "\t\tFastaiNotebook_02a_why_sqrt5\n", "With SwiftPM flags: []\n", "Working in: /tmp/tmph6cxueln\n", "Fetching https://github.com/mxcl/Path.swift\n", "Fetching https://github.com/JustHTTP/Just\n", "Completed resolution in 2.07s\n", "Cloning https://github.com/JustHTTP/Just\n", "Resolving https://github.com/JustHTTP/Just at 0.7.1\n", "Cloning https://github.com/mxcl/Path.swift\n", "Resolving https://github.com/mxcl/Path.swift at 0.16.2\n", "Compile Swift Module 'Just' (1 sources)\n", "Compile Swift Module 'Path' (9 sources)\n", "Compile Swift Module 'FastaiNotebook_02a_why_sqrt5' (5 sources)\n", "Compile Swift Module 'jupyterInstalledPackages' (1 sources)\n", "Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so\n", "Initializing Swift...\n", "Loading library...\n", "Installation complete!\n" ] } ], "source": [ "%install '.package(path: \"$cwd/FastaiNotebook_02a_why_sqrt5\")' FastaiNotebook_02a_why_sqrt5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import FastaiNotebook_02a_why_sqrt5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "import Path\n", "import TensorFlow" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: Path.home/\".fastai\"/\"data\"/\"mnist_tst\", flat: true)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let trainMean = xTrain.mean()\n", "let trainStd = xTrain.standardDeviation()" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "xTrain = normalize(xTrain, mean: trainMean, std: trainStd)\n", "xValid = normalize(xValid, mean: trainMean, std: trainStd)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "60000 784 10\r\n" ] } ], "source": [ "let (n,m) = (Int(xTrain.shape[0]),Int(xTrain.shape[1]))\n", "let c = yTrain.max()+1\n", "print(n,m,c)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Those can't be used to define a model cause they're not Ints though..." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let (n,m) = (60000,784)\n", "let c = 10\n", "let nHid = 50" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "public struct MyModel: Layer {\n", " public var layer1: FADense\n", " public var layer2: FADense\n", " \n", " public init(nIn: Int, nHid: Int, nOut: Int){\n", " layer1 = FADense(inputSize: nIn, outputSize: nHid, activation: relu)\n", " layer2 = FADense(inputSize: nHid, outputSize: nOut)\n", " }\n", " \n", " @differentiable\n", " public func applied(to input: Tensor, in context: Context) -> Tensor {\n", " return input.sequenced(in: context, through: layer1, layer2)\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var model = MyModel(nIn: m, nHid: nHid, nOut: c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let pred = model.applied(to: xTrain)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cross entropy loss" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func logSoftmax(_ activations: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{\n", " let exped = exp(activations) \n", " return log(exped / exped.sum(alongAxes: -1))\n", "}" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "let smPred = logSoftmax(pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[5, 0, 4]\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "yTrain[0..<3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "▿ 3 elements\n", " - .0 : -2.0715566\n", " - .1 : -2.028722\n", " - .2 : -2.1230843\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(smPred[0][5],smPred[1][0],smPred[2][4])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There is no fancy indexing yet so we have to use gather to get the indices we want out of our softmaxed predictions." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func nll(_ input: Tensor, _ target :Tensor) -> Tensor \n", " where Scalar:TensorFlowFloatingPoint{\n", " let idx: Tensor = Raw.range(start: Tensor(0), limit: Tensor(60000), delta: Tensor(1))\n", " let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])\n", " let losses = Raw.gatherNd(params: input, indices: indices)\n", " return -losses.mean()\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.6719995\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nll(smPred, yTrain)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0019715 ms\r\n" ] } ], "source": [ "time(repeating: 100){ let _ = nll(smPred, yTrain) }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Simplify `logSoftmax` with log formulas." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "/// Log-softmax over the last axis, using log(a/b) = log(a) - log(b) to drop the division.\n", "func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{\n", "    return activations - log(exp(activations).sum(alongAxes: -1))\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let smPred = logSoftmax(pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.6719995\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nll(smPred, yTrain)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use LogSumExp trick" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "▿ TensorShape\n", " ▿ dimensions : 2 elements\n", " - 0 : 60000\n", " - 1 : 1\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "smPred.max(alongAxes: -1).shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "/// Log of the sum of exponentials over the last axis.\n", "/// The per-row maximum is subtracted before exponentiating and added back afterwards,\n", "/// so `exp` is only ever applied to values that are <= 0.\n", "func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{\n", "    // Named xMax rather than m so it does not shadow the notebook-level `m`.\n", "    let xMax = x.max(alongAxes: -1)\n", "    return xMax + log(exp(x-xMax).sum(alongAxes: -1))\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "/// Log-softmax over the last axis, computed as the activations minus their `logSumExp`.\n", "func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{\n", "    return activations - logSumExp(activations)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let smPred = logSoftmax(pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.6719997\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nll(smPred, yTrain)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In S4TF nll loss is combined 
with softmax in:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.6719995\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)\n", "loss" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.33812186 ms\r\n" ] } ], "source": [ "time(repeating: 100){ let _ = nll(logSoftmax(pred), yTrain)}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0861205000000003 ms\r\n" ] } ], "source": [ "time(repeating: 100){ let _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Basic training loop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Basically the training loop repeats over the following steps:\n", "- get the output of the model on a batch of inputs\n", "- compare the output to the labels we have and compute a loss\n", "- calculate the gradients of the loss with respect to every parameter of the model\n", "- update said parameters with those gradients to make them a little bit better" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "/// Fraction of rows whose highest-scoring class (argmax over axis 1) equals the target class.\n", "/// - Parameters:\n", "///   - output: model scores, one row per sample.\n", "///   - target: the class index of each sample.\n", "public func accuracy(_ output: Tensor<Float>, _ target: Tensor<Int32>) -> Tensor<Float>{\n", "    // `.==` compares element-wise; converting the result to Float lets `mean()` average the matches.\n", "    let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)\n", "    return corrects.mean()\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.14516667\r\n" ] } ], "source": [ "print(accuracy(pred, yTrain))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ -1.7508852, -0.42652333, 0.015924871, 
1.145769, -2.150978, 0.233792, 0.027202263,\r\n", " 0.7222841, 0.66757137, 2.54584] TensorShape(dimensions: [64, 10])\r\n" ] } ], "source": [ "let bs:Int32=64 // batch size\n", "let xb = xTrain[0.. Tensor in\n", " let preds = model.applied(to: xb, in: trainingContext)\n", " return softmaxCrossEntropy(logits: preds, labels: yb)\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Loop by hand" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", "    for i in 0..<((n-1)/Int(bs)){\n", "        // Slice out the i-th mini-batch of inputs and labels.\n", "        let startIdx = Int32(i) * bs\n", "        let endIdx = startIdx + bs\n", "        let xb = xTrain[startIdx..<endIdx]\n", "        let yb = yTrain[startIdx..<endIdx]\n", "        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in\n", "            let preds = model.applied(to: xb, in: trainingContext)\n", "            return softmaxCrossEntropy(logits: preds, labels: yb)\n", "        }\n", "        // Plain SGD step, one parameter at a time.\n", "        model.layer1.weight -= lr * grads.layer1.weight\n", "        model.layer1.bias -= lr * grads.layer1.bias\n", "        model.layer2.weight -= lr * grads.layer2.weight\n", "        model.layer2.bias -= lr * grads.layer2.bias\n", "    }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9305\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let preds = model.applied(to: xValid)\n", "accuracy(preds, yValid)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "93% in one epoch, not too bad!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Naming all the parameters is a bit boring. We can use `AllDifferentiableVariables` objects to access them all." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", "    for i in 0..<((n-1)/Int(bs)){\n", "        let startIdx = Int32(i) * bs\n", "        let endIdx = startIdx + bs\n", "        let xb = xTrain[startIdx..<endIdx]\n", "        let yb = yTrain[startIdx..<endIdx]\n", "        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in\n", "            let preds = model.applied(to: xb, in: trainingContext)\n", "            return softmaxCrossEntropy(logits: preds, labels: yb)\n", "        }\n", "        var parameters = model.allDifferentiableVariables\n", "        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){\n", "            parameters[keyPath: kp] -= lr * grads[keyPath:kp]\n", "        }\n", "        // `parameters` is a separate value, so the stepped tensors must be assigned back;\n", "        // without this line the model itself is never updated.\n", "        model.allDifferentiableVariables = parameters\n", "    }\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then we can use an S4TF optimizer to do the step for us." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let optimizer = SGD<MyModel, Float>(learningRate: lr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", "    for i in 0..<((n-1)/Int(bs)){\n", "        let startIdx = Int32(i) * bs\n", "        let endIdx = startIdx + bs\n", "        let xb = xTrain[startIdx..<endIdx]\n", "        let yb = yTrain[startIdx..<endIdx]\n", "        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in\n", "            let preds = model.applied(to: xb, in: trainingContext)\n", "            return softmaxCrossEntropy(logits: preds, labels: yb)\n", "        }\n", "        optimizer.update(&model.allDifferentiableVariables, along: grads)\n", "    }\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can create a Swift `Dataset` from our arrays. It will automatically batch things for us." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "/// One batch of data: the inputs `xb` together with their labels `yb`.\n", "public struct DataBatch<Inputs: TensorGroup, Labels: TensorGroup>: TensorGroup {\n", "    public var xb: Inputs\n", "    public var yb: Labels\n", "    \n", "    public init(xb: Inputs, yb: Labels){\n", "        self.xb = xb\n", "        self.yb = yb\n", "    }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let train_ds:Dataset = Dataset(elements:DataBatch(xb:xTrain, yb:yTrain)).batched(Int64(bs))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", "    for batch in train_ds{\n", "        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in\n", "            let preds = model.applied(to: batch.xb, in: trainingContext)\n", "            return softmaxCrossEntropy(logits: preds, labels: batch.yb)\n", "        }\n", "        optimizer.update(&model.allDifferentiableVariables, along: grads)\n", "    }\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This `Dataset` can also do the shuffle for us:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", "    for batch in train_ds.shuffled(){\n", "        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in\n", "            let preds = model.applied(to: batch.xb, in: trainingContext)\n", "            return softmaxCrossEntropy(logits: preds, labels: batch.yb)\n", "        }\n", "        optimizer.update(&model.allDifferentiableVariables, along: grads)\n", "    }\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Training loop" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "/// Runs one pass over `dataset`, taking an optimizer step on `model` after every batch.\n", "/// - Parameters:\n", "///   - model: the model to train; updated in place.\n", "///   - dataset: batches exposing their inputs as `xb` and their labels as `yb`.\n", "///   - optimizer: the optimizer that applies each batch's gradients to the model.\n", "///   - lossFunc: differentiable loss computed from (model output, labels).\n", "public func train<Opt: Optimizer, Labels: TensorGroup>(\n", "    _ model: inout Opt.Model,\n", "    on dataset: Dataset<DataBatch<Opt.Model.Input, Labels>>,\n", "    using optimizer: inout Opt,\n", "    lossFunc: @escaping @differentiable (Opt.Model.Output, @nondiff Labels) -> Tensor<Opt.Scalar>\n", ") where Opt.Model.Input: TensorGroup,\n", "        Opt.Model.CotangentVector == Opt.Model.AllDifferentiableVariables,\n", "        Opt.Scalar: TensorFlowFloatingPoint\n", "{\n", "    let context = Context(learningPhase: .training)\n", "    for batch in dataset {\n", "        // Loss and gradients of the loss with respect to the model for this batch.\n", "        let (loss, 𝛁model) = model.valueWithGradient { model -> Tensor<Opt.Scalar> in\n", "            let pred = model.applied(to: batch.xb, in: context)\n", "            return lossFunc(pred, batch.yb)\n", "        }\n", "        optimizer.update(&model.allDifferentiableVariables, along: 𝛁model)\n", "    }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var optimizer = SGD<MyModel, Float>(learningRate: lr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "error: Execution was interrupted, reason: signal SIGSEGV: address access protected (fault address: 0x2b10cd0).\nThe process has been left at the point where it was interrupted, use \"thread return -x\" to return to the state before expression evaluation.\n" ] } ], "source": [ "train(&model, on: train_ds, using: &optimizer, lossFunc: softmaxCrossEntropy)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Export" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "notebookToScript(fname: (Path.cwd / \"03_minibatch_training.ipynb\").string)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Swift", "language": "swift", "name": "swift" } }, "nbformat": 4, "nbformat_minor": 2 }