{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Minibatch training" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Installing packages:\n", "\t.package(path: \"/home/jupyter/notebooks/swift/FastaiNotebook_02a_why_sqrt5\")\n", "\t\tFastaiNotebook_02a_why_sqrt5\n", "With SwiftPM flags: []\n", "Working in: /tmp/tmpocdb5lqi/swift-install\n", "[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift\n", "[2/3] Merging module jupyterInstalledPackages\n", "Initializing Swift...\n", "Installation complete!\n" ] } ], "source": [ "%install-location $cwd/swift-install\n", "%install '.package(path: \"$cwd/FastaiNotebook_02a_why_sqrt5\")' FastaiNotebook_02a_why_sqrt5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "//export\n", "import Path\n", "import TensorFlow" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import FastaiNotebook_02a_why_sqrt5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our labels will be integeres from now on, so to go with our `TF` abbreviation, we introduce `TI`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "public typealias TI = Tensor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We gather the MNIST data like in the previous notebooks." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: Path.home/\".fastai\"/\"data\"/\"mnist_tst\", flat: true)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let trainMean = xTrain.mean()\n", "let trainStd = xTrain.std()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "xTrain = normalize(xTrain, mean: trainMean, std: trainStd)\n", "xValid = normalize(xValid, mean: trainMean, std: trainStd)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "60000 784 10\r\n" ] } ], "source": [ "let (n,m) = (xTrain.shape[0],xTrain.shape[1])\n", "let c = yTrain.max().scalarized()+1\n", "print(n,m,c)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We also define a simple model using our `FADense` layers." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let nHid = 50" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "public struct MyModel: Layer {\n", " public var layer1: FADense\n", " public var layer2: FADense\n", " \n", " public init(nIn: Int, nHid: Int, nOut: Int){\n", " layer1 = FADense(nIn, nHid, activation: relu)\n", " layer2 = FADense(nHid, nOut)\n", " }\n", " \n", " @differentiable\n", " public func callAsFunction(_ input: TF) -> TF {\n", " return input.sequenced(through: layer1, layer2)\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var model = MyModel(nIn: m, nHid: nHid, nOut: Int(c))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let pred = model(xTrain)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cross entropy loss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before we can train our model, we need to have a loss function. We saw how to write `logSoftMax` from scratch in PyTorch, but let's do it once in swift too." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func logSoftmax(_ activations: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{\n", " let exped = exp(activations) \n", " return log(exped / exped.sum(alongAxes: -1))\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let smPred = logSoftmax(pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[5, 0, 4]\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "yTrain[0..<3]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "▿ 3 elements\n", " - .0 : -2.237128\n", " - .1 : -1.8154038\n", " - .2 : -2.7488613\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(smPred[0][5],smPred[1][0],smPred[2][4])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There is no fancy indexing yet so we have to use gather to get the indices we want out of our softmaxed predictions." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func nll(_ input: Tensor, _ target :TI) -> Tensor \n", " where Scalar:TensorFlowFloatingPoint{\n", " let idx: TI = Raw.range(start: Tensor(0), limit: Tensor(numericCast(target.shape[0])), delta: Tensor(1))\n", " let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])\n", " let losses = Raw.gatherNd(params: input, indices: indices)\n", " return -losses.mean()\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4874432\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nll(smPred, yTrain)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average: 0.7255254900000001 ms, min: 0.572271 ms, max: 1.416526 ms\r\n" ] } ], "source": [ "time(repeating: 100){ let _ = nll(smPred, yTrain) }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Simplify `logSoftmax` with log formulas." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func logSoftmax(_ activations: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{\n", " return activations - log(exp(activations).sum(alongAxes: -1))\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let smPred = logSoftmax(pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4874432\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nll(smPred, yTrain)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We know use the LogSumExp trick" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "▿ [60000, 1]\n", " ▿ dimensions : 2 elements\n", " - 0 : 60000\n", " - 1 : 1\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "smPred.max(alongAxes: -1).shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func logSumExp(_ x: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{\n", " let m = x.max(alongAxes: -1)\n", " return m + log(exp(x-m).sum(alongAxes: -1))\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func logSoftmax(_ activations: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{\n", " return activations - logSumExp(activations)\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let smPred = logSoftmax(pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4874432\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nll(smPred, yTrain)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In S4TF nll loss is combined with softmax in:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4874432\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)\n", "loss" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average: 0.9475197999999999 ms, min: 0.813438 ms, max: 1.452436 ms\r\n" ] } ], "source": [ "time(repeating: 100){ _ = nll(logSoftmax(pred), yTrain)}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average: 0.2755439800000001 ms, min: 0.184429 ms, max: 0.518362 ms\r\n" ] } ], "source": [ "time(repeating: 100){ _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Basic training loop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Basically the training loop repeats over the following steps:\n", "- get the output of the model on a batch of inputs\n", "- compare the output to the labels we have and compute a loss\n", "- calculate the gradients of the loss with respect to every parameter of the model\n", "- update said parameters with those gradients to make them a little bit better" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "public func accuracy(_ 
{ "cell_type": "markdown", "metadata": {}, "source": [ "In S4TF, the nll loss and the softmax are combined in `softmaxCrossEntropy`:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.4874432\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)\n", "loss" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average: 0.9475197999999999 ms, min: 0.813438 ms, max: 1.452436 ms\r\n" ] } ], "source": [ "time(repeating: 100){ _ = nll(logSoftmax(pred), yTrain)}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "average: 0.2755439800000001 ms, min: 0.184429 ms, max: 0.518362 ms\r\n" ] } ], "source": [ "time(repeating: 100){ _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Basic training loop" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "The training loop repeats the following steps:\n", "- get the output of the model on a batch of inputs\n", "- compare the output to the labels we have and compute a loss\n", "- calculate the gradients of the loss with respect to every parameter of the model\n", "- update said parameters with those gradients to make them a little bit better" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "public func accuracy(_ output: TF, _ target: TI) -> TF {\n", "    let corrects = TF(output.argmax(squeezingAxis: 1) .== target)\n", "    return corrects.mean()\n", "}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Our model is untrained for now, so it should be no better than random: about 10% accuracy." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.088983335\r\n" ] } ], "source": [ "print(accuracy(pred, yTrain))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "So let's begin with a minibatch." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 0.71162695, 1.0687346, 0.3600209, 0.055102475, 0.7039629, 0.49937636, -0.22974207,\r\n", " -0.3584919, 0.27834803, 0.4121637] [64, 10]\r\n" ] } ], "source": [ "let bs = 64                   // batch size\n", "let xb = xTrain[0..<bs]       // a mini-batch from x\n", "let preds = model(xb)         // predictions\n", "print(preds[0], preds.shape)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let yb = yTrain[0..<bs]       // a mini-batch from y" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var lr: Float = 0.5   // learning rate\n", "let epochs = 1        // how many epochs to train for" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "To get both the loss and the gradients of our model, we use `valueWithGradient`. Note that the closure is written with its full, explicit signature here (required when there is >1 statement in it)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let (loss, grads) = model.valueWithGradient { model -> TF in\n", "    let preds = model(xb)\n", "    return softmaxCrossEntropy(logits: preds, labels: yb)\n", "}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "The full loop by hand would look like this:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1 ... epochs {\n", "    for i in 0 ..< (n-1)/bs {\n", "        let startIdx = i * bs\n", "        let endIdx = startIdx + bs\n", "        let xb = xTrain[startIdx..<endIdx]\n", "        let yb = yTrain[startIdx..<endIdx]\n", "        let (loss, grads) = model.valueWithGradient {\n", "            softmaxCrossEntropy(logits: $0(xb), labels: yb)\n", "        }\n", "        model.layer1.weight -= lr * grads.layer1.weight\n", "        model.layer1.bias   -= lr * grads.layer1.bias\n", "        model.layer2.weight -= lr * grads.layer2.weight\n", "        model.layer2.bias   -= lr * grads.layer2.bias\n", "    }\n", "}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let preds = model(xValid)\n", "accuracy(preds, yValid)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "And we get an accuracy of `>80%` in one epoch, not too bad!" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "We use a shortcut: `model.variables` stands for `model.allDifferentiableVariables` in S4TF. It extracts from our model a new struct with only the trainable parameters. For instance, if `model` is a BatchNorm layer, it has four tensors of floats: running mean, running std, weights and bias. The corresponding `model.variables` only has the weights and bias tensors.\n", "\n", "When we get the gradients of our model, we get another structure of the same type, and it's possible to perform basic arithmetic on those structures to make the update step super simple:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1 ... epochs {\n", "    for i in 0 ..< (n-1)/bs {\n", "        let startIdx = i * bs\n", "        let endIdx = startIdx + bs\n", "        let xb = xTrain[startIdx..<endIdx]\n", "        let yb = yTrain[startIdx..<endIdx]\n", "        let (loss, grads) = model.valueWithGradient {\n", "            softmaxCrossEntropy(logits: $0(xb), labels: yb)\n", "        }\n", "        model.variables += grads.scaled(by: -lr)\n", "    }\n", "}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Instead of writing the update ourselves, we can use an optimizer; `SGD` performs exactly the step above:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var optimizer = SGD(for: model, learningRate: lr)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "We can also refactor the computation of the batch boundaries into a function that generates the ranges:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "func batchedRanges(start: Int, end: Int, bs: Int) -> UnfoldSequence<Range<Int>, Int>\n", "{\n", "    return sequence(state: start) { (batchStart) -> Range<Int>? in\n", "        let remaining = end - batchStart\n", "        guard remaining > 0 else { return nil }\n", "        let currentBs = min(bs, remaining)\n", "        let batchEnd = batchStart.advanced(by: currentBs)\n", "        defer { batchStart = batchEnd }\n", "        return batchStart ..< batchEnd\n", "    }\n", "}" ] },
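{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick aside (an extra cell on a toy range, not needed for training): `batchedRanges` yields full batches followed by a final, possibly shorter one." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// 10 samples with a batch size of 4: two full batches, then one of size 2.\n", "for r in batchedRanges(start: 0, end: 10, bs: 4) { print(r) }\n", "// prints: 0..<4, 4..<8, 8..<10" ] },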
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1 ... epochs{\n", "    for b in batchedRanges(start: 0, end: n, bs: bs) {\n", "        let (xb,yb) = (xTrain[b],yTrain[b])\n", "        let (loss, grads) = model.valueWithGradient {\n", "            softmaxCrossEntropy(logits: $0(xb), labels: yb)\n", "        }\n", "        optimizer.update(&model.variables, along: grads)\n", "    }\n", "}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "We can create a Swift `Dataset` from our arrays. It will automatically batch things for us:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "// export\n", "public struct DataBatch<Inputs: TensorGroup, Labels: TensorGroup>: TensorGroup {\n", "    public var xb: Inputs\n", "    public var yb: Labels\n", "    \n", "    public init(xb: Inputs, yb: Labels){ (self.xb,self.yb) = (xb,yb) }\n", "}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "let trainDs = Dataset(elements:DataBatch(xb:xTrain, yb:yTrain)).batched(bs)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", "    for batch in trainDs {\n", "        let (loss, grads) = model.valueWithGradient {\n", "            softmaxCrossEntropy(logits: $0(batch.xb), labels: batch.yb)\n", "        }\n", "        optimizer.update(&model.variables, along: grads)\n", "    }\n", "}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "This `Dataset` can also do the shuffling for us:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for epoch in 1...epochs{\n", "    for batch in trainDs.shuffled(sampleCount: yTrain.shape[0], randomSeed: 42){\n", "        let (loss, grads) = model.valueWithGradient {\n", "            softmaxCrossEntropy(logits: $0(batch.xb), labels: batch.yb)\n", "        }\n", "        optimizer.update(&model.variables, along: grads)\n", "    }\n", "}" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Training loop" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "With everything before, we can now write a generic training loop. It needs two generic types: the optimizer (`Opt`) and the labels (`Label`):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "public func train<Opt: Optimizer, Label: TensorGroup>(\n", "    _ model: inout Opt.Model,\n", "    on ds: Dataset<DataBatch<Opt.Model.Input, Label>>,\n", "    using opt: inout Opt,\n", "    lossFunc: @escaping @differentiable (Opt.Model.Output, @nondiff Label) -> Tensor<Opt.Scalar>\n", ") where Opt.Model: Layer,\n", "        Opt.Model.Input: TensorGroup,\n", "        Opt.Model.TangentVector == Opt.Model.AllDifferentiableVariables,\n", "        Opt.Scalar: TensorFlowFloatingPoint\n", "{\n", "    for batch in ds {\n", "        let (loss, 𝛁model) = model.valueWithGradient {\n", "            lossFunc($0(batch.xb), batch.yb)\n", "        }\n", "        opt.update(&model.variables, along: 𝛁model)\n", "    }\n", "}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "var model = MyModel(nIn: m, nHid: nHid, nOut: Int(c))\n", "var optimizer = SGD(for: model, learningRate: lr)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train(&model, on: trainDs, using: &optimizer, lossFunc: softmaxCrossEntropy)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9022\n" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let preds = model(xValid)\n", "accuracy(preds, yValid)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Export" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "success\r\n" ] } ], "source": [ "import NotebookExport\n", "let exporter = NotebookExport(Path.cwd/\"03_minibatch_training.ipynb\")\n", "print(exporter.export(usingPrefix: \"FastaiNotebook_\"))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Swift", "language": "swift", "name": "swift" } },
"nbformat": 4, "nbformat_minor": 2 }