In [None]:
%install '.package(path: "$cwd/FastaiNotebook_01_matmul")' FastaiNotebook_01_matmul

Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_01_matmul")
		FastaiNotebook_01_matmul
With SwiftPM flags: []
Working in: /tmp/tmpp03bd5c9
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 1.40s
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'FastaiNotebook_01_matmul' (2 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Loading library...
Installation complete!


In [None]:
import FastaiNotebook_01_matmul

In [None]:
// export
import Path
import TensorFlow

## The forward and backward passes

In [None]:
// export
/// Shorthand for the `Float` tensor type used by every function in this notebook.
public typealias TF = Tensor<Float>

In [None]:
// export
/// Standardizes `x` by subtracting `mean` and dividing the result by `std`.
///
/// - Parameters:
///   - x: Tensor to normalize.
///   - mean: Value subtracted from `x`.
///   - std: Value the centered tensor is divided by.
/// - Returns: `(x - mean) / std`.
public func normalize(_ x:TF, mean:TF, std:TF) -> TF {
    let centered = x - mean
    return centered / std
}

In [None]:
// Load the four MNIST splits from `mnistPath` with `flat: true`.
// Bound with `var` because `xTrain` and `xValid` are reassigned once they are normalized.
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)

Normalize the training and validation sets.

In [None]:
// Mean and standard deviation of the training inputs; these same two values are
// used to normalize both the training and the validation set.
let trainMean = xTrain.mean()
let trainStd = xTrain.standardDeviation()

In [None]:
// Normalize both splits with the *training* statistics (the validation set does not
// get statistics of its own).
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

In [None]:
//export
/// Asserts that every element of `a` is within `tolerance` of zero.
///
/// - Parameters:
///   - a: Tensor expected to be (approximately) all zeros.
///   - tolerance: Upper bound (exclusive) on the absolute value of each element.
public func testNearZero(_ a:TF, tolerance:Float=1e-3) {
    // Elementwise comparison, then reduce: the check must hold for all entries.
    assert((abs(a) .< tolerance).all(), "Near zero: \(a)")
}

// NOTE(review): this cell was truncated in SOURCE between `abs(a)` and `TF {`. The assertion
// above and the signature of `lin` below are reconstructed from their call sites; confirm
// against the original notebook. The definitions of `m`, `nh`, `w1`, `b1`, `w2` and `b2` used
// by later cells presumably sat in the lost span too and are NOT restored here. `lin`
// presumably lived in its own, non-exported cell.

/// Affine layer: `matmul(x, w) + b`.
func lin(_ x:TF, _ w:TF, _ b:TF) -> TF {return matmul(x, w) + b}

In [None]:
// Apply `lin` with `w1`/`b1` to the validation inputs (no activation yet).
let t = lin(xValid, w1, b1)

In [None]:
// Mean and std of the linear-layer output. With Kaiming init, which is designed for
// exactly this, they should also be close to (0, 1).
(t.mean(),t.standardDeviation())

▿ 2 elements
 - .0 : -0.08092515
 - .1 : 0.9930747


In [None]:
/// Hand-written ReLU: elementwise `max(x, 0)`.
func myRelu(_ x:TF) -> TF {
    let rectified = max(x, 0)
    return rectified
}

In [None]:
// Same linear layer as before, now followed by the ReLU.
let t = myRelu(lin(xValid, w1, b1))

In [None]:
// ...actually, what we really want close to (0, 1) are these statistics, measured
// *after* the ReLU.
(t.mean(),t.standardDeviation())

▿ 2 elements
 - .0 : 0.35371977
 - .1 : 0.54694015


In [None]:
// Kaiming (He) init for ReLU: random-normal weights of shape [m, nh] scaled by sqrt(2/m).
let w1:TF = TF(randomNormal: [m,nh]) * sqrt(2.0/Float(m))

In [None]:
// Inspect the mean and standard deviation of the re-initialized `w1`.
(w1.mean(),w1.standardDeviation())

▿ 2 elements
 - .0 : -0.0002575889
 - .1 : 0.050299395


In [None]:
// Recompute the post-ReLU activations with the re-initialized `w1` and inspect their
// mean and standard deviation again.
let t = myRelu(lin(xValid, w1, b1))
(t.mean(),t.standardDeviation())

▿ 2 elements
 - .0 : 0.5851609
 - .1 : 0.8190965


In [None]:
/// Forward pass of the two-layer network: `lin` → `myRelu` → `lin`.
///
/// Reads the notebook-level parameters `w1`, `b1`, `w2` and `b2`.
///
/// - Parameter xb: Batch of inputs.
/// - Returns: Output of the second linear layer.
func model(_ xb: TF) -> TF{
    let hidden = myRelu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)
}

In [None]:
// Time the forward pass over the validation set, repeated 10 times; the result is discarded.
time(repeating: 10) {let _ = model(xValid)}

0.9236246999999999 ms


### Loss function

In [None]:
// Model outputs for the whole training set; fed to `mse` below.
let preds = model(xTrain)

In [None]:
// export
/// Mean squared error between `out` and `targ`.
///
/// - Parameters:
///   - out: Model output; its last axis is squeezed away before the subtraction.
///   - targ: Target values.
/// - Returns: Mean of the squared differences.
public func mse(_ out:TF, _ targ:TF) -> TF {
    let residual = out.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
// Targets converted to `TF` so they can be subtracted from the model output in `mse`.
// Never mutated in this notebook, hence `let`.
let yTrainF = TF(yTrain)
let yValidF = TF(yValid)

In [None]:
// Loss of the untrained model on the training set.
mse(preds, yTrainF)

35.50708


### Gradients and backward pass

To store the gradients a bit like in PyTorch, we introduce a `TensorWithGrad` class that has two attributes: the original tensor and its gradient. We choose a class to easily replicate the Python notebook: classes are reference types (so a function can mutate the instance it is given), while structures are value types.

In [None]:
/// Pairs a tensor with the gradient computed for it.
///
/// Declared as a class (reference type) so that the `*Grad` functions can write the
/// gradient into the instance they are handed. Marked `final` because it is never subclassed.
final class TensorWithGrad {
    /// The wrapped tensor.
    var inner: TF
    /// Gradient slot; same shape as `inner`, zero until a backward function fills it.
    var grad: TF

    /// Wraps `x` and initializes the gradient to zeros of the same shape.
    init(_ x: TF) {
        self.inner = x
        self.grad = TF(repeating: 0.0, shape: x.shape)
    }
}

In [None]:
/// Affine layer on wrapped tensors: wraps `matmul(x.inner, w.inner) + b.inner` in a
/// fresh `TensorWithGrad` (whose gradient starts at zero).
func lin(_ x:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad) -> TensorWithGrad {
    let affine = matmul(x.inner, w.inner) + b.inner
    return TensorWithGrad(affine)
}
/// ReLU on a wrapped tensor: wraps `max(x.inner, 0)` in a fresh `TensorWithGrad`.
func myRelu(_ x:TensorWithGrad) -> TensorWithGrad {
    let rectified = max(x.inner, 0)
    return TensorWithGrad(rectified)
}
/// Mean squared error between the wrapped tensor `inp.inner` and `targ`.
///
/// Only the loss value is computed here; no gradient is written.
func mse(_ inp: TensorWithGrad, _ targ : TF) -> TF{
    let residual = inp.inner.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

In [None]:
/// Writes into `inp.grad` the gradient of the MSE loss with respect to `inp.inner`:
/// `2 * (inp.inner - targ) / n`, where `n` is the size of the first axis of `inp.inner`.
func mseGrad(_ inp: TensorWithGrad, _ targ : TF){
    let count = Float(inp.inner.shape[0])
    // Squeeze the last axis to subtract the targets, then restore it for the gradient.
    let residual = inp.inner.squeezingShape(at: -1) - targ
    inp.grad = 2.0 * residual.expandingShape(at: -1) / count
}

In [None]:
/// Backward pass of ReLU: `inp.grad` takes `out.grad` where `inp.inner` is positive
/// and zero everywhere else.
func reluGrad(_ inp: TensorWithGrad, _ out:TensorWithGrad){
    let zeros = TF(repeating:0.0, shape:inp.inner.shape)
    let isPositive = inp.inner .> 0
    inp.grad = isPositive.selecting(out.grad, zeros)
}

This is our python version (we've renamed the python `g` to `grad` for consistency):

```python
def lin_grad(inp, out, w, b):
 inp.grad = out.grad @ w.t()
 w.grad = (inp.unsqueeze(-1) * out.grad.unsqueeze(1)).sum(0)
 b.grad = out.grad.sum(0)
```

In Swift `@` is spelled `•`, which is option-8 on Mac or compose-.-= elsewhere. Or just use the `matmul()` function we've seen already.

In [None]:
/// Backward pass of the linear layer `out = inp • w + b`.
///
/// Given `out.grad`, fills in the gradients of the layer's input, weights and bias.
func linGrad(_ inp:TensorWithGrad, _ out:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad){
    // Gradient with respect to the layer input.
    inp.grad = matmul(out.grad, w.inner.transposed())
    // Gradient with respect to the weights.
    w.grad = matmul(inp.inner.transposed(), out.grad)
    // Gradient with respect to the bias: sum of the output gradient over axis 0.
    b.grad = out.grad.sum(squeezingAxes: 0)
}

In [None]:
// Wrap each parameter so the manual backward pass has somewhere to store its gradient.
let w1a = TensorWithGrad(w1)
let b1a = TensorWithGrad(b1)
let w2a = TensorWithGrad(w2)
let b2a = TensorWithGrad(b2)

In [None]:
/// Runs the hand-written forward pass and then back-propagates through it.
///
/// Uses the notebook-level wrapped parameters `w1a`, `b1a`, `w2a` and `b2a`. On return
/// their `grad` fields, and `inp.grad`, hold the gradients of the MSE loss.
///
/// - Parameters:
///   - inp: Wrapped input batch; its `grad` is overwritten.
///   - targ: Target values for the loss.
func forwardAndBackward(_ inp:TensorWithGrad, _ targ:TF){
    // Forward pass.
    let l1 = lin(inp, w1a, b1a)
    let l2 = myRelu(l1)
    let out = lin(l2, w2a, b2a)
    // The backward pass does not need the loss value. It is still evaluated, as before,
    // but discarded explicitly rather than bound to an unused local.
    _ = mse(out, targ)

    // Backward pass: same layers, reverse order.
    mseGrad(out, targ)
    linGrad(l2, out, w2a, b2a)
    reluGrad(l1, l2)
    linGrad(inp, l1, w1a, b1a)
}

In [None]:
// Wrap the training inputs too, so the gradient with respect to the input can be stored.
let inp = TensorWithGrad(xTrain)

In [None]:
// Populate `inp.grad` and the `grad` of `w1a`, `b1a`, `w2a`, `b2a`.
forwardAndBackward(inp, yTrainF)

Let's compare to Swift autodiff now. We have to mark the function as `@differentiable`.

In [None]:
/// Differentiable forward pass plus MSE loss of the two-layer network.
///
/// - Parameters:
///   - inp: Input batch.
///   - targ: Target values.
///   - w1: Weights of the first linear layer.
///   - b1: Bias of the first linear layer.
///   - w2: Weights of the second linear layer.
///   - b2: Bias of the second linear layer.
/// - Returns: Mean squared error between the network output (last axis squeezed) and `targ`.
@differentiable
func forward(_ inp:TF, _ targ:TF, w1:TF, b1:TF, w2:TF, b2:TF) -> TF{
    let hidden = relu(matmul(inp, w1) + b1)
    let output = matmul(hidden, w2) + b2
    let residual = output.squeezingShape(at: -1) - targ
    return residual.squared().mean()
}

Then we can ask for the gradients of anything like this:

In [None]:
// One `gradient(at:)` call per quantity. In each closure only the closure parameter is
// differentiated; every other argument is the notebook-level value.
let xGrad = gradient(at: xTrain) { x in forward(x, yTrainF, w1: w1, b1: b1, w2: w2, b2: b2) }
let w1Grad = gradient(at: w1) { w in forward(xTrain, yTrainF, w1: w, b1: b1, w2: w2, b2: b2) }
let b1Grad = gradient(at: b1) { b in forward(xTrain, yTrainF, w1: w1, b1: b, w2: w2, b2: b2) }
let w2Grad = gradient(at: w2) { w in forward(xTrain, yTrainF, w1: w1, b1: b1, w2: w, b2: b2) }
let b2Grad = gradient(at: b2) { b in forward(xTrain, yTrainF, w1: w1, b1: b1, w2: w2, b2: b) }

Note the big difference with PyTorch: in PyTorch the tensors remember how they were created when you have `requires_grad=True`, so that when you arrive at a final number and call the `backward` pass, they can compute their respective gradients.

In Swift for TensorFlow, a `Tensor` doesn't store anything, so you have to specify the whole function you want executed when computing the gradients.

In [None]:
// The autodiff gradients should match the hand-written backward pass.
testNearZero(xGrad - inp.grad)
testNearZero(w1Grad - w1a.grad)
testNearZero(b1Grad - b1a.grad)
testNearZero(w2Grad - w2a.grad)
testNearZero(b2Grad - b2a.grad)

In [None]:
// Time the hand-written forward + backward pass, repeated 10 times.
time(repeating: 10) { forwardAndBackward(inp, yTrainF) }

23.6698357 ms


It's a bit inefficient to have to ask for the gradients of every parameter in a different function call. The swifty way of doing this is to regroup all our parameters in a structure (which will be our model later on). As long as they all conform to the protocol `Differentiable`, we can make this structure conform to `Differentiable` without having to implement anything and it will just work.

In [None]:
/// Bundles the input and the four network parameters so that all gradients can be
/// requested in a single `gradient(at:)` call.
public struct myParams: Differentiable {
    /// Input batch.
    public var x: TF
    /// Weights of the first linear layer.
    public var w1: TF
    /// Bias of the first linear layer.
    public var b1: TF
    /// Weights of the second linear layer.
    public var w2: TF
    /// Bias of the second linear layer.
    public var b2: TF
}

In [None]:
// Gather the training inputs and the current parameters into one differentiable value.
let allParams = myParams(x: xTrain, w1: w1, b1: b1, w2: w2, b2: b2)

In [None]:
// A single call now yields the gradient for every field of `myParams`.
let grads = gradient(at: allParams) { p in
    forward(p.x, yTrainF, w1: p.w1, b1: p.b1, w2: p.w2, b2: p.b2)
}

In [None]:
// The gradients obtained in one call should match the per-parameter ones computed earlier.
testNearZero(xGrad - grads.x)
testNearZero(w1Grad - grads.w1)
testNearZero(b1Grad - grads.b1)
testNearZero(w2Grad - grads.w2)
testNearZero(b2Grad - grads.b2)

If you wanted the value for your loss as well as the gradients, you just have to use `valueWithGradient`.

In [None]:
// Same computation, but the loss value is returned together with the gradients.
let (loss,grads) = valueWithGradient(at: allParams) { p in
    forward(p.x, yTrainF, w1: p.w1, b1: p.b1, w2: p.w2, b2: p.b2)
}

In [None]:
// `grads` now comes from `valueWithGradient`; it should still match the per-parameter gradients.
testNearZero(xGrad - grads.x)
testNearZero(w1Grad - grads.w1)
testNearZero(b1Grad - grads.b1)
testNearZero(w2Grad - grads.w2)
testNearZero(b2Grad - grads.b2)

In terms of timing, our implementation gives:

In [None]:
// Hand-written forward + backward pass, 10 repetitions.
time(repeating: 10) { forwardAndBackward(inp, yTrainF) }

24.5198722 ms


In [None]:
// Autodiff equivalent (loss + all gradients in one call), 10 repetitions; result discarded.
time(repeating: 10) {
    let _ = valueWithGradient(at: allParams) { p in
        forward(p.x, yTrainF, w1: p.w1, b1: p.b1, w2: p.w2, b2: p.b2)
    }
}

24.631195299999998 ms


### Export

In [None]:
// Run `notebookToScript` on this notebook's file in the current working directory
// (presumably exporting the cells marked `// export`; confirm against its definition).
notebookToScript(fname: (Path.cwd / "02_fully_connected.ipynb").string)