# Sequential Layer

This notebook walks through how to build a sequential layer type, allowing you to chain an arbitrary number of layers of the same type together.

In [None]:
%install '.package(path: "$cwd/FastaiNotebook_00_load_data")' FastaiNotebook_00_load_data

Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_00_load_data")
		FastaiNotebook_00_load_data
With SwiftPM flags: []
Working in: /tmp/tmp2npmmpfo
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 1.21s
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'FastaiNotebook_00_load_data' (1 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Loading library...
Installation complete!


In [None]:
import FastaiNotebook_00_load_data

In [None]:
// export

// NOTE(review): the element type `U` is not declared anywhere in this view — the struct's
// generic parameter list appears to be missing; confirm against the original notebook.
/// A list of differentiable values, stored in `u`.
///
/// `SequentialLayer` uses this type as its `AllDifferentiableVariables`, `TangentVector`, and
/// `CotangentVector`, holding one element per wrapped layer.
public struct DiffList: KeyPathIterable {
 /// The wrapped elements; empty by default.
 public var u: [U] = []
}

In [None]:
// export
// NOTE(review): several lines in this cell look truncated (`for i in 0.. …`), so the loop bodies
// and some declarations are not visible here; confirm against the original notebook.
extension DiffList: Equatable {
 /// Lists with different element counts compare unequal.
 public static func == (lhs: DiffList, rhs: DiffList) -> Bool {
 if lhs.u.count != rhs.u.count { return false }
 for i in 0.. DiffList {
 // Operands must have the same count unless one of them is empty.
 precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,
 "DiffList size mis-match: lhs: \(lhs.u.count), rhs: \(rhs.u.count)")
 // An empty operand is skipped: the other operand is returned unchanged.
 if lhs.u.count == 0 { return rhs }
 if rhs.u.count == 0 { return lhs }
 var output: [U] = []
 for i in 0.. (DiffList, (DiffList) -> (DiffList, DiffList)) {
 // Returns the sum together with a pullback. Only the operand counts are captured,
 // so the pullback can tell which operand was empty.
 return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in
 precondition(v.u.count == lhsCount || lhsCount == 0, 
 "DiffList gradient size mis-match: v: \(v.u.count), lhs: \(lhsCount)")
 precondition(v.u.count == rhsCount || rhsCount == 0,
 "DiffList gradient size mis-match: v: \(v.u.count), rhs: \(rhsCount)")
 var lhsOutput: [U] = []
 var rhsOutput: [U] = []
 // Unbroadcasting
 // An operand that was empty receives an empty gradient; otherwise it receives `v.u`.
 if lhsCount != 0 { lhsOutput = v.u }
 if rhsCount != 0 { rhsOutput = v.u }
 return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))
 })
 }

 @differentiable(vjp: _vjpSubtract(lhs:rhs:))
 public static func - (lhs: DiffList, rhs: DiffList) -> DiffList {
 // Operands must have the same count unless one of them is empty.
 precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,
 "DiffList size mis-match: lhs: \(lhs.u.count), rhs: \(rhs.u.count)")
 // NOTE(review): with an empty `lhs` this returns `rhs` as-is rather than a negated `rhs`;
 // confirm that this is intended for a subtraction.
 if lhs.u.count == 0 { return rhs }
 if rhs.u.count == 0 { return lhs }
 var output: [U] = []
 for i in 0.. (DiffList, (DiffList) -> (DiffList, DiffList)) {
 // NOTE(review): this looks like the subtraction VJP (the rhs gradient is negated below),
 // yet the value returned here is `lhs + rhs`; it probably should be `lhs - rhs` — confirm.
 return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in
 precondition(v.u.count == lhsCount || lhsCount == 0,
 "DiffList gradient size mis-match: v: \(v.u.count), lhs: \(lhsCount)")
 precondition(v.u.count == rhsCount || rhsCount == 0,
 "DiffList gradient size mis-match: v: \(v.u.count), rhs: \(rhsCount)")

 var lhsOutput: [U] = []
 var rhsOutput: [U] = []
 // Unbroadcasting
 // The lhs gradient is `v.u`; the rhs gradient is each element of `v.u` subtracted from zero.
 if lhsCount != 0 { lhsOutput = v.u }
 if rhsCount != 0 { rhsOutput = v.u.map({ U.zero - $0 }) }
 return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))
 })
 }
}

extension DiffList: VectorNumeric {
  public typealias Scalar = U.Scalar

  /// Scales every element of `rhs` by `lhs`, producing a list of the same length.
  public static func * (lhs: Scalar, rhs: DiffList) -> DiffList {
    let scaledElements = rhs.u.map { element in element * lhs }
    return DiffList(u: scaledElements)
  }
}

// `DiffList` serves as its own tangent, cotangent, and all-differentiable-variables type.
extension DiffList: Differentiable {
 public typealias TangentVector = DiffList
 public typealias CotangentVector = DiffList
 public typealias AllDifferentiableVariables = DiffList

 /// Returns `cotangent` unchanged, since the tangent and cotangent types are the same type.
 public func tangentVector(from cotangent: CotangentVector) -> TangentVector {
 return cotangent
 }
}

In [None]:
// export
import TensorFlow // Defines Layer.

In [None]:
// export

// NOTE(review): `U` is constrained below but never declared in this view — the generic
// parameter list appears to be missing; confirm against the original notebook.
/// A struct that contains a number of layers within it.
///
/// The constraints require each layer's input and output types to match, and its
/// `AllDifferentiableVariables` to be `VectorNumeric` and identical to its `CotangentVector`.
public struct SequentialLayer: KeyPathIterable where 
 U.Input == U.Output,
 U.AllDifferentiableVariables: VectorNumeric,
 U.AllDifferentiableVariables == U.CotangentVector {

 /// The wrapped layers, stored in the order given to `init(layers:)`.
 public var layers: [U]

 /// Creates a sequential layer from `layers`.
 public init(layers: [U]) {
 self.layers = layers
 }
}


In [None]:
// export
// Optional convenience: sequential layers compare equal when their `layers` arrays are equal.
extension SequentialLayer: Equatable where U: Equatable {
  public static func ==(lhs: SequentialLayer, rhs: SequentialLayer) -> Bool {
    // Same length and pairwise-equal elements, i.e. array equality.
    return lhs.layers.elementsEqual(rhs.layers)
  }
}


In [None]:
// export
// Mark SequentialLayer as Differentiable
// NOTE(review): the line starting `for i in 0.. Output {` below looks truncated; the rest of the
// setter and the header of the function that follows it are not visible in this view.
extension SequentialLayer: Differentiable {
 public typealias AllDifferentiableVariables = DiffList
 public typealias TangentVector = DiffList
 public typealias CotangentVector = DiffList

 /// Converts a cotangent into a tangent by pairing each layer with its cotangent element and
 /// calling that layer's `tangentVector(from:)`. Requires one cotangent element per layer.
 public func tangentVector(from cotangent: CotangentVector) -> TangentVector {
 precondition(cotangent.u.count == layers.count, "Differing # of layers: \(cotangent.u.count) and \(layers.count)")
 return DiffList(u: zip(layers, cotangent.u).map({ $0.0.tangentVector(from: $0.1) }))
 }

 /// Returns a new `SequentialLayer` in which each layer has been moved along its matching
 /// element of `direction`. Requires one direction element per layer.
 public func moved(along direction: TangentVector) -> SequentialLayer {
 precondition(direction.u.count == layers.count, "Differing # of layers: \(direction.u.count) and \(layers.count)")
 return SequentialLayer(layers: zip(layers, direction.u).map({ $0.0.moved(along: $0.1) }))
 }

 /// The differentiable variables of every layer, collected in layer order.
 public var allDifferentiableVariables: AllDifferentiableVariables {
 get { return DiffList(u: layers.map({ $0.allDifferentiableVariables })) }
 set {
 // The new value must provide exactly one element per layer.
 precondition(newValue.u.count == layers.count, "Differing # of layers: \(newValue.u.count) and \(layers.count)")
 for i in 0.. Output {
 // Forward pass: feed the input through each layer in order.
 var tmp = input
 for layer in layers { tmp = layer.applied(to: tmp, in: context) }
 return tmp
 }
 
 /// Runs the forward pass while recording each layer's pullback, and returns the output
 /// together with a combined pullback for the whole sequence.
 public func _appliedDifferentiating(to input: Input, in context: Context) -> (
 Output, (Output.CotangentVector) -> (CotangentVector, Input.CotangentVector)) {
 
 var pullbacks: [(U.Output.CotangentVector) -> (U.AllDifferentiableVariables, U.Input.CotangentVector)] = []
 var tmp = input
 // Forward pass: apply each layer and remember its pullback.
 for layer in layers {
 let (output, pullback) = Swift.valueWithPullback(at: layer, tmp) { layer, input in
 return layer.applied(to: input, in: context)
 }
 tmp = output
 pullbacks.append(pullback)
 }
 
 return (tmp, { input in
 var allDiffVars: [U.AllDifferentiableVariables] = []
 var tmp = input
 
 // Backward pass: run the pullbacks last-to-first, passing each layer's input cotangent
 // on to the pullback of the layer before it.
 for pb in pullbacks.reversed() {
 let (diffVars, input) = pb(tmp)
 tmp = input
 allDiffVars.append(diffVars)
 }
 
 // Gradients were collected last-to-first; reverse them back into layer order.
 return (DiffList(u: allDiffVars.reversed()), tmp)
 })
 }
}


In [None]:
// A model whose dense layers are held in a `SequentialLayer`, built from a list of hidden sizes.
// NOTE(review): the `layers` declaration and the `for i in 0..` line below look truncated;
// confirm against the original notebook.
struct MyModel: Layer {
 var layers: SequentialLayer>
 
 // Builds the dense layers and prints a "Making Dense(...)" line for the final one.
 // NOTE(review): if `hiddenUnits` is empty, `output` stays `outputSize`, so the final layer is
 // built as Dense(inputSize: outputSize, outputSize: outputSize) and `inputSize` looks unused —
 // presumably the loop does not run in that case; confirm.
 init(inputSize: Int, hiddenUnits: [Int], outputSize: Int) {
 // Make the dense layers.
 
 // TODO(saeta): Clean up this code.
 var input = inputSize
 var output = outputSize
 if hiddenUnits.count > 0 { output = hiddenUnits[0] }
 var layers: [Dense] = []
 for i in 0..(inputSize: \(input), outputSize: \(output))")
 layers.append(Dense(inputSize: input, outputSize: output))
 input = output
 }
 print("Making Dense(inputSize: \(output), outputSize: \(outputSize))")
 layers.append(Dense(inputSize: output, outputSize: outputSize))
 
 self.layers = SequentialLayer(layers: layers)
 }
 
 // Forwards the input to the wrapped sequential layer.
 @differentiable
 func applied(to input: Tensor, in context: Context) -> Tensor {
 return layers.applied(to: input, in: context)
 }
}

In [None]:
// Baseline model with two hard-coded dense layers (784 -> 30 -> 10).
struct FixedModel: Layer {
  var d1 = Dense(inputSize: 784, outputSize: 30)
  var d2 = Dense(inputSize: 30, outputSize: 10)

  // Applies `d1` and then `d2` to the input.
  @differentiable
  func applied(to input: Tensor, in context: Context) -> Tensor {
    return d2.applied(to: d1.applied(to: input, in: context), in: context)
  }
}

In [None]:
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz...
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz...
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz...
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz...


In [None]:
// Computes log(sum(exp(x))) along the last axis. The per-row maximum is subtracted before
// exponentiating and added back afterwards (the usual way to keep `exp` from overflowing).
func logSumExp(_ x: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{
  let rowMax = x.max(alongAxes: -1)
  let shiftedExp = exp(x-rowMax)
  let summed = shiftedExp.sum(alongAxes: -1)
  return rowMax + log(summed)
}
// Log-softmax: subtracts the log-sum-exp of the activations from the activations themselves.
func logSoftmax(_ activations: Tensor) -> Tensor where Scalar:TensorFlowFloatingPoint{
  let normalizer = logSumExp(activations)
  return activations - normalizer
}

In [None]:
let trainingContext = Context(learningPhase: .training)

In [None]:
// Hyperparameters and dataset dimensions shared by the training loops in this notebook.
let lr:Float = 0.5 // learning rate
let epochs = 1 // how many epochs to train for
let bs:Int32=64 // batch size
// `n` bounds the mini-batch index in the training loops.
let (n,m) = (60000,784) // MNIST dataset size


In [None]:
// Create the fixed model and keep a copy of its initial state, so that later cells can check
// whether training changed the weights.
var modelFixed = FixedModel()
let modelFixedStart = modelFixed

In [None]:
modelFixedStart.d1.weight == modelFixed.d1.weight

true


In [None]:
// Create the flexible model with one hidden layer of 30 units, and keep a copy of its initial
// state so that later cells can check whether training changed the weights.
var modelFlex = MyModel(inputSize: 784, hiddenUnits: [30], outputSize: 10)
let modelFlexStart = modelFlex

Making Dense(inputSize: 784, outputSize: 30)
Making Dense(inputSize: 30, outputSize: 10)


In [None]:
modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight

true


In [None]:
/// Returns the mean of the element-wise matches between the arg-max of `output` along axis 1
/// and `target`, i.e. the fraction of correct predictions.
public func accuracy(_ output: Tensor, _ target: Tensor) -> Tensor{
  let predicted = output.argmax(squeezingAxis: 1)
  let matches = Tensor(predicted .== target)
  return matches.mean()
}

In [None]:
let inferenceContext = Context(learningPhase: .inference)

In [None]:
accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)

0.193


In [None]:
accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)

0.1426


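The training loops below are copied from 03_minibatch. In the run recorded here they did not actually train either model: the accuracy and the weights are unchanged afterwards. :-( The likely cause is that each update is applied to a local copy of `allDifferentiableVariables`, which has no effect on the model unless the copy is assigned back.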

In [None]:
// Manual SGD for `modelFixed`: for each mini-batch, compute the gradients and take one step of
// size `lr` against them.
// NOTE(review): the `let xb = …` line looks truncated in this view; it is kept as found.
for epoch in 1...epochs{
    for i in 0..<((n-1)/Int(bs)){
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx.. Tensor in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        var parameters = modelFixed.allDifferentiableVariables
        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor.self){ 
            parameters[keyPath: kp] -= lr * grads[keyPath:kp]
        }
        // `parameters` is a copy of the model's variables, so the step above only changes the
        // copy. Write it back, otherwise the model never trains.
        modelFixed.allDifferentiableVariables = parameters
    }
}

In [None]:
// Manual SGD for `modelFlex`: for each mini-batch, compute the gradients and take one step of
// size `lr` against them.
// NOTE(review): the `let xb = …` line looks truncated in this view; it is kept as found.
for epoch in 1...epochs{
    for i in 0..<((n-1)/Int(bs)){
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx.. Tensor in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        var parameters = modelFlex.allDifferentiableVariables
        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor.self){ 
            parameters[keyPath: kp] -= lr * grads[keyPath: kp]
        }
        // `parameters` is a copy of the model's variables, so the step above only changes the
        // copy. Write it back, otherwise the model never trains.
        modelFlex.allDifferentiableVariables = parameters
    }
}

In [None]:
accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)

0.193


In [None]:
accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)

0.1426


In [None]:
modelFixedStart.d1.weight == modelFixed.d1.weight

true


In [None]:
modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight

true


In [None]:
// Library SGD optimizer for the fixed model, using the shared learning rate `lr`.
let optimizerFixed = SGD(learningRate: lr)
// let optimizerFlex = SGD(learningRate: lr) // SGD doesn't work for the new flex style models, due to the interaction between how .zero is defined, and keypathing.

In [None]:
/// A minimal stochastic gradient descent (SGD) optimizer.
///
/// Each update moves every tensor parameter against its gradient, scaled by `learningRate`.
/// There is no momentum, learning-rate decay, or Nesterov momentum.
public class SimpleSGD: Optimizer
    where Model.AllDifferentiableVariables == Model.CotangentVector {
    /// The learning rate.
    public var learningRate: Scalar

    /// Creates an optimizer with the given step size.
    ///
    /// - Parameter learningRate: The step size; must be non-negative (checked by a precondition).
    public init(
        learningRate: Scalar = 0.01
    ) {
        precondition(learningRate >= 0, "Learning rate must be non-negative")

        self.learningRate = learningRate
    }

    /// Takes one descent step on `model` in place.
    ///
    /// - Parameters:
    ///   - model: The variables to update.
    ///   - direction: The gradient with respect to `model`.
    public func update(_ model: inout Model.AllDifferentiableVariables,
                       along direction: Model.CotangentVector) {
        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor.self) {
            // Subtract the scaled gradient. The earlier `-= -learningRate * …` added it instead,
            // which climbs the loss rather than descending it.
            model[keyPath: kp] -= learningRate * direction[keyPath: kp]
        }
    }
}


In [None]:
let simpleOptFlex = SimpleSGD(learningRate: lr)

In [None]:
// Trains `modelFixed` with `optimizerFixed`: for each mini-batch, compute the gradients and let
// the optimizer update the model's variables in place.
// NOTE(review): the `let xb = …` line looks truncated in this view.
for epoch in 1...epochs{
 for i in 0..<((n-1)/Int(bs)){
 let startIdx = Int32(i) * bs
 let endIdx = startIdx + bs
 let xb = xTrain[startIdx.. Tensor in
 let preds = model.applied(to: xb, in: trainingContext)
 return softmaxCrossEntropy(logits: preds, labels: yb)
 }
 // The variables are passed `inout`, so the optimizer mutates the model directly.
 optimizerFixed.update(&modelFixed.allDifferentiableVariables, along: grads)
 }
}

In [None]:
accuracy(modelFixed.applied(to: xValid, in: inferenceContext), yValid)

0.098


In [None]:
modelFixedStart.d1.weight == modelFixed.d1.weight

false


In [None]:
// Trains `modelFlex` with `simpleOptFlex`: for each mini-batch, compute the gradients and let
// the optimizer update the model's variables in place.
// NOTE(review): the `let xb = …` line looks truncated in this view.
for epoch in 1...epochs{
 for i in 0..<((n-1)/Int(bs)){
 let startIdx = Int32(i) * bs
 let endIdx = startIdx + bs
 let xb = xTrain[startIdx.. Tensor in
 let preds = model.applied(to: xb, in: trainingContext)
 return softmaxCrossEntropy(logits: preds, labels: yb)
 }
 // The library SGD call is left disabled; `simpleOptFlex` is used instead.
// optimizerFlex.update(&modelFlex.allDifferentiableVariables, along: grads)
 simpleOptFlex.update(&modelFlex.allDifferentiableVariables, along: grads)
 }
}

In [None]:
accuracy(modelFlex.applied(to: xValid, in: inferenceContext), yValid)

0.098


In [None]:
modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight

false
