# What are Tensors?

In [9]:
# -*- coding: utf-8 -*-
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
 # Forward pass: compute predicted y
 h = x.dot(w1)
 h_relu = np.maximum(h, 0)
 y_pred = h_relu.dot(w2)

 # Compute and print loss
 loss = np.square(y_pred - y).sum()
 print(t, loss)

 # Backprop to compute gradients of w1 and w2 with respect to loss
 grad_y_pred = 2.0 * (y_pred - y)
 grad_w2 = h_relu.T.dot(grad_y_pred)
 grad_h_relu = grad_y_pred.dot(w2.T)
 grad_h = grad_h_relu.copy()
 grad_h[h < 0] = 0
 grad_w1 = x.T.dot(grad_h)

 # Update weights
 w1 -= learning_rate * grad_w1
 w2 -= learning_rate * grad_w2

0 29261998.9383
1 23584624.4749
2 21318274.0133
3 19389745.5408
4 16479856.1687
5 12805039.2482
6 9059166.91546
7 6042659.8759
8 3908408.60775
9 2553920.39789
10 1723204.06721
11 1219705.10145
12 906659.056268
13 704582.301008
14 567415.897123
15 469502.722688
16 396243.703489
17 339183.787367
18 293384.908371
19 255753.24473
20 224375.289442
21 197817.587324
22 175121.073496
23 155577.723508
24 138727.89154
25 124054.575745
26 111219.330545
27 99943.0384346
28 90002.3975585
29 81206.7719005
30 73409.0380627
31 66473.3112012
32 60296.3106408
33 54785.7768329
34 49859.0677676
35 45441.2604793
36 41474.757966
37 37900.6254289
38 34674.8838041
39 31756.4912462
40 29118.0035071
41 26731.4581525
42 24563.2300185
43 22591.0640449
44 20795.1155897
45 19157.9008332
46 17663.3262804
47 16297.9786927
48 15048.7541864
49 13904.2761665
50 12855.5370557
51 11893.5831871
52 11009.8840228
53 10198.5198944
54 9452.97741562
55 8766.63119037
56 8134.73416199
57 7552.22098812
58 7015.3292248
59 6519.7532

462 1.19238477354e-05
463 1.14429095618e-05
464 1.098163353e-05
465 1.05391013727e-05
466 1.01146537383e-05
467 9.70748474576e-06
468 9.31687028387e-06
469 8.94210531477e-06
470 8.5825945558e-06
471 8.23769054161e-06
472 7.90680480541e-06
473 7.58936772541e-06
474 7.28473974087e-06
475 6.99252291117e-06
476 6.71209205042e-06
477 6.4430614509e-06
478 6.18492658291e-06
479 5.93721360396e-06
480 5.69951164811e-06
481 5.47142875981e-06
482 5.25255539132e-06
483 5.04254555597e-06
484 4.84100017805e-06
485 4.64756344346e-06
486 4.46194790682e-06
487 4.28379534403e-06
488 4.11286394305e-06
489 3.9487860965e-06
490 3.79132018994e-06
491 3.64017549663e-06
492 3.49512942501e-06
493 3.35590547435e-06
494 3.22230253639e-06
495 3.0940368917e-06
496 2.97092430139e-06
497 2.85274840761e-06
498 2.73931894319e-06
499 2.63045198276e-06


# PyTorch Tensors

Clearly modern deep neural networks are in need of more than what our beloved numpy can offer.

Here we introduce the most fundamental PyTorch concept: the *Tensor*. A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is an n-dimensional array, and PyTorch provides many functions for operating on these Tensors. Like numpy arrays, PyTorch Tensors do not know anything about deep learning or computational graphs or gradients; they are a generic tool for scientific computing.

However unlike numpy, PyTorch Tensors can utilize GPUs to accelerate their numeric computations. To run a PyTorch Tensor on GPU, you simply need to cast it to a new datatype.

Here we use PyTorch Tensors to fit a two-layer network to random data. Like the numpy example above we need to manually implement the forward and backward passes through the network:

In [8]:
import torch
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
 # Forward pass: compute predicted y
 h = x.mm(w1)
 h_relu = h.clamp(min=0)
 y_pred = h_relu.mm(w2)

 # Compute and print loss
 loss = (y_pred - y).pow(2).sum()
 print(t, loss)

 # Backprop to compute gradients of w1 and w2 with respect to loss
 grad_y_pred = 2.0 * (y_pred - y)
 grad_w2 = h_relu.t().mm(grad_y_pred)
 grad_h_relu = grad_y_pred.mm(w2.t())
 grad_h = grad_h_relu.clone()
 grad_h[h < 0] = 0
 grad_w1 = x.t().mm(grad_h)

 # Update weights using gradient descent
 w1 -= learning_rate * grad_w1
 w2 -= learning_rate * grad_w2

0 28214897.691271067
1 25380405.792548403
2 26288556.067442656
3 27187362.93774879
4 25326431.49736169
5 20070726.423171997
6 13438367.445337629
7 7935834.941528201
8 4453037.240495725
9 2567232.1655493514
10 1604364.933374187
11 1106295.9881061036
12 831370.3628886025
13 664479.3320915042
14 552383.0191260207
15 470307.21917449264
16 406323.70261433884
17 354377.92758273566
18 311124.330613622
19 274515.3858363455
20 243215.3152763464
21 216254.64485477417
22 192876.48988408546
23 172511.55349881982
24 154696.59197369026
25 139077.64419030334
26 125326.40331724554
27 113168.27359832195
28 102388.44114990594
29 92802.5217316554
30 84252.873688431
31 76614.83165994265
32 69777.57502200827
33 63643.55059441269
34 58122.45377116208
35 53149.5297017009
36 48661.48595352931
37 44605.11924878636
38 40936.86403570355
39 37612.1624785422
40 34589.84976270138
41 31842.023658028404
42 29339.426460701798
43 27055.76113430076
44 24971.357019224655
45 23066.443739543673
46 21322.47401335786
47 1972

453 8.406219949108618e-05
454 8.268637916582222e-05
455 8.105226215794625e-05
456 7.962860940187444e-05
457 7.804827419660709e-05
458 7.654020379489757e-05
459 7.561118885025808e-05
460 7.45004278926431e-05
461 7.300097409869422e-05
462 7.156340142840112e-05
463 7.07410268018932e-05
464 6.95669895526968e-05
465 6.816465251008319e-05
466 6.710678036121742e-05
467 6.58694634003143e-05
468 6.480972211669878e-05
469 6.364145772613794e-05
470 6.294719978224006e-05
471 6.208284610231818e-05
472 6.100992742692768e-05
473 6.030514397714626e-05
474 5.950513809528657e-05
475 5.863842784931128e-05
476 5.7600118031853054e-05
477 5.656285652166915e-05
478 5.575245490059555e-05
479 5.4907743583479385e-05
480 5.450484055091742e-05
481 5.38567654812111e-05
482 5.3109503177953266e-05
483 5.230015415125244e-05
484 5.148949327894725e-05
485 5.0706583465745525e-05
486 4.996987102404149e-05
487 4.9266432966016405e-05
488 4.862910026705303e-05
489 4.8046019087769065e-05
490 4.737298535380241e-05
491 4.66903

# Autograd
PyTorch variables and autograd. Autograd package provides cool functionality as the forward pass of your network defines the computational graph; nodes in the graph will be Tensors and edges will be functions that produce output Tensors from input Tensors. Backprop through this graph then allows us to easily compue gradients.

Here we wrap the PyTorch Tensor in a Variable object; where Vaiabel represents a node in the computational graph. if x is a variable then x.data is a Tensor and x.grad is another Varialble holding the gradient of x w.r.t to some scalar value.

PyTorch Variables have samer API as PyTorch Tensots: any operation that you can do with Tensor, also works fine with Variables, difference only being that the Variable defines a computational graph, allowing us to automatically compute gradients.

In [7]:
# Use of Vaiables and Autograd in a 2-layer network with no need to manually implement backprop!
import torch
from torch.autograd import Variable
dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs and wrap them in Variables.

x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False) # requires_grad=False means no need to compute gradients
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors to hold weights and wrap them in Variables.
# requires_grad=True here to compute gradients w.r.t Variables during a backprop pass.

w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True) # requires_grad=False means no need to compute gradients
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
 # Forward pass: compute predicted y using operations on Variables; these
 # are exactly the same operations we used to compute the forward pass using
 # Tensors, but we do not need to keep references to intermediate values since
 # we are not implementing the backward pass by hand.
 y_pred = x.mm(w1).clamp(min=0).mm(w2)

 # Compute and print loss using operations on Variables.
 # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape
 # (1,); loss.data[0] is a scalar value holding the loss.
 loss = (y_pred - y).pow(2).sum()
 print(t, loss.data[0])

 # Use autograd to compute the backward pass. This call will compute the
 # gradient of loss with respect to all Variables with requires_grad=True.
 # After this call w1.grad and w2.grad will be Variables holding the gradient
 # of the loss with respect to w1 and w2 respectively.
 loss.backward()

 # Update weights using gradient descent; w1.data and w2.data are Tensors,
 # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
 # Tensors.
 w1.data -= learning_rate * w1.grad.data
 w2.data -= learning_rate * w2.grad.data

 # Manually zero the gradients after updating weights
 w1.grad.data.zero_()
 w2.grad.data.zero_()

0 35878500.0
1 33502642.0
2 31638146.0
3 26216880.0
4 18097450.0
5 10643111.0
6 5868223.0
7 3356485.0
8 2129793.5
9 1508282.875
10 1160753.375
11 940967.3125
12 785975.375
13 668166.125
14 574389.8125
15 497736.9375
16 433985.8125
17 380330.0
18 334801.71875
19 295919.6875
20 262469.40625
21 233624.078125
22 208602.84375
23 186785.21875
24 167705.6875
25 150947.25
26 136179.03125
27 123118.4375
28 111543.015625
29 101252.5
30 92084.828125
31 83890.2109375
32 76550.203125
33 69970.4609375
34 64056.62109375
35 58728.921875
36 53917.6015625
37 49565.42578125
38 45631.03515625
39 42059.48046875
40 38813.0390625
41 35858.09765625
42 33163.74609375
43 30702.73828125
44 28452.41796875
45 26393.0234375
46 24505.55078125
47 22772.90234375
48 21181.724609375
49 19717.416015625
50 18369.517578125
51 17127.080078125
52 15980.390625
53 14921.2587890625
54 13942.4697265625
55 13036.6015625
56 12197.0810546875
57 11419.048828125
58 10696.9755859375
59 10026.7861328125
60 9403.92578125
61 8824.4482421

401 0.006566936150193214
402 0.0063502490520477295
403 0.0061323679983615875
404 0.005927860736846924
405 0.00572930509224534
406 0.005533153191208839
407 0.005349132232367992
408 0.0051721855998039246
409 0.004995665047317743
410 0.004830839112401009
411 0.00467012170702219
412 0.004517041612416506
413 0.0043692924082279205
414 0.004221327602863312
415 0.004083284642547369
416 0.003950608428567648
417 0.0038255397230386734
418 0.003699194174259901
419 0.003574871923774481
420 0.0034629858564585447
421 0.003353290958330035
422 0.003244665451347828
423 0.003142776433378458
424 0.00304229324683547
425 0.002945477142930031
426 0.0028537893667817116
427 0.002764546312391758
428 0.002679029945284128
429 0.002593627432361245
430 0.002514220541343093
431 0.002438138471916318
432 0.0023646263871341944
433 0.0022932353895157576
434 0.002221874427050352
435 0.0021538916043937206
436 0.00209184642881155
437 0.0020262051839381456
438 0.0019666266161948442
439 0.0019075790187343955
440 0.0018491483

# PyTorch: Defining new autograd functions
Under the hood, each primitive autograd operator is really two functions that operate on Tensors. The forward function computes output Tensors from input Tensors. The backward function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

In PyTorch we can easily define our own autograd operator by defining a subclass of torch.autograd.Function and implementing the forward and backward functions. We can then use our new autograd operator by constructing an instance and calling it like a function, passing Variables containing input data.

In this example we define our own custom autograd function for performing the ReLU nonlinearity, and use it to implement our two-layer network:



In [8]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
 """
 We can implement our own custom autograd Functions by subclassing
 torch.autograd.Function and implementing the forward and backward passes
 which operate on Tensors.
 """

 def forward(self, input):
 """
 In the forward pass we receive a Tensor containing the input and return a
 Tensor containing the output. You can cache arbitrary Tensors for use in the
 backward pass using the save_for_backward method.
 """
 self.save_for_backward(input)
 return input.clamp(min=0)

 def backward(self, grad_output):
 """
 In the backward pass we receive a Tensor containing the gradient of the loss
 with respect to the output, and we need to compute the gradient of the loss
 with respect to the input.
 """
 input, = self.saved_tensors
 grad_input = grad_output.clone()
 grad_input[input < 0] = 0
 return grad_input


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
 # Construct an instance of our MyReLU class to use in our network
 relu = MyReLU()

 # Forward pass: compute predicted y using operations on Variables; we compute
 # ReLU using our custom autograd operation.
 y_pred = relu(x.mm(w1)).mm(w2)

 # Compute and print loss
 loss = (y_pred - y).pow(2).sum()
 print(t, loss.data[0])

 # Use autograd to compute the backward pass.
 loss.backward()

 # Update weights using gradient descent
 w1.data -= learning_rate * w1.grad.data
 w2.data -= learning_rate * w2.grad.data

 # Manually zero the gradients after updating weights
 w1.grad.data.zero_()
 w2.grad.data.zero_()

0 37267740.0
1 35764716.0
2 35199480.0
3 30134798.0
4 20876230.0
5 11940865.0
6 6248357.5
7 3411474.25
8 2109929.75
9 1486262.5
10 1147416.0
11 933659.25
12 781843.3125
13 665453.875
14 572186.3125
15 495587.15625
16 431790.71875
17 378165.53125
18 332707.5
19 293927.75
20 260602.078125
21 231805.1875
22 206837.75
23 185119.71875
24 166225.28125
25 149666.796875
26 135110.765625
27 122273.4140625
28 110904.5703125
29 100806.375
30 91813.03125
31 83787.9921875
32 76605.484375
33 70168.1953125
34 64381.640625
35 59169.4453125
36 54462.21875
37 50205.0546875
38 46347.6328125
39 42845.19921875
40 39659.359375
41 36757.4609375
42 34109.640625
43 31690.05078125
44 29476.259765625
45 27446.322265625
46 25583.3671875
47 23871.591796875
48 22297.443359375
49 20848.37109375
50 19513.296875
51 18281.951171875
52 17144.08203125
53 16093.103515625
54 15118.5703125
55 14213.7685546875
56 13372.318359375
57 12589.6640625
58 11861.248046875
59 11182.630859375
60 10549.470703125
61 9958.626953125
62 94

401 0.03092949651181698
402 0.029925189912319183
403 0.028983892872929573
404 0.028051339089870453
405 0.027159346267580986
406 0.026293398812413216
407 0.02546057477593422
408 0.024651827290654182
409 0.023875653743743896
410 0.02311556600034237
411 0.022379254922270775
412 0.021676119416952133
413 0.0209877360612154
414 0.020319685339927673
415 0.01967649720609188
416 0.019053490832448006
417 0.018447471782565117
418 0.017870858311653137
419 0.01730620674788952
420 0.01676577515900135
421 0.01624283567070961
422 0.0157344788312912
423 0.015246149152517319
424 0.014768954366445541
425 0.01429677102714777
426 0.013850965537130833
427 0.013418878428637981
428 0.013010782189667225
429 0.012603594921529293
430 0.012216034345328808
431 0.011834518052637577
432 0.01147628203034401
433 0.011118064634501934
434 0.0107742203399539
435 0.010442456230521202
436 0.010112447664141655
437 0.009803086519241333
438 0.009505126625299454
439 0.009210986085236073
440 0.008932799100875854
441 0.008662850

## What is a nn module
When building neural networks we frequently think of arranging the computation into layers, some of which have learnable parameters which will be optimized during learning.

In TensorFlow, packages like Keras, TensorFlow-Slim, and TFLearn provide higher-level abstractions over raw computational graphs that are useful for building neural networks.

In PyTorch, the nn package serves this same purpose. The nn package defines a set of Modules, which are roughly equivalent to neural network layers. A Module receives input Variables and computes output Variables, but may also hold internal state such as Variables containing learnable parameters. The nn package also defines a set of useful loss functions that are commonly used when training neural networks.

In this example we use the nn package to implement our two-layer network:

In [7]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
 torch.nn.Linear(D_in, H),
 torch.nn.ReLU(),
 torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
 # Forward pass: compute predicted y by passing x to the model. Module objects
 # override the __call__ operator so you can call them like functions. When
 # doing so you pass a Variable of input data to the Module and it produces
 # a Variable of output data.
 y_pred = model(x)

 # Compute and print loss. We pass Variables containing the predicted and true
 # values of y, and the loss function returns a Variable containing the
 # loss.
 loss = loss_fn(y_pred, y)
 print(t, loss.data[0])

 # Zero the gradients before running the backward pass.
 model.zero_grad()

 # Backward pass: compute gradient of the loss with respect to all the learnable
 # parameters of the model. Internally, the parameters of each Module are stored
 # in Variables with requires_grad=True, so this call will compute gradients for
 # all learnable parameters in the model.
 loss.backward()

 # Update the weights using gradient descent. Each parameter is a Variable, so
 # we can access its data and gradients like we did before.
 for param in model.parameters():
 param.data -= learning_rate * param.grad.data

0 680.08154296875
1 628.8499755859375
2 584.1482543945312
3 544.8362426757812
4 509.60052490234375
5 477.94586181640625
6 449.4169616699219
7 423.177734375
8 398.7761535644531
9 376.30096435546875
10 355.3580322265625
11 335.78729248046875
12 317.5242614746094
13 300.2922058105469
14 283.9698181152344
15 268.5042419433594
16 253.79812622070312
17 239.88864135742188
18 226.66624450683594
19 214.0961151123047
20 202.15972900390625
21 190.82431030273438
22 180.07672119140625
23 169.84349060058594
24 160.177734375
25 151.01641845703125
26 142.33706665039062
27 134.12139892578125
28 126.35624694824219
29 119.03170776367188
30 112.15123748779297
31 105.64317321777344
32 99.51466369628906
33 93.72786712646484
34 88.27943420410156
35 83.15460205078125
36 78.34161376953125
37 73.80974578857422
38 69.55596923828125
39 65.56876373291016
40 61.8112907409668
41 58.278228759765625
42 54.950050354003906
43 51.81352615356445
44 48.865779876708984
45 46.08952713012695
46 43.48237609863281
47 41.0322303

378 0.00019453163258731365
379 0.0001887652324512601
380 0.0001831761037465185
381 0.00017774860316421837
382 0.00017248367657884955
383 0.00016738024714868516
384 0.00016242482524830848
385 0.0001576153008500114
386 0.0001529530854895711
387 0.00014842944801785052
388 0.00014404467947315425
389 0.00013978010974824429
390 0.00013565001427195966
391 0.00013164129632059485
392 0.00012775001232512295
393 0.0001239780249306932
394 0.00012031249207211658
395 0.00011676689609885216
396 0.00011332175199640915
397 0.00010997249773936346
398 0.00010673052747733891
399 0.00010358005238231272
400 0.00010052488505607471
401 9.756081271916628e-05
402 9.469027281738818e-05
403 9.190698619931936e-05
404 8.919845276977867e-05
405 8.657083526486531e-05
406 8.402206731261685e-05
407 8.154464012477547e-05
408 7.914425077615306e-05
409 7.681240822421387e-05
410 7.455307786585763e-05
411 7.236027886392549e-05
412 7.022878708085045e-05
413 6.816528912167996e-05
414 6.616451719310135e-05
415 6.4219391788356e

## PyTorch - optim
With learning rate of $1e-4$


In [3]:
import torch
from torch.autograd import Variable

N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

model = torch.nn.Sequential( torch.nn.Linear(D_in, H),
 torch.nn.ReLU(),
 torch.nn.Linear(H, D_out)
 )

loss_fxn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [8]:
# We loop

for i in range(500):
 y_pred = model(x)
 loss = loss_fxn(y_pred, y)
 print(t, loss.data[0])
 
 optimizer.zero_grad()
 loss.backward()
 optimizer.step()

499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239

499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239514988381e-06
499 5.173239

## Custom nn module

For more complex computation, you can define your own module by subclassing nn.Module

In [36]:
import torch
from torch.autograd import Variable

class DoubleLayerNet(torch.nn.Module):
 def __init__(self, D_in, H, D_out):
 # initialize 2 instances of nn.Linear mods
 super(DoubleLayerNet, self).__init__()
 self.linear1 = torch.nn.Linear(D_in, H)
 self.linear2 = torch.nn.Linear(H, D_out)
 
 def forward(self, x):
 # in this fxn we accept a Var of input data and
 # return a Var of output data.
 h_relu = self.linear1(x).clamp(min=0)
 y_pred = self.linear2(h_relu)
 return y_pred

# Next, again as usual, define batch size, input dimensions, hidden dimension and output dimension

N, D_in, H, D_out = 64, 1000, 100, 10

# Create some random tensors to hold both input and output

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Build model by instantiating class defined above
my_model = DoubleLayerNet(D_in, H, D_out)

# Build loss fxn and optimizer

criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

# and then we loop

for i in range(500):
 # fwd pass, calculate predicted y by passing x to the model
 y_pred = my_model(x)
 
 #calculate and print loss
 loss = criteria(y_pred, y)
 print(t, loss.data[0])
 
 # Zero gradients, performs a backprop pass and update the weights as it goe along
 optimizer.zero_grad()
 loss.backward()
 optimizer.step()

0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607421875
0 656.797607