{
"cells": [
{
"cell_type": "markdown",
"id": "49b43f1d",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"# Weight Decay\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e143c528",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:45:33.457379Z",
"iopub.status.busy": "2023-08-18T19:45:33.456856Z",
"iopub.status.idle": "2023-08-18T19:45:36.417497Z",
"shell.execute_reply": "2023-08-18T19:45:36.416229Z"
},
"origin_pos": 3,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import torch\n",
"from torch import nn\n",
"from d2l import torch as d2l"
]
},
{
"cell_type": "markdown",
"id": "0b07ca10",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Rather than directly manipulating the number of parameters,\n",
"*weight decay* operates by restricting the values\n",
"that the parameters can take.\n",
"Generate some data as before:\n",
"$$y = 0.05 + \\sum_{i = 1}^d 0.01 x_i + \\epsilon \\textrm{ where }\n",
"\\epsilon \\sim \\mathcal{N}(0, 0.01^2)$$"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c254bc8e",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:45:36.422127Z",
"iopub.status.busy": "2023-08-18T19:45:36.421372Z",
"iopub.status.idle": "2023-08-18T19:45:36.428080Z",
"shell.execute_reply": "2023-08-18T19:45:36.427182Z"
},
"origin_pos": 7,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"class Data(d2l.DataModule):\n",
" def __init__(self, num_train, num_val, num_inputs, batch_size):\n",
" self.save_hyperparameters()\n",
" n = num_train + num_val\n",
" self.X = torch.randn(n, num_inputs)\n",
" noise = torch.randn(n, 1) * 0.01\n",
" w, b = torch.ones((num_inputs, 1)) * 0.01, 0.05\n",
" self.y = torch.matmul(self.X, w) + b + noise\n",
"\n",
" def get_dataloader(self, train):\n",
" i = slice(0, self.num_train) if train else slice(self.num_train, None)\n",
" return self.get_tensorloader([self.X, self.y], train, i)"
]
},
{
"cell_type": "markdown",
"id": "7122e163",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"Defining $\\ell_2$ Norm Penalty"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "11c12f35",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:45:36.445796Z",
"iopub.status.busy": "2023-08-18T19:45:36.445315Z",
"iopub.status.idle": "2023-08-18T19:45:36.475713Z",
"shell.execute_reply": "2023-08-18T19:45:36.474732Z"
},
"origin_pos": 14,
"tab": [
"pytorch"
]
},
"outputs": [],
"source": [
"def l2_penalty(w):\n",
"    \"\"\"Squared L2 penalty ||w||^2 / 2 (the 1/2 makes the gradient just w).\"\"\"\n",
"    return (w ** 2).sum() / 2\n",
"\n",
"class WeightDecayScratch(d2l.LinearRegressionScratch):\n",
"    \"\"\"Linear regression with an explicit L2 penalty added to the loss.\"\"\"\n",
"    def __init__(self, num_inputs, lambd, lr, sigma=0.01):\n",
"        super().__init__(num_inputs, lr, sigma)\n",
"        self.save_hyperparameters()\n",
"\n",
"    def loss(self, y_hat, y):\n",
"        # Base loss plus lambd * ||w||^2 / 2; lambd=0 recovers the\n",
"        # unregularized model.\n",
"        return (super().loss(y_hat, y) +\n",
"                self.lambd * l2_penalty(self.w))\n",
"\n",
"data = Data(num_train=20, num_val=100, num_inputs=200, batch_size=5)\n",
"trainer = d2l.Trainer(max_epochs=10)\n",
"\n",
"def train_scratch(lambd):\n",
"    \"\"\"Fit the scratch model with weight-decay strength ``lambd`` and\n",
"    report the size of the learned weights.\"\"\"\n",
"    # Derive the width from `data` instead of repeating the magic number 200,\n",
"    # so changing Data(...) above cannot silently desynchronize the model.\n",
"    model = WeightDecayScratch(num_inputs=data.num_inputs, lambd=lambd, lr=0.01)\n",
"    model.board.yscale = 'log'\n",
"    trainer.fit(model, data)\n",
"    # NOTE(review): the printed value is the penalty ||w||^2 / 2, not the\n",
"    # L2 norm itself; label kept to preserve the recorded output.\n",
"    print('L2 norm of w:', float(l2_penalty(model.w)))"
]
},
{
"cell_type": "markdown",
"id": "412ae74a",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Training without Regularization"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e63ee5d3",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-18T19:45:36.479194Z",
"iopub.status.busy": "2023-08-18T19:45:36.478880Z",
"iopub.status.idle": "2023-08-18T19:45:47.229507Z",
"shell.execute_reply": "2023-08-18T19:45:47.228580Z"
},
"origin_pos": 16,
"tab": [
"pytorch"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L2 norm of w: 0.009948714636266232\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n"
],
"text/plain": [
"