{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Regularisation in NNsĀ¶" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before we start doing anything, I think it's important to understand for NLP, this is the intuitive process on what we are trying to do when we are processing our data in the IMDB dataset:\n", "1. Tokenization: break sentence into individual words\n", " - Before: `\"PyTorch seems really easy to use!\"`\n", " - After: `[\"PyTorch\", \"seems\", \"really\", \"easy\", \"to\", \"use\", \"!\"]`\n", "2. Building vocabulary: build an index of words associated with unique numbers\n", " - Before: `[\"PyTorch\", \"seems\", \"really\", \"easy\", \"to\", \"use\", \"!\"]`\n", " - After: `{\"Pytorch: 0, \"seems\": 1, \"really\": 2, ...}`\n", "3. Convert to numerals: map words to unique numbers (indices)\n", " - Before: `{\"Pytorch: 0, \"seems\": 1, \"really\": 2, ...}`\n", " - After: `[0, 1, 2, ...]`\n", "4. Embedding look-up: map sentences (indices now) to fixed matrices\n", " - ```[[0.1, 0.4, 0.3],\n", " [0.8, 0.1, 0.5],\n", " ...]```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Critical plotting imports\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "# PyTorch imports\n", "from torchtext import data, datasets\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "\n", "# Checking for iterable objects\n", "import collections\n", "import random" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set seed\n", "torch.manual_seed(1337)\n", "if torch.cuda.is_available():\n", " torch.cuda.manual_seed_all(1337)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set plotting style\n", "plt.style.use(('dark_background', 'bmh'))\n", "plt.rc('axes', facecolor='none')\n", "plt.rc('figure', figsize=(16, 4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create instances of fields\n", "# The important field here is fix_length: all examples using this field will be padded to, or None for flexible sequence lengths\n", "# We are fixing this because we will be using a FNN not an LSTM/RNN/GRU where we can go through uneven sequence lengths\n", "max_len = 80\n", "text = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)\n", "label = data.LabelField(sequential=False, dtype=torch.float)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Calling splits() class method of datasets.IMDB to return a torchtext.data.Dataset object\n", "datasets.IMDB.download('./')\n", "ds_train, ds_test = datasets.IMDB.splits(text, label, path='./imdb/aclImdb/')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Training and test set each 25k samples\n", "# 2 fields due to the way we split above\n", "print('train : ', len(ds_train))\n", "print('test : ', len(ds_test))\n", "print('train.fields :', ds_train.fields)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get validation set\n", "seed_num = 1337\n", "ds_train, ds_valid = ds_train.split(random_state=random.seed(seed_num))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Now we've training, validation and test set\n", "print('train : ', 
len(ds_train))\n", "print('valid : ', len(ds_valid))\n", "print('valid : ', len(ds_test))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Build vocabulary\n", "# num_words = 25000\n", "num_words = 1000\n", "text.build_vocab(ds_train, max_size=num_words)\n", "label.build_vocab(ds_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print vocab size\n", "print('Vocabulary size: {}'.format(len(text.vocab)))\n", "print('Label size: {}'.format(len(label.vocab)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print most common vocabulary text\n", "most_common_samples = 10\n", "print(text.vocab.freqs.most_common(most_common_samples))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print most common labels\n", "print(label.vocab.freqs.most_common())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sample 0 label\n", "ds_train[0].label" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Sample 0 text: broken down into individual portions\n", "ds_train[0].text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sample 0 text: human readeable sample\n", "def show_text(sample):\n", " print(' '.join(word for word in sample))\n", " \n", "show_text(ds_train[0].text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create and iterable object for our training, validation and testing datasets\n", "# Batches examples of similar lengths together that minimizes amount of padding needed\n", "batch_size = 64 # Change batch size from 1 to bigger number once explanation is done\n", "train_loader, valid_loader, test_loader = data.BucketIterator.splits(\n", " (ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check if iterator above is an iterable which should show True\n", "isinstance(train_loader, collections.Iterable)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What's inside this iteratable object? Our text and label although now everything is in machine format (not \"words\") but in numbers!\n", "# The text we saw above becomes a matrix of size 1 x 80 represented by the fixed length we defined before that\n", "list(train_loader)[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alternative to above, this is much faster but the above code is easy to understand and implement\n", "next(train_loader.__iter__())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_batch = next(train_loader.__iter__())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What methods can we call on this batch object? 
Text and label\n", "test_batch.fields" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Let's break this down to check what's in a batch\n", "test_batch.text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 1 comment per batch, each comment is limited to a size of 80 as we've defined\n", "test_batch.text.size()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_batch.label" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Extremely weird problem in torchtext where BucketIterator returns a Batch object versus just a simple tuple of tensors containing our text index and labels\n", "# So let's fix this with a new class FixBatchGenerator\n", "\n", "class FixBatchGenerator:\n", " def __init__(self, dl, x_field, y_field):\n", " self.dl, self.x_field, self.y_field = dl, x_field, y_field\n", " \n", " def __len__(self):\n", " return len(self.dl)\n", " \n", " def __iter__(self):\n", " for batch in self.dl:\n", " X = getattr(batch, self.x_field)\n", " y = getattr(batch, self.y_field)\n", " yield (X,y)\n", " \n", "train_loader, valid_loader, test_loader = FixBatchGenerator(train_loader, 'text', 'label'), FixBatchGenerator(valid_loader, 'text', 'label'), FixBatchGenerator(test_loader, 'text', 'label')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Text index\n", "print(next(train_loader.__iter__())[0])\n", "\n", "# Text label\n", "print(next(train_loader.__iter__())[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class FeedforwardNeuralNetModel(nn.Module):\n", " def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):\n", " super(FeedforwardNeuralNetModel, self).__init__()\n", " # Embedding layer\n", " self.embedding = nn.Embedding(input_dim, embedding_dim)\n", " \n", " # Linear function\n", " self.fc1 = nn.Linear(embedding_dim*embedding_dim, hidden_dim) \n", "\n", " # Linear function (readout)\n", " self.fc2 = nn.Linear(hidden_dim, output_dim)\n", " \n", " def forward(self, x):\n", " # Embedding\n", " embedded = self.embedding(x)\n", " embedded = embedded.view(-1, embedding_dim*embedding_dim)\n", " # Linear function\n", " out = self.fc1(embedded)\n", "\n", " # Non-linearity\n", " out = torch.relu(out)\n", " \n", " # Toggle 3: Dropout\n", " # out = torch.dropout(out, 0.8)\n", "\n", " # Linear function (readout)\n", " # Take note here use a final sigmoid function so your loss should not go through sigmoid again.\n", " # BCELoss is the right class to use as it doesn't pass your output through a sigmoid function again.\n", " # In multi-class problems you're used to softmax which can be simplified to a logistic,\n", " # function when you have a two-class problem.\n", " out = self.fc2(out)\n", " out = torch.sigmoid(out)\n", " \n", " return out" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "input_dim = num_words + 2\n", "embedding_dim = max_len\n", "hidden_dim = 32\n", "output_dim = 1\n", "\n", "# Instantiate model class and assign to object\n", "model = FeedforwardNeuralNetModel(input_dim, embedding_dim, hidden_dim, output_dim)\n", "\n", "# Push model to CUDA device if available\n", "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)\n", "\n", "# Loss function\n", "criterion = nn.BCELoss()\n", 
"\n", "# Optimizer\n", "# Toggle 2: L2 Norm option - this is called weight decay\n", "# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.005)\n", "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Number of groups of parameters\n", "print('Number of groups of parameters {}'.format(len(list(model.parameters()))))\n", "print('-'*50)\n", "# Print parameters\n", "for i in range(len(list(model.parameters()))):\n", " print(list(model.parameters())[i].size())\n", "print('-'*50)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "iter = 0\n", "num_epochs = 10\n", "history_train_acc, history_val_acc, history_train_loss, history_val_loss = [], [], [], []\n", "best_accuracy = 0\n", "for epoch in range(num_epochs):\n", "# print('-'*50)\n", " for i, (samples, labels) in enumerate(train_loader):\n", " # Training mode\n", " model.train()\n", " \n", " # Load samples\n", " samples = samples.view(-1, max_len).to(device)\n", " labels = labels.view(-1, 1).to(device)\n", "\n", " # Clear gradients w.r.t. parameters\n", " optimizer.zero_grad()\n", "\n", " # Forward pass to get output/logits\n", " outputs = model(samples)\n", "\n", " # Calculate Loss: softmax --> cross entropy loss\n", " loss = criterion(outputs, labels)\n", " \n", " # Toggle 1: L1 norm, add to original loss\n", " # fc1_params = torch.cat([x.view(-1) for x in model.fc1.parameters()])\n", " # loss += 0.001 * torch.norm(fc1_params, 1)\n", " \n", " # Getting gradients w.r.t. parameters\n", " loss.backward()\n", "\n", " # Updating parameters\n", " optimizer.step()\n", "\n", " iter += 1\n", "\n", " if iter % 100 == 0:\n", " # Get training statistics\n", " train_loss = loss.data.item()\n", " \n", " # Testing mode\n", " model.eval()\n", " # Calculate Accuracy \n", " correct = 0\n", " total = 0\n", " # Iterate through test dataset\n", " for samples, labels in valid_loader:\n", " # Load samples\n", " samples = samples.view(-1, max_len).to(device)\n", " labels = labels.view(-1).to(device)\n", "\n", " # Forward pass only to get logits/output\n", " outputs = model(samples)\n", " \n", " # Val loss\n", " val_loss = criterion(outputs.view(-1, 1), labels.view(-1, 1))\n", " \n", " # We use a threshold to define. \n", " # There is another way to do this with one-hot label. Feel free to explore and understand what are the pros/cons of each.\n", " # This opens up a whole topic on why it becomes problematic when we expand beyond 2 class to 10 classes.\n", " # Why do we encode? Why can't we do 0, 1, 2, 3, 4 etc. without one-hot encoding?\n", " predicted = outputs.ge(0.5).view(-1)\n", "\n", " # Total number of labels\n", " total += labels.size(0)\n", "\n", " # Total correct predictions\n", " correct += (predicted.type(torch.FloatTensor).cpu() == labels.type(torch.FloatTensor)).sum().item()\n", " # correct = (predicted == labels.byte()).int().sum().item()\n", " \n", " accuracy = 100. 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Plotting loss graph\n", "plt.plot(history_train_loss, label='Train')\n", "plt.plot(history_val_loss, label='Validation')\n", "plt.title('Loss Graph')\n", "plt.legend()\n", "plt.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Plotting validation accuracy graph\n", "plt.plot(history_val_acc)\n", "plt.title('Validation Accuracy')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Collect all model parameters into one flat vector so we can plot their distribution\n", "weights = torch.Tensor().to(device)\n", "for param_group in list(model.parameters()):\n", "    weights = torch.cat((param_group.view(-1), weights))\n", "    print(param_group.size())\n", "\n", "# Re-run the notebook with each toggle enabled, then store that run's weights in the matching variable below\n", "\n", "# Toggle 0: No regularization\n", "weights_nothing = weights.cpu().detach().numpy()\n", "\n", "# Toggle 1: L1 norm on FC1\n", "# weights_L1 = weights.cpu().detach().numpy()\n", "\n", "# Toggle 2: L2 norm\n", "# weights_L2 = weights.cpu().detach().numpy()\n", "\n", "# Toggle 3: dropout\n", "# weights_dropout = weights.cpu().detach().numpy()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# plt.hist(weights_L1.reshape(-1), range=(-.5, .5), bins=20)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# plt.hist(weights_nothing.reshape(-1), range=(-.5, .5), bins=20)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Show weight distributions side by side\n", "# Note: this requires weights_nothing, weights_L1 and weights_L2 from separate runs with the corresponding toggles enabled\n", "plt.hist((\n", "    weights_nothing.reshape(-1),\n", "    weights_L1.reshape(-1),\n", "    weights_L2.reshape(-1),\n", "), 49, range=(-.5, .5), label=(\n", "    'No-reg',\n", "    'L1',\n", "    'L2',\n", "))\n", "plt.legend();" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:dl-minicourse] *", "language": "python", "name": "conda-env-dl-minicourse-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }