{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [
   "From:\n",
   "\n",
   "- [BERT Fine-Tuning Tutorial with PyTorch · Chris McCormick](http://mccormickml.com/2019/07/22/BERT-fine-tuning/)\n",
   "- [huggingface/pytorch-transformers: 👾 A library of state-of-the-art pretrained models for Natural Language Processing (NLP)](https://github.com/huggingface/pytorch-transformers)\n",
   "\n",
   "\n",
   "Fine-Tuning:\n",
   "\n",
   "- Easy Training: recommend 2-4 epochs on a special NLP task\n",
   "- Less Data\n",
   "- Good Results" ] },
 { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [
   "import torch\n",
   "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n",
   "from keras.preprocessing.sequence import pad_sequences\n",
   "from sklearn.model_selection import train_test_split\n",
   "from pytorch_transformers import BertTokenizer, BertConfig\n",
   "from pytorch_transformers import BertForSequenceClassification, BertModel\n",
   "from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule\n",
   "from tqdm import tqdm, trange\n",
   "import pandas as pd\n",
   "import io\n",
   "import os\n",
   "import numpy as np\n",
   "import matplotlib.pyplot as plt\n",
   "%matplotlib inline" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Data" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "```bash\n",
   "# download glue data\n",
   "$ git clone https://gist.github.com/60c2bdb54d156a41194446737ce03e2e.git download_glue_repo\n",
   "$ python download_glue_repo/download_glue_data.py --data_dir='glue_data'\n",
   "```" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
   "# Dataset: [The Corpus of Linguistic Acceptability (CoLA)](https://nyu-mll.github.io/CoLA/)\n",
   "# The training file is looked up inside the raw-data directory below.\n",
   "data_path = \"cola_public/raw/\"\n",
   "train_path = os.path.join(data_path, \"in_domain_train.tsv\")" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
   "# Tab-separated file read with header=None, so the four column names are supplied here.\n",
   "column_names = ['sentence_source', 'label', 'label_notes', 'sentence']\n",
   "df = pd.read_csv(train_path, sep='\\t', header=None, names=column_names)" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence_sourcelabellabel_notessentence
668bc011NaNI expect John to win and Harry to lose.
7090sgww851NaNSome people go by car, but others by bike.
1997rhl071NaNMartha gave Myrna an apple.
4314ks080*It tried to rain.
6406d_981NaNWe didn't keep a list of the names, but the Pr...
\n", "
" ], "text/plain": [
   " sentence_source label label_notes \\\n",
   "668 bc01 1 NaN \n",
   "7090 sgww85 1 NaN \n",
   "1997 rhl07 1 NaN \n",
   "4314 ks08 0 * \n",
   "6406 d_98 1 NaN \n",
   "\n",
   " sentence \n",
   "668 I expect John to win and Harry to lose. \n",
   "7090 Some people go by car, but others by bike. \n",
   "1997 Martha gave Myrna an apple. \n",
   "4314 It tried to rain. \n",
   "6406 We didn't keep a list of the names, but the Pr... " ] },
   "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ],
   "source": [ "df.sample(5)" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
   "# Surround every sentence with the [CLS] / [SEP] marker tokens.\n",
   "sentences = [\"[CLS] \" + text + \" [SEP]\" for text in df.sentence.values]\n",
   "labels = df.label.values" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Input" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
   "# Pretrained files are downloaded to ~/.cache by default.\n",
   "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\", do_lower_case=True)" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "Tokenize the first sentence:\n",
   "['[CLS]', 'our', 'friends', 'won', \"'\", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']\n" ] } ], "source": [
   "# One token list per sentence, in the same order as `sentences`.\n",
   "tokenized_texts = [tokenizer.tokenize(text) for text in sentences]\n",
   "print(\"Tokenize the first sentence:\", tokenized_texts[0], sep=\"\\n\")" ] },
 { "cell_type": "code", "execution_count": 65, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['i', 'love', 'you']" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [
   "tokenizer.tokenize(\"i love you\")" ] },
 { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "30522" ] }, "execution_count": 66, "metadata": {},
"output_type": "execute_result" } ], "source": [ "tokenizer.vocab_size" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "Format:\n",
   "\n",
   "- input ids: index in BERT tokenizer vocabulary\n",
   "- segment mask: a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences.\n",
   "- attention mask: a sequence of 1s and 0s, with 1s for input tokens and 0s for padding ones.\n",
   "- labels: a single 0 or 1" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "MAX_LEN = 16" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
   "# Map tokens to vocabulary ids, then pad / truncate every sequence to MAX_LEN.\n",
   "# \"post\" means padding or truncating at the end of the sequence.\n",
   "token_id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]\n",
   "input_ids = pad_sequences(token_id_lists, maxlen=MAX_LEN, dtype=\"long\",\n",
   "                          truncating=\"post\", padding=\"post\")" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
   "# Attention mask: 1.0 where the id is non-zero, 0.0 where it is 0\n",
   "# (0 is the value pad_sequences pads with by default).\n",
   "attn_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8551" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(attn_masks)" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8551" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(input_ids)" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
   "# Split ids, masks and labels in ONE call so the same shuffled indices are applied\n",
   "# to all three; 10% of the rows go to the validation set.\n",
   "(train_inputs, validation_inputs,\n",
   " train_masks, validation_masks,\n",
   " train_labels, validation_labels) = train_test_split(\n",
   "    input_ids, attn_masks, labels, random_state=2018, test_size=0.1)" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [],
"source": [
   "# Turn every train / validation split into a torch tensor, pair by pair.\n",
   "train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)\n",
   "train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)\n",
   "train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
   "batch_size = 32\n",
   "\n",
   "# Datasets: every item is an (input ids, attention mask, label) triple.\n",
   "train_data = TensorDataset(train_inputs, train_masks, train_labels)\n",
   "validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)\n",
   "\n",
   "# Samplers: random order for training, dataset order for validation.\n",
   "train_sampler = RandomSampler(train_data)\n",
   "validation_sampler = SequentialSampler(validation_data)\n",
   "\n",
   "train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)\n",
   "validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([7695, 16])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_inputs.shape" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([7695])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_labels.shape" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] },
 { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [
   "# The base uncased model (about 420M) has to be downloaded on first use;\n",
   "# the default download path is ~/.cache.\n",
   "model1 = BertForSequenceClassification.from_pretrained(\"bert-base-uncased\", num_labels=2)" ] },
 { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [
   "# Same checkpoint loaded as a plain BertModel, used below to compare outputs.\n",
   "model2 = BertModel.from_pretrained(\"bert-base-uncased\", num_labels=2)" ] },
 { "cell_type": "code", "execution_count": 46, "metadata": {},
"outputs": [], "source": [
   "# Encode one sample sentence and add a leading batch dimension (batch size 1).\n",
   "# A dedicated name is used so the padded `input_ids` array built for the dataset\n",
   "# is not overwritten by this small demo tensor.\n",
   "demo_input_ids = torch.tensor(tokenizer.encode(\"Hello, my dog is cute\")).unsqueeze(0)\n",
   "outputs1 = model1(demo_input_ids)\n",
   "outputs2 = model2(demo_input_ids)" ] },
 { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(tensor([[0.0587, 0.2715]], grad_fn=),)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs1" ] },
 { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "encoder_out, text_cls = outputs2" ] },
 { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [
   "tensor([[[[-0.0103, -0.1227, 0.3213, ..., -0.1227, 0.1663, 0.8999],\n",
   " [-0.1704, 0.1079, 0.1188, ..., 0.1253, 0.2631, 0.7764],\n",
   " [-0.1531, 0.1066, 0.0684, ..., 0.2248, 0.2979, 0.8439],\n",
   " [-0.2531, 0.1373, -0.2181, ..., 0.0539, 0.2127, 0.7677],\n",
   " [-0.5390, -0.1542, 0.0448, ..., 0.0955, 0.1294, 0.7215],\n",
   " [-0.5359, -0.1256, 0.1278, ..., 0.1238, 0.3003, 0.3731]]]],\n",
   " grad_fn=)" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder_out.unsqueeze(1)" ] },
 { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [
   "{\n",
   " \"attention_probs_dropout_prob\": 0.1,\n",
   " \"finetuning_task\": null,\n",
   " \"hidden_act\": \"gelu\",\n",
   " \"hidden_dropout_prob\": 0.1,\n",
   " \"hidden_size\": 768,\n",
   " \"initializer_range\": 0.02,\n",
   " \"intermediate_size\": 3072,\n",
   " \"layer_norm_eps\": 1e-12,\n",
   " \"max_position_embeddings\": 512,\n",
   " \"num_attention_heads\": 12,\n",
   " \"num_hidden_layers\": 12,\n",
   " \"num_labels\": 2,\n",
   " \"output_attentions\": false,\n",
   " \"output_hidden_states\": false,\n",
   " \"torchscript\": false,\n",
   " \"type_vocab_size\": 2,\n",
   " \"vocab_size\": 30522\n",
   "}" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [
"model1.config" ] },
 { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [
   "{\n",
   " \"attention_probs_dropout_prob\": 0.1,\n",
   " \"finetuning_task\": null,\n",
   " \"hidden_act\": \"gelu\",\n",
   " \"hidden_dropout_prob\": 0.1,\n",
   " \"hidden_size\": 768,\n",
   " \"initializer_range\": 0.02,\n",
   " \"intermediate_size\": 3072,\n",
   " \"layer_norm_eps\": 1e-12,\n",
   " \"max_position_embeddings\": 512,\n",
   " \"num_attention_heads\": 12,\n",
   " \"num_hidden_layers\": 12,\n",
   " \"num_labels\": 2,\n",
   " \"output_attentions\": false,\n",
   " \"output_hidden_states\": false,\n",
   " \"torchscript\": false,\n",
   " \"type_vocab_size\": 2,\n",
   " \"vocab_size\": 30522\n",
   "}" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model2.config" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
   "Hyperparameters recommended:\n",
   "\n",
   "- Batch size: 16, 32\n",
   "- Learning rate (Adam): 5e-5, 3e-5, 2e-5\n",
   "- Number of epochs: 2, 3, 4" ] },
 { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
   "# Fine-tune the sequence-classification model loaded above as `model1`.\n",
   "# Binding it to `model` here keeps every later cell runnable on a fresh kernel.\n",
   "model = model1\n",
   "\n",
   "# Biases and LayerNorm weights are excluded from weight decay. pytorch_transformers\n",
   "# names the LayerNorm parameters `LayerNorm.weight` / `LayerNorm.bias`.\n",
   "param_optimizer = list(model.named_parameters())\n",
   "no_decay = ['bias', 'LayerNorm.weight']\n",
   "# AdamW reads the per-group key `weight_decay`; a group without that key\n",
   "# falls back to the optimizer default of 0.0.\n",
   "optimizer_grouped_parameters = [\n",
   "    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n",
   "     'weight_decay': 0.01},\n",
   "    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],\n",
   "     'weight_decay': 0.0}\n",
   "]" ] },
 { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
   "optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
   "# Number of training epochs (authors recommend between 2 and 4)\n",
   "epochs = 4\n",
   "# One optimizer step per batch; len(train_dataloader) also counts the final partial batch.\n",
   "num_train_optimization_steps = len(train_dataloader) * epochs" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [
   "# Linear warm-up for the first 100 steps, then linear decay until t_total.\n",
   "scheduler = WarmupLinearSchedule(optimizer, warmup_steps=100, t_total=num_train_optimization_steps)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Train" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
   "def flat_accuracy(preds, labels):\n",
   "    \"\"\"Return the fraction of examples whose arg-max prediction equals the label.\n",
   "\n",
   "    preds:  numpy array of per-class scores, one row per example.\n",
   "    labels: numpy array of integer class ids; flattened before comparison.\n",
   "    \"\"\"\n",
   "    pred_flat = np.argmax(preds, axis=1).flatten()\n",
   "    labels_flat = labels.flatten()\n",
   "    return np.sum(pred_flat == labels_flat) / len(labels_flat)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
   "# Per-batch training loss, kept for the loss plot at the end of the notebook.\n",
   "train_loss_set = []\n",
   "\n",
   "# `epochs` is defined next to num_train_optimization_steps so the learning-rate\n",
   "# schedule and this loop always cover the same number of steps.\n",
   "# trange is a tqdm wrapper around the normal python range\n",
   "for _ in trange(epochs, desc=\"Epoch\"):\n",
   "\n",
   "    # Training\n",
   "\n",
   "    # Set our model to training mode (as opposed to evaluation mode)\n",
   "    model.train()\n",
   "    # Tracking variables\n",
   "    tr_loss = 0\n",
   "    nb_tr_steps = 0\n",
   "\n",
   "    # Train the data for one epoch\n",
   "    for batch in train_dataloader:\n",
   "        # Unpack the inputs from our dataloader\n",
   "        b_input_ids, b_input_mask, b_labels = batch\n",
   "        # Clear out the gradients (by default they accumulate)\n",
   "        optimizer.zero_grad()\n",
   "        # Forward pass; with `labels` given, the loss is the first element of the output tuple\n",
   "        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)\n",
   "        loss = outputs[0]\n",
   "        train_loss_set.append(loss.item())\n",
   "        # Backward pass\n",
   "        loss.backward()\n",
   "        # Update parameters and take a step using the computed gradient\n",
   "        optimizer.step()\n",
   "        scheduler.step()\n",
   "\n",
   "        # Update tracking variables\n",
   "        tr_loss += loss.item()\n",
   "        nb_tr_steps += 1\n",
   "\n",
   "    print(\"Train loss: {}\".format(tr_loss/nb_tr_steps))\n",
   "\n",
   "\n",
   "    # Validation\n",
   "\n",
   "    # Put model in evaluation mode to evaluate accuracy on the validation set\n",
   "    model.eval()\n",
   "\n",
   "    # Tracking variables\n",
   "    eval_accuracy = 0\n",
   "    nb_eval_steps = 0\n",
   "\n",
   "    # Evaluate data for one epoch\n",
   "    for batch in validation_dataloader:\n",
   "        # Unpack the inputs from our dataloader\n",
   "        b_input_ids, b_input_mask, b_labels = batch\n",
   "        # Telling the model not to compute or store gradients, saving memory and speeding up validation\n",
   "        with torch.no_grad():\n",
   "            # The model returns a tuple; without `labels` its first element holds the logits\n",
   "            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]\n",
   "\n",
   "        # Move logits and labels to CPU\n",
   "        logits = logits.detach().cpu().numpy()\n",
   "        label_ids = b_labels.to('cpu').numpy()\n",
   "\n",
   "        tmp_eval_accuracy = flat_accuracy(logits, label_ids)\n",
   "\n",
   "        eval_accuracy += tmp_eval_accuracy\n",
   "        nb_eval_steps += 1\n",
   "\n",
   "    print(\"Validation Accuracy: {}\".format(eval_accuracy/nb_eval_steps))" ] },
 { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
   "# Draw the per-batch training loss collected in train_loss_set.\n",
   "fig, ax = plt.subplots(figsize=(15, 8))\n",
   "ax.plot(train_loss_set)\n",
   "ax.set(title=\"Training loss\", xlabel=\"Batch\", ylabel=\"Loss\")\n",
   "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" },
  "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false },
  "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } },
 "nbformat": 4, "nbformat_minor": 2 }