{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Comet and Hugging Face.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "metadata": { "id": "KU7cVuqkmiOZ" }, "source": [ "%pip install comet_ml\n", "%pip install transformers==3.3.1" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "To15jUCoAW6T" }, "source": [ "import comet_ml" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "D_MlG7M2x6yb" }, "source": [ "!wget https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "zFC922P1ebiY" }, "source": [ "import os\n", "import warnings\n", "warnings.filterwarnings('ignore')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "mmdj_S_vml1k" }, "source": [ "import transformers\n", "from transformers import AutoTokenizer\n", "from transformers import BertForSequenceClassification, Trainer, TrainingArguments" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "mttl_keG2_Gv" }, "source": [ "import torch" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "AP5m24X9jBd1" }, "source": [ "PRE_TRAINED_MODEL_NAME = \"distilbert-base-uncased\"" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ZodQLf99i5Tt" }, "source": [ "tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "XxrIr46eaKLU" }, "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"./title_conference.csv\")" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "X4z7AENkaQzN" }, "source": [ "df.head()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "2X6J27qhabL4" }, "source": [ "df['Conference'] = pd.Categorical(df['Conference'])\n", "df['Target'] = df['Conference'].cat.codes" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "t3v8tCICeJwG" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "train_data, test_data = train_test_split(df, test_size=0.2, stratify=df[\"Target\"])" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ZlxAi8U9ezps" }, "source": [ "train_texts, train_labels = train_data['Title'].values.tolist(), train_data['Target'].values.tolist()\n", "test_texts, test_labels = test_data['Title'].values.tolist(), test_data['Target'].values.tolist()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "3Ofm3Nf7SiSX" }, "source": [ "def preprocess(texts, labels):\n", " encoded = tokenizer(\n", " texts, \n", " add_special_tokens=True,\n", " truncation=True, \n", " max_length=64, \n", " pad_to_max_length=True,\n", " return_attention_mask=True, \n", " return_tensors='pt',\n", " )\n", " \n", " return encoded, torch.tensor(labels) " ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SHYyvoZAc6zm" }, "source": [ "train_encoded, train_labels = preprocess(train_texts, train_labels)\n", "test_encoded, test_labels = preprocess(test_texts, test_labels)" ], "execution_count": null, "outputs": [] 
}, { "cell_type": "code", "metadata": { "id": "LgbEnilJ4Zif" }, "source": [ "class Dataset(torch.utils.data.Dataset):\n", " def __init__(self, encodings, labels):\n", " self.encodings = encodings\n", " self.labels = labels\n", "\n", " def __getitem__(self, idx):\n", " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n", " item['labels'] = torch.tensor(self.labels[idx])\n", " return item\n", "\n", " def __len__(self):\n", " return len(self.labels)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oTrcPNOau4us" }, "source": [ "train_dataset = Dataset(train_encoded, train_labels)\n", "test_dataset = Dataset(test_encoded, test_labels)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ccxiZLo-zbmZ" }, "source": [ "model = BertForSequenceClassification.from_pretrained(\n", " PRE_TRAINED_MODEL_NAME, \n", " num_labels=len(df[\"Target\"].unique()), \n", " output_attentions=False,\n", " output_hidden_states=False,\n", ")\n", "\n", "# Tell pytorch to run this model on the GPU.\n", "model.cuda()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7w1oU0nlzNti" }, "source": [ "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", "\n", "def compute_metrics(pred): \n", " experiment = comet_ml.get_global_experiment()\n", " \n", " labels = pred.label_ids\n", " preds = pred.predictions.argmax(-1)\n", " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')\n", " acc = accuracy_score(labels, preds)\n", "\n", " if experiment:\n", " experiment.log_confusion_matrix(preds, labels)\n", "\n", " return {\n", " 'accuracy': acc,\n", " 'f1': f1,\n", " 'precision': precision,\n", " 'recall': recall\n", " }" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "V6eIRRh0RWO8" }, "source": [ "# Training Parameters\n", "EPOCHS = 5" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "GOPNH3KXtdEj" }, "source": [ "import itertools\n", "\n", "decays = [0.0, 0.5, 0.99]\n", "learning_rates = [5.0e-5, 3.0e-5, 2.0e-5, 1.0e-5]\n", "batch_sizes = [32, 64, 128]\n", "\n", "parameters = [\n", " {\"weight_decay\": x[0], \"learning_rate\": x[1], \"batch_size\": x[2]} for x in list(itertools.product(*[decays, learning_rates, batch_sizes]))\n", "]" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "11_6DsP3u0TA" }, "source": [ "from tqdm import tqdm\n", "\n", "%env COMET_MODE=ONLINE\n", "%env COMET_API_KEY=\n", "%env COMET_PROJECT_NAME=transformers\n", "\n", "for idx, p in tqdm(enumerate(parameters)):\n", " weight_decay = p[\"weight_decay\"]\n", " learning_rate = p[\"learning_rate\"]\n", " batch_size = p[\"batch_size\"]\n", "\n", " training_args = TrainingArguments(\n", " seed=42,\n", " output_dir='./results', \n", " overwrite_output_dir=True, \n", " num_train_epochs=EPOCHS, \n", " per_device_train_batch_size=batch_size, \n", " per_device_eval_batch_size=batch_size,\n", " warmup_steps=500, \n", " weight_decay=weight_decay, \n", " learning_rate=learning_rate, \n", " evaluation_strategy=\"epoch\",\n", " do_train=True,\n", " do_eval=True \n", " )\n", " trainer = Trainer(\n", " model=model, \n", " args=training_args, \n", " train_dataset=train_dataset, \n", " eval_dataset=test_dataset,\n", " compute_metrics=compute_metrics, \n", " )\n", " trainer.train()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", 
"metadata": { "id": "AD6IKE1DysN4" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }