{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "f0x7V8yXuug3" }, "source": [ "# Déboguer le pipeline d'entraînement\n", "\n", "Ce chapitre portant sur le débogage, la langue nous importe peu ici. Nous nous intéressons surtout à la logique du code pour comprendre d'où provient l'erreur." ] }, { "cell_type": "markdown", "metadata": { "id": "UEU_G8vPuug5" }, "source": [ "Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce *notebook*." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Y-jlJv25uug5" }, "outputs": [], "source": [ "!pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1P4rPHpjuug7", "outputId": "9187a2a7-dca6-4b94-bce9-95f4422b893f" }, "outputs": [], "source": [ "from datasets import load_dataset, load_metric\n", "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForSequenceClassification,\n", " TrainingArguments,\n", " Trainer,\n", ")\n", "\n", "raw_datasets = load_dataset(\"glue\", \"mnli\")\n", "\n", "model_checkpoint = \"distilbert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "\n", "\n", "def preprocess_function(examples):\n", " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n", "\n", "args = TrainingArguments(\n", " f\"distilbert-finetuned-mnli\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", ")\n", "\n", "metric = load_metric(\"glue\", \"mnli\")\n", "\n", "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " return metric.compute(predictions=predictions, references=labels)\n", "\n", "\n", "trainer = Trainer(\n", " model,\n", " args,\n", " train_dataset=raw_datasets[\"train\"],\n", " eval_dataset=raw_datasets[\"validation_matched\"],\n", " compute_metrics=compute_metrics,\n", ")\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RpbitvtGuug9", "outputId": "d8ee2113-612a-4dd2-db95-b0c8ec4aa34b" }, "outputs": [], "source": [ "trainer.train_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Jfb5bjfpuug-", "outputId": "fe25eb2b-5f82-46fa-f0db-e542b51a2b02" }, "outputs": [], "source": [ "from datasets import load_dataset, load_metric\n", "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForSequenceClassification,\n", " TrainingArguments,\n", " Trainer,\n", ")\n", "\n", "raw_datasets = load_dataset(\"glue\", \"mnli\")\n", "\n", "model_checkpoint = \"distilbert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "\n", "\n", "def preprocess_function(examples):\n", " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n", "\n", "args = TrainingArguments(\n", " f\"distilbert-finetuned-mnli\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", ")\n", "\n", "metric = load_metric(\"glue\", \"mnli\")\n", "\n", "\n", "def compute_metrics(eval_pred):\n", " 
predictions, labels = eval_pred\n", " return metric.compute(predictions=predictions, references=labels)\n", "\n", "\n", "trainer = Trainer(\n", " model,\n", " args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"validation_matched\"],\n", " compute_metrics=compute_metrics,\n", ")\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5hEziUUKuuhA", "outputId": "e3824437-3921-4276-d080-80119eafe347" }, "outputs": [], "source": [ "tokenizer.decode(trainer.train_dataset[0][\"input_ids\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QmlaOe9vuuhB", "outputId": "520dbef0-aba9-4a7e-f9d7-e58c674914bd" }, "outputs": [], "source": [ "trainer.train_dataset[0].keys()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "CUPHJfK7uuhB", "outputId": "b50e1cdb-4c73-49c2-e804-759bfb3cb7ad" }, "outputs": [], "source": [ "type(trainer.model)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8z6fWGP3uuhC", "outputId": "371b4974-abec-4702-e9dd-2c174030c87b" }, "outputs": [], "source": [ "trainer.train_dataset[0][\"attention_mask\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8QaC9HaAuuhD", "outputId": "5eea24ac-aa92-423e-cbee-3bd0cafe1da9" }, "outputs": [], "source": [ "len(trainer.train_dataset[0][\"attention_mask\"]) == len(\n", " trainer.train_dataset[0][\"input_ids\"]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "O_p1b0eFuuhD", "outputId": "957ec941-7d95-44cd-e5e6-d1c13049c147" }, "outputs": [], "source": [ "trainer.train_dataset[0][\"label\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "A2gIbdMeuuhE", "outputId": "7abc86fe-97d0-4b8b-e1b8-85a431b39f57" }, "outputs": [], "source": [ "trainer.train_dataset.features[\"label\"].names" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "BD78i7gduuhE", "outputId": "df0ad6c6-2db1-46a2-d0ec-7644cb10021f" }, "outputs": [], "source": [ "for batch in trainer.get_train_dataloader():\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3XyCqZ6fuuhF", "outputId": "95ec7610-cc00-466b-8ebe-68005fda0c4d" }, "outputs": [], "source": [ "data_collator = trainer.get_train_dataloader().collate_fn\n", "data_collator" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wNaHrxC_uuhF", "outputId": "5eec7369-2582-4812-dc01-0241f163f91a" }, "outputs": [], "source": [ "from datasets import load_dataset, load_metric\n", "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForSequenceClassification,\n", " DataCollatorWithPadding,\n", " TrainingArguments,\n", " Trainer,\n", ")\n", "\n", "raw_datasets = load_dataset(\"glue\", \"mnli\")\n", "\n", "model_checkpoint = \"distilbert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "\n", "\n", "def preprocess_function(examples):\n", " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n", "\n", "args = TrainingArguments(\n", " f\"distilbert-finetuned-mnli\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", ")\n", "\n", "metric = load_metric(\"glue\", \"mnli\")\n", "\n", 
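"# compute_metrics receives an EvalPrediction and must return a dict mapping metric names to values\n",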
"\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " return metric.compute(predictions=predictions, references=labels)\n", "\n", "\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n", "\n", "trainer = Trainer(\n", " model,\n", " args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"validation_matched\"],\n", " compute_metrics=compute_metrics,\n", " data_collator=data_collator,\n", " tokenizer=tokenizer,\n", ")\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kTdU1zHxuuhG" }, "outputs": [], "source": [ "data_collator = trainer.get_train_dataloader().collate_fn\n", "batch = data_collator([trainer.train_dataset[i] for i in range(4)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "puZHYwznuuhH" }, "outputs": [], "source": [ "data_collator = trainer.get_train_dataloader().collate_fn\n", "actual_train_set = trainer._remove_unused_columns(trainer.train_dataset)\n", "batch = data_collator([actual_train_set[i] for i in range(4)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZgkgG5wTuuhH" }, "outputs": [], "source": [ "for batch in trainer.get_train_dataloader():\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RNbbigNxuuhH", "outputId": "469b3dbb-b037-4ba1-8b89-9b931730757f" }, "outputs": [], "source": [ "outputs = trainer.model.cpu()(**batch)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "FoVsBT4juuhH", "outputId": "229f92ef-696e-4500-a5a4-1f8fc964f56e" }, "outputs": [], "source": [ "trainer.model.config.num_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "d4xgB4hluuhI" }, "outputs": [], "source": [ "from datasets import load_dataset, load_metric\n", "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForSequenceClassification,\n", " DataCollatorWithPadding,\n", " TrainingArguments,\n", " Trainer,\n", ")\n", "\n", "raw_datasets = load_dataset(\"glue\", \"mnli\")\n", "\n", "model_checkpoint = \"distilbert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "\n", "\n", "def preprocess_function(examples):\n", " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)\n", "\n", "args = TrainingArguments(\n", " f\"distilbert-finetuned-mnli\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", ")\n", "\n", "metric = load_metric(\"glue\", \"mnli\")\n", "\n", "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " return metric.compute(predictions=predictions, references=labels)\n", "\n", "\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n", "\n", "trainer = Trainer(\n", " model,\n", " args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"validation_matched\"],\n", " compute_metrics=compute_metrics,\n", " data_collator=data_collator,\n", " tokenizer=tokenizer,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "e_qqisTBuuhI" }, "outputs": [], "source": [ "for batch in trainer.get_train_dataloader():\n", " break\n", "\n", "outputs = 
trainer.model.cpu()(**batch)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DSRfCJJluuhI" }, "outputs": [], "source": [ "import torch\n", "\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", "batch = {k: v.to(device) for k, v in batch.items()}\n", "\n", "outputs = trainer.model.to(device)(**batch)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XdAPze1RuuhK" }, "outputs": [], "source": [ "loss = outputs.loss\n", "loss.backward()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fo6iA0tkuuhK" }, "outputs": [], "source": [ "trainer.create_optimizer()\n", "trainer.optimizer.step()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mw2fCqo1uuhK", "outputId": "b6dda37e-8458-4bb9-b632-fb1a954dc080" }, "outputs": [], "source": [ "# This will take a long time and error out, so you shouldn't run this cell\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cqOlsDu7uuhL", "outputId": "34d59c53-4025-4bab-eb54-a030065ad33f" }, "outputs": [], "source": [ "trainer.evaluate()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "NE0RLsNAuuhL" }, "outputs": [], "source": [ "for batch in trainer.get_eval_dataloader():\n", " break\n", "\n", "batch = {k: v.to(device) for k, v in batch.items()}\n", "\n", "with torch.no_grad():\n", " outputs = trainer.model(**batch)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QWSy7QSZuuhL", "outputId": "fffbd1fa-d215-43dc-ebe1-94273d49bec2" }, "outputs": [], "source": [ "predictions = outputs.logits.cpu().numpy()\n", "labels = batch[\"labels\"].cpu().numpy()\n", "\n", "compute_metrics((predictions, labels))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "oim704wtuuhL", "outputId": "f060baf3-91d1-4496-b77d-1bceaf1a1371" }, "outputs": [], "source": [ "predictions.shape, labels.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "EvvBlvf-uuhL", "outputId": "ae10fe2c-3add-455e-e659-edbac1cfa24d" }, "outputs": [], "source": [ "import numpy as np\n", "\n", "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " predictions = np.argmax(predictions, axis=1)\n", " return metric.compute(predictions=predictions, references=labels)\n", "\n", "\n", "compute_metrics((predictions, labels))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fnGWpZsTuuhM" }, "outputs": [], "source": [ "import numpy as np\n", "from datasets import load_dataset, load_metric\n", "from transformers import (\n", " AutoTokenizer,\n", " AutoModelForSequenceClassification,\n", " DataCollatorWithPadding,\n", " TrainingArguments,\n", " Trainer,\n", ")\n", "\n", "raw_datasets = load_dataset(\"glue\", \"mnli\")\n", "\n", "model_checkpoint = \"distilbert-base-uncased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "\n", "\n", "def preprocess_function(examples):\n", " return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)\n", "\n", "args = TrainingArguments(\n", " f\"distilbert-finetuned-mnli\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", 
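" # unchanged hyperparameters; the fixes are num_labels=3, the data collator, and the argmax in compute_metrics\n",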
")\n", "\n", "metric = load_metric(\"glue\", \"mnli\")\n", "\n", "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " predictions = np.argmax(predictions, axis=1)\n", " return metric.compute(predictions=predictions, references=labels)\n", "\n", "\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n", "\n", "trainer = Trainer(\n", " model,\n", " args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"validation_matched\"],\n", " compute_metrics=compute_metrics,\n", " data_collator=data_collator,\n", " tokenizer=tokenizer,\n", ")\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KwImoBnJuuhM" }, "outputs": [], "source": [ "for batch in trainer.get_train_dataloader():\n", " break\n", "\n", "batch = {k: v.to(device) for k, v in batch.items()}\n", "trainer.create_optimizer()\n", "\n", "for _ in range(20):\n", " outputs = trainer.model(**batch)\n", " loss = outputs.loss\n", " loss.backward()\n", " trainer.optimizer.step()\n", " trainer.optimizer.zero_grad()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Odz1EwppuuhN", "outputId": "7215dd2e-6951-4644-c9b9-31849e786434" }, "outputs": [], "source": [ "with torch.no_grad():\n", " outputs = trainer.model(**batch)\n", "preds = outputs.logits\n", "labels = batch[\"labels\"]\n", "\n", "compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))" ] } ], "metadata": { "colab": { "collapsed_sections": [], "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 1 }