{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "rGgb0tYAgCXS" }, "source": [ "# Un entraînement complet" ] }, { "cell_type": "markdown", "metadata": { "id": "rb9JpxcVgCXU" }, "source": [ "Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SXDhUvwegCXV" }, "outputs": [], "source": [ "!pip install datasets transformers[sentencepiece]\n", "!pip install accelerate\n", "# Pour exécuter l'entraînement sur TPU, vous devez décommenter la ligne suivante :\n", "# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KU3SNvXugCXX" }, "outputs": [], "source": [ "from datasets import load_dataset\n", "from transformers import AutoTokenizer, DataCollatorWithPadding\n", "\n", "raw_datasets = load_dataset(\"paws-x\", \"fr\")\n", "checkpoint = \"camembert-base\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "\n", "\n", "def tokenize_function(example):\n", " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n", "\n", "\n", "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "i3sASOtBgCXY" }, "outputs": [], "source": [ "tokenized_datasets = tokenized_datasets.remove_columns([\"sentence1\", \"sentence2\", \"idx\"])\n", "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", "tokenized_datasets.set_format(\"torch\")\n", "tokenized_datasets[\"train\"].column_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qPDal3ZZgCXY" }, "outputs": [], "source": [ "[\"attention_mask\", \"input_ids\", \"labels\", \"token_type_ids\"]" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": { "id": "Ljub68PygCXa" }, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "\n", "train_dataloader = DataLoader(\n", " tokenized_datasets[\"train\"], shuffle=True, batch_size=8, collate_fn=data_collator\n", ")\n", "eval_dataloader = DataLoader(\n", " tokenized_datasets[\"validation\"], batch_size=8, collate_fn=data_collator\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DbXdwYYcgCXb" }, "outputs": [], "source": [ "for batch in train_dataloader:\n", " break\n", "{k: v.shape for k, v in batch.items()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AfYEvRJ3gCXb" }, "outputs": [], "source": [ "from transformers import AutoModelForSequenceClassification\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cSVoIMofgCXd" }, "outputs": [], "source": [ "outputs = model(**batch)\n", "print(outputs.loss, outputs.logits.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GG4qqJZTgCXe" }, "outputs": [], "source": [ "from transformers import AdamW\n", "\n", "optimizer = AdamW(model.parameters(), lr=5e-5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fborIyYfgCXe" }, "outputs": [], "source": [ "from transformers import get_scheduler\n", "\n", "num_epochs = 3\n", "num_training_steps = num_epochs * len(train_dataloader)\n", "lr_scheduler = get_scheduler(\n", " \"linear\",\n", " optimizer=optimizer,\n", " num_warmup_steps=0,\n", " num_training_steps=num_training_steps,\n", ")\n", "print(num_training_steps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "gGNfkrI8gCXf" }, "outputs": [], "source": [ "import torch\n", "\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", "model.to(device)\n", "device" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": { "id": "6pU0KLlIgCXg" }, "outputs": [], "source": [ "from tqdm.auto import tqdm\n", "\n", "progress_bar = tqdm(range(num_training_steps))\n", "\n", "model.train()\n", "for epoch in range(num_epochs):\n", " for batch in train_dataloader:\n", " batch = {k: v.to(device) for k, v in batch.items()}\n", " outputs = model(**batch)\n", " loss = outputs.loss\n", " loss.backward()\n", "\n", " optimizer.step()\n", " lr_scheduler.step()\n", " optimizer.zero_grad()\n", " progress_bar.update(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Uebr5rvegCXg" }, "outputs": [], "source": [ "from datasets import load_metric\n", "\n", "metric = load_metric(\"glue\", \"mrpc\")\n", "model.eval()\n", "for batch in eval_dataloader:\n", " batch = {k: v.to(device) for k, v in batch.items()}\n", " with torch.no_grad():\n", " outputs = model(**batch)\n", "\n", " logits = outputs.logits\n", " predictions = torch.argmax(logits, dim=-1)\n", " metric.add_batch(predictions=predictions, references=batch[\"labels\"])\n", "\n", "metric.compute()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "tqd55EFWgCXh" }, "outputs": [], "source": [ "from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n", "optimizer = AdamW(model.parameters(), lr=3e-5)\n", "\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", "model.to(device)\n", "\n", "num_epochs = 3\n", "num_training_steps = num_epochs * len(train_dataloader)\n", "lr_scheduler = get_scheduler(\n", " \"linear\",\n", " optimizer=optimizer,\n", " num_warmup_steps=0,\n", " num_training_steps=num_training_steps,\n", ")\n", "\n", "progress_bar = tqdm(range(num_training_steps))\n", "\n", "model.train()\n", "for epoch in range(num_epochs):\n", " for batch in train_dataloader:\n", " batch = {k: 
v.to(device) for k, v in batch.items()}\n",
    "        outputs = model(**batch)\n",
    "        loss = outputs.loss\n",
    "        loss.backward()\n",
    "\n",
    "        optimizer.step()\n",
    "        lr_scheduler.step()\n",
    "        optimizer.zero_grad()\n",
    "        progress_bar.update(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "PCAcBjKjgCXi" },
   "outputs": [],
   "source": [
    "from accelerate import Accelerator\n",
    "from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler\n",
    "\n",
    "\n",
    "# Wrapped in a function so it can be passed to `notebook_launcher` below\n",
    "# (each launched process then builds its own `Accelerator`). The final cell\n",
    "# previously referenced an undefined `training_function`, raising NameError.\n",
    "def training_function():\n",
    "    accelerator = Accelerator()\n",
    "\n",
    "    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "    optimizer = AdamW(model.parameters(), lr=3e-5)\n",
    "\n",
    "    # `prepare` wraps the dataloaders, model and optimizer for the current\n",
    "    # hardware setup (CPU, single/multi GPU, TPU) and handles device placement.\n",
    "    train_dl, eval_dl, model_prep, optimizer = accelerator.prepare(\n",
    "        train_dataloader, eval_dataloader, model, optimizer\n",
    "    )\n",
    "\n",
    "    num_epochs = 3\n",
    "    num_training_steps = num_epochs * len(train_dl)\n",
    "    lr_scheduler = get_scheduler(\n",
    "        \"linear\",\n",
    "        optimizer=optimizer,\n",
    "        num_warmup_steps=0,\n",
    "        num_training_steps=num_training_steps,\n",
    "    )\n",
    "\n",
    "    progress_bar = tqdm(range(num_training_steps))\n",
    "\n",
    "    model_prep.train()\n",
    "    for epoch in range(num_epochs):\n",
    "        for batch in train_dl:\n",
    "            outputs = model_prep(**batch)\n",
    "            loss = outputs.loss\n",
    "            # Let Accelerate dispatch the backward pass (handles mixed\n",
    "            # precision and distributed gradient synchronization).\n",
    "            accelerator.backward(loss)\n",
    "\n",
    "            optimizer.step()\n",
    "            lr_scheduler.step()\n",
    "            optimizer.zero_grad()\n",
    "            progress_bar.update(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "Y7InjBE5gCXi" },
   "outputs": [],
   "source": [
    "from accelerate import notebook_launcher\n",
    "\n",
    "notebook_launcher(training_function)"
   ]
  }
 ],
 "metadata": {
  "colab": { "provenance": [] },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}