# Déboguer le pipeline d'entraînement

Ce chapitre portant sur le débogage, la langue nous importe peu ici. Nous nous intéressons surtout à la logique du code pour comprendre d'où provient l'erreur.

Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce *notebook*.

In [None]:
!pip install datasets transformers[sentencepiece]

In [None]:
from datasets import load_dataset, load_metric
from transformers import (
 AutoTokenizer,
 AutoModelForSequenceClassification,
 TrainingArguments,
 Trainer,
)

# Load the GLUE "mnli" configuration through load_dataset.
raw_datasets = load_dataset("glue", "mnli")

# Checkpoint name used for the tokenizer here and for the model later in this cell.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
 """Tokenize the "premise" and "hypothesis" fields of `examples` together.

 Both fields are handed to the module-level `tokenizer` with
 `truncation=True`, and the tokenizer's result is returned as is.
 """
 premises = examples["premise"]
 hypotheses = examples["hypothesis"]
 return tokenizer(premises, hypotheses, truncation=True)


# Run preprocess_function over raw_datasets with map (batched=True).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# NOTE(review): no num_labels is passed here, whereas later cells in this
# notebook pass num_labels=3 — presumably left out on purpose for the
# debugging walkthrough, so it is kept as is.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# The first argument has no placeholders, so it is a plain string rather than
# an f-string; the resulting value is unchanged.
args = TrainingArguments(
 "distilbert-finetuned-mnli",
 evaluation_strategy="epoch",
 save_strategy="epoch",
 learning_rate=2e-5,
 num_train_epochs=3,
 weight_decay=0.01,
)

# GLUE "mnli" metric object; compute_metrics calls its compute() method.
metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
 """Score an evaluation pair with the module-level `metric`.

 `eval_pred` is unpacked into (predictions, labels) and both values are
 forwarded to `metric.compute` unchanged.
 """
 model_outputs, gold_labels = eval_pred
 return metric.compute(predictions=model_outputs, references=gold_labels)


# NOTE(review): this Trainer is given raw_datasets (not the tokenized_datasets
# built above); a later cell in this notebook passes tokenized_datasets
# instead — presumably a deliberate bug for the debugging walkthrough.
trainer = Trainer(
 model,
 args,
 train_dataset=raw_datasets["train"],
 eval_dataset=raw_datasets["validation_matched"],
 compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Inspect the first example of the dataset the Trainer was given.
trainer.train_dataset[0]

In [None]:
from datasets import load_dataset, load_metric
from transformers import (
 AutoTokenizer,
 AutoModelForSequenceClassification,
 TrainingArguments,
 Trainer,
)

# Load the GLUE "mnli" configuration through load_dataset.
raw_datasets = load_dataset("glue", "mnli")

# Checkpoint name used for the tokenizer here and for the model later in this cell.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
 """Tokenize the "premise" and "hypothesis" fields of `examples` together.

 Both fields are handed to the module-level `tokenizer` with
 `truncation=True`, and the tokenizer's result is returned as is.
 """
 premises = examples["premise"]
 hypotheses = examples["hypothesis"]
 return tokenizer(premises, hypotheses, truncation=True)


# Run preprocess_function over raw_datasets with map (batched=True).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# NOTE(review): no num_labels is passed here, whereas later cells in this
# notebook pass num_labels=3 — presumably left out on purpose for the
# debugging walkthrough, so it is kept as is.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# The first argument has no placeholders, so it is a plain string rather than
# an f-string; the resulting value is unchanged.
args = TrainingArguments(
 "distilbert-finetuned-mnli",
 evaluation_strategy="epoch",
 save_strategy="epoch",
 learning_rate=2e-5,
 num_train_epochs=3,
 weight_decay=0.01,
)

# GLUE "mnli" metric object; compute_metrics calls its compute() method.
metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
 """Score an evaluation pair with the module-level `metric`.

 `eval_pred` is unpacked into (predictions, labels) and both values are
 forwarded to `metric.compute` unchanged.
 """
 model_outputs, gold_labels = eval_pred
 return metric.compute(predictions=model_outputs, references=gold_labels)


# The Trainer receives the tokenized "train" and "validation_matched" splits,
# then training is started immediately.
trainer = Trainer(
 model,
 args,
 train_dataset=tokenized_datasets["train"],
 eval_dataset=tokenized_datasets["validation_matched"],
 compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Decode the first training example's input_ids with the tokenizer to read what is fed in.
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

In [None]:
# List the fields present in the first training example.
trainer.train_dataset[0].keys()

In [None]:
# Show which model class the Trainer holds.
type(trainer.model)

In [None]:
# Look at the attention mask stored for the first training example.
trainer.train_dataset[0]["attention_mask"]

In [None]:
# Check that the first example's attention mask and input_ids have equal length.
len(trainer.train_dataset[0]["attention_mask"]) == len(
 trainer.train_dataset[0]["input_ids"]
)

In [None]:
# Look at the label stored for the first training example.
trainer.train_dataset[0]["label"]

In [None]:
# Names attached to the "label" feature of the training set.
trainer.train_dataset.features["label"].names

In [None]:
# Take one batch from the training dataloader and stop right away.
for batch in trainer.get_train_dataloader():
 break

In [None]:
# Grab the collate function used by the training dataloader and display it.
data_collator = trainer.get_train_dataloader().collate_fn
data_collator

In [None]:
from datasets import load_dataset, load_metric
from transformers import (
 AutoTokenizer,
 AutoModelForSequenceClassification,
 DataCollatorWithPadding,
 TrainingArguments,
 Trainer,
)

# Load the GLUE "mnli" configuration through load_dataset.
raw_datasets = load_dataset("glue", "mnli")

# Checkpoint name used for the tokenizer here and for the model later in this cell.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
 """Tokenize the "premise" and "hypothesis" fields of `examples` together.

 Both fields are handed to the module-level `tokenizer` with
 `truncation=True`, and the tokenizer's result is returned as is.
 """
 premises = examples["premise"]
 hypotheses = examples["hypothesis"]
 return tokenizer(premises, hypotheses, truncation=True)


# Run preprocess_function over raw_datasets with map (batched=True).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# NOTE(review): no num_labels is passed here, whereas later cells in this
# notebook pass num_labels=3 — presumably left out on purpose for the
# debugging walkthrough, so it is kept as is.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# The first argument has no placeholders, so it is a plain string rather than
# an f-string; the resulting value is unchanged.
args = TrainingArguments(
 "distilbert-finetuned-mnli",
 evaluation_strategy="epoch",
 save_strategy="epoch",
 learning_rate=2e-5,
 num_train_epochs=3,
 weight_decay=0.01,
)

# GLUE "mnli" metric object; compute_metrics calls its compute() method.
metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
 """Score an evaluation pair with the module-level `metric`.

 `eval_pred` is unpacked into (predictions, labels) and both values are
 forwarded to `metric.compute` unchanged.
 """
 model_outputs, gold_labels = eval_pred
 return metric.compute(predictions=model_outputs, references=gold_labels)


# Collator built from the tokenizer; it is passed explicitly to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer over the tokenized splits, now with an explicit data_collator and tokenizer.
trainer = Trainer(
 model,
 args,
 train_dataset=tokenized_datasets["train"],
 eval_dataset=tokenized_datasets["validation_matched"],
 compute_metrics=compute_metrics,
 data_collator=data_collator,
 tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Call the training dataloader's collate function by hand on the first four training examples.
data_collator = trainer.get_train_dataloader().collate_fn
batch = data_collator([trainer.train_dataset[i] for i in range(4)])

In [None]:
# Manual collation again, this time on the dataset returned by the Trainer's
# private _remove_unused_columns helper instead of trainer.train_dataset itself.
data_collator = trainer.get_train_dataloader().collate_fn
actual_train_set = trainer._remove_unused_columns(trainer.train_dataset)
batch = data_collator([actual_train_set[i] for i in range(4)])

In [None]:
# Take one batch from the training dataloader and stop right away.
for batch in trainer.get_train_dataloader():
 break

In [None]:
# Call .cpu() on the model, then run it on the batch unpacked as keyword arguments.
outputs = trainer.model.cpu()(**batch)

In [None]:
# Inspect the num_labels value stored in the model config.
trainer.model.config.num_labels

In [None]:
from datasets import load_dataset, load_metric
from transformers import (
 AutoTokenizer,
 AutoModelForSequenceClassification,
 DataCollatorWithPadding,
 TrainingArguments,
 Trainer,
)

# Load the GLUE "mnli" configuration through load_dataset.
raw_datasets = load_dataset("glue", "mnli")

# Checkpoint name used for the tokenizer here and for the model later in this cell.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
 """Tokenize the "premise" and "hypothesis" fields of `examples` together.

 Both fields are handed to the module-level `tokenizer` with
 `truncation=True`, and the tokenizer's result is returned as is.
 """
 premises = examples["premise"]
 hypotheses = examples["hypothesis"]
 return tokenizer(premises, hypotheses, truncation=True)


# Run preprocess_function over raw_datasets with map (batched=True).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# The classification model is loaded with an explicit num_labels=3.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

# The first argument has no placeholders, so it is a plain string rather than
# an f-string; the resulting value is unchanged.
args = TrainingArguments(
 "distilbert-finetuned-mnli",
 evaluation_strategy="epoch",
 save_strategy="epoch",
 learning_rate=2e-5,
 num_train_epochs=3,
 weight_decay=0.01,
)

# GLUE "mnli" metric object; compute_metrics calls its compute() method.
metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
 """Score an evaluation pair with the module-level `metric`.

 `eval_pred` is unpacked into (predictions, labels) and both values are
 forwarded to `metric.compute` unchanged.
 """
 model_outputs, gold_labels = eval_pred
 return metric.compute(predictions=model_outputs, references=gold_labels)


# Collator built from the tokenizer; it is passed explicitly to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Only build the Trainer here; this cell does not call trainer.train().
trainer = Trainer(
 model,
 args,
 train_dataset=tokenized_datasets["train"],
 eval_dataset=tokenized_datasets["validation_matched"],
 compute_metrics=compute_metrics,
 data_collator=data_collator,
 tokenizer=tokenizer,
)

In [None]:
# Take one batch from the training dataloader and stop right away.
for batch in trainer.get_train_dataloader():
 break

# Call .cpu() on the model, then run it on that batch.
outputs = trainer.model.cpu()(**batch)

In [None]:
import torch

# Select "cuda" when torch reports it as available, "cpu" otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Send every entry of the batch to the selected device.
batch = {name: tensor.to(device) for name, tensor in batch.items()}

# Send the model to the same device and run it on the batch.
outputs = trainer.model.to(device)(**batch)

In [None]:
# Read the loss off the model outputs and call backward() on it.
loss = outputs.loss
loss.backward()

In [None]:
# Ask the Trainer to create its optimizer, then perform a single step with it.
trainer.create_optimizer()
trainer.optimizer.step()

In [None]:
# Launch the full training run.
# This will take a long time and error out, so you shouldn't run this cell
trainer.train()

In [None]:
# Call the Trainer's evaluate() on its own, without training.
trainer.evaluate()

In [None]:
# Take one batch from the evaluation dataloader and stop right away.
for batch in trainer.get_eval_dataloader():
 break

# Move the batch to `device` (defined in an earlier cell of this notebook).
batch = {k: v.to(device) for k, v in batch.items()}

# Forward pass under torch.no_grad().
with torch.no_grad():
 outputs = trainer.model(**batch)

In [None]:
# Convert the logits and the batch labels to NumPy arrays via .cpu().numpy().
predictions = outputs.logits.cpu().numpy()
labels = batch["labels"].cpu().numpy()

# Call compute_metrics by hand on this single batch.
compute_metrics((predictions, labels))

In [None]:
# Compare the shapes of the predictions and labels arrays.
predictions.shape, labels.shape

In [None]:
import numpy as np


def compute_metrics(eval_pred):
 """Score an evaluation pair with the module-level `metric`.

 `eval_pred` is unpacked into (predictions, labels). The predictions are
 reduced with `np.argmax` along axis 1 before being passed, together with
 the labels, to `metric.compute`.
 """
 scores, gold_labels = eval_pred
 predicted_classes = np.argmax(scores, axis=1)
 return metric.compute(predictions=predicted_classes, references=gold_labels)


# Call the compute_metrics defined just above on the existing predictions/labels pair.
compute_metrics((predictions, labels))

In [None]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import (
 AutoTokenizer,
 AutoModelForSequenceClassification,
 DataCollatorWithPadding,
 TrainingArguments,
 Trainer,
)

# Load the GLUE "mnli" configuration through load_dataset.
raw_datasets = load_dataset("glue", "mnli")

# Checkpoint name used for the tokenizer here and for the model later in this cell.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
 """Tokenize the "premise" and "hypothesis" fields of `examples` together.

 Both fields are handed to the module-level `tokenizer` with
 `truncation=True`, and the tokenizer's result is returned as is.
 """
 premises = examples["premise"]
 hypotheses = examples["hypothesis"]
 return tokenizer(premises, hypotheses, truncation=True)


# Run preprocess_function over raw_datasets with map (batched=True).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# The classification model is loaded with an explicit num_labels=3.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

# The first argument has no placeholders, so it is a plain string rather than
# an f-string; the resulting value is unchanged.
args = TrainingArguments(
 "distilbert-finetuned-mnli",
 evaluation_strategy="epoch",
 save_strategy="epoch",
 learning_rate=2e-5,
 num_train_epochs=3,
 weight_decay=0.01,
)

# GLUE "mnli" metric object; compute_metrics calls its compute() method.
metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
 """Score an evaluation pair with the module-level `metric`.

 `eval_pred` is unpacked into (predictions, labels). The predictions are
 reduced with `np.argmax` along axis 1 before being passed, together with
 the labels, to `metric.compute`.
 """
 scores, gold_labels = eval_pred
 predicted_classes = np.argmax(scores, axis=1)
 return metric.compute(predictions=predicted_classes, references=gold_labels)


# Collator built from the tokenizer; it is passed explicitly to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer over the tokenized splits with an explicit data_collator and tokenizer;
# training is started right after construction.
trainer = Trainer(
 model,
 args,
 train_dataset=tokenized_datasets["train"],
 eval_dataset=tokenized_datasets["validation_matched"],
 compute_metrics=compute_metrics,
 data_collator=data_collator,
 tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Take one batch from the training dataloader and stop right away.
for batch in trainer.get_train_dataloader():
 break

# Move the batch to `device` (defined in an earlier cell of this notebook).
batch = {k: v.to(device) for k, v in batch.items()}
trainer.create_optimizer()

# Repeat 20 optimization steps on this same single batch.
for _ in range(20):
 outputs = trainer.model(**batch)
 loss = outputs.loss
 loss.backward()
 trainer.optimizer.step()
 # Reset gradients via zero_grad() before the next iteration.
 trainer.optimizer.zero_grad()

In [None]:
# Re-run the model on the same batch under torch.no_grad().
with torch.no_grad():
 outputs = trainer.model(**batch)
preds = outputs.logits
labels = batch["labels"]

# Convert both tensors to NumPy arrays (via .cpu().numpy()) and score them with compute_metrics.
compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))