# 使用 Trainer API 或者 Keras 微调一个模型

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import numpy as np

# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
checkpoint = "bert-base-uncased"

# Load the "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" entries of `example` as a pair.

    Args:
        example: A mapping providing "sentence1" and "sentence2" entries.

    Returns:
        Whatever the module-level `tokenizer` returns for the pair. Truncation
        is enabled; no padding is requested here.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Apply the tokenizer to every split, passing examples to it in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


def _as_tf_dataset(split_name, shuffle):
    """Convert one tokenized split to a tf.data.Dataset in batches of 8."""
    return tokenized_datasets[split_name].to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=8,
    )


# Only the training split is shuffled; validation keeps its original order.
tf_train_dataset = _as_tf_dataset("train", shuffle=True)

tf_validation_dataset = _as_tf_dataset("validation", shuffle=False)

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Sequence-classification model built from the checkpoint, with two output labels.
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# from_logits=True tells the loss to expect unnormalized scores.
baseline_loss = SparseCategoricalCrossentropy(from_logits=True)

# The optimizer is given by name only, so no learning rate is chosen here.
model.compile(optimizer="adam", loss=baseline_loss, metrics=["accuracy"])

# No `epochs` argument is passed for this first run.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Not used below: tf_train_dataset was already batched when it was created.
batch_size = 8
num_epochs = 3

# The number of training steps is the number of samples in the dataset divided
# by the batch size, multiplied by the number of epochs. tf_train_dataset is an
# already-batched tf.data.Dataset, not the original Hugging Face Dataset, so
# its len() is already a batch count (about num_samples // batch_size).
num_train_steps = len(tf_train_dataset) * num_epochs

# Take the learning rate from 5e-5 down to 0.0 over the whole of training.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)

# Adam driven by the schedule instead of a fixed learning rate.
opt = Adam(learning_rate=lr_scheduler)

In [None]:
import tensorflow as tf

# Re-create the model from the checkpoint so this run does not continue from
# the weights produced by the earlier model.fit call.
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# from_logits=True tells the loss to expect unnormalized scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile with the optimizer that carries the learning-rate schedule.
model.compile(
    optimizer=opt,
    loss=loss,
    metrics=["accuracy"],
)

In [None]:
# Train for num_epochs epochs. The learning-rate schedule attached to the
# optimizer was sized as len(tf_train_dataset) * num_epochs steps, so reusing
# the same variable keeps the decay aligned with the actual length of training.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs)

In [None]:
# Run the model over the validation set and keep only the "logits" entry.
prediction_output = model.predict(tf_validation_dataset)
preds = prediction_output["logits"]

In [None]:
# Pick the highest-scoring class for each row of the 2-D logits array.
class_preds = np.argmax(preds, axis=-1)

# Show the shapes of the logits and of the predicted class indices.
logits_shape, labels_shape = preds.shape, class_preds.shape
print(logits_shape, labels_shape)

(408, 2) (408,)

In [None]:
import evaluate

# Metric for the "mrpc" configuration of "glue".
metric = evaluate.load("glue", "mrpc")

# Compare the predicted classes with the validation split's "label" column.
validation_labels = raw_datasets["validation"]["label"]
metric.compute(predictions=class_preds, references=validation_labels)

{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}