{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Translation (TensorFlow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "raw_datasets = load_dataset(\"kde4\", lang1=\"en\", lang2=\"fr\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['id', 'translation'],\n", " num_rows: 210173\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['id', 'translation'],\n", " num_rows: 189155\n", " })\n", " test: Dataset({\n", " features: ['id', 'translation'],\n", " num_rows: 21018\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "split_datasets = raw_datasets[\"train\"].train_test_split(train_size=0.9, seed=20)\n", "split_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "split_datasets[\"validation\"] = split_datasets.pop(\"test\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'en': 'Default to expanded threads',\n", " 'fr': 'Par défaut, développer les fils de discussion'}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "split_datasets[\"train\"][1][\"translation\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'translation_text': 'Par défaut pour les threads élargis'}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n", "translator = pipeline(\"translation\", model=model_checkpoint)\n", "translator(\"Default to expanded threads\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'en': 'Unable to import %1 using the OFX importer plugin. This file is not the correct format.',\n", " 'fr': \"Impossible d'importer %1 en utilisant le module d'extension d'importation OFX. 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'en': 'Unable to import %1 using the OFX importer plugin. This file is not the correct format.',\n", " 'fr': \"Impossible d'importer %1 en utilisant le module d'extension d'importation OFX. Ce fichier n'a pas un format correct.\"}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "split_datasets[\"train\"][172][\"translation\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'translation_text': \"Impossible d'importer %1 en utilisant le plugin d'importateur OFX. Ce fichier n'est pas le bon format.\"}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "translator(\n", " \"Unable to import %1 using the OFX importer plugin. This file is not the correct format.\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_checkpoint = \"Helsinki-NLP/opus-mt-en-fr\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors=\"tf\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': [47591, 12, 9842, 19634, 9, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "en_sentence = split_datasets[\"train\"][1][\"translation\"][\"en\"]\n", "fr_sentence = split_datasets[\"train\"][1][\"translation\"][\"fr\"]\n", "\n", "inputs = tokenizer(en_sentence, text_target=fr_sentence)\n", "inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '']\n", "['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wrong_targets = tokenizer(fr_sentence)\n", "print(tokenizer.convert_ids_to_tokens(wrong_targets[\"input_ids\"]))\n", "print(tokenizer.convert_ids_to_tokens(inputs[\"labels\"]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "max_length = 128\n", "\n", "\n", "def preprocess_function(examples):\n", " inputs = [ex[\"en\"] for ex in examples[\"translation\"]]\n", " targets = [ex[\"fr\"] for ex in examples[\"translation\"]]\n", " model_inputs = tokenizer(\n", " inputs, text_target=targets, max_length=max_length, truncation=True\n", " )\n", " return model_inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_datasets = split_datasets.map(\n", " preprocess_function,\n", " batched=True,\n", " remove_columns=split_datasets[\"train\"].column_names,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import TFAutoModelForSeq2SeqLM\n", "\n", "model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorForSeq2Seq\n", "\n", "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors=\"tf\")" ] },
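{ "cell_type": "markdown", "metadata": {}, "source": [ "Before building batches, we can sanity-check the preprocessing by decoding the tokenized labels of one example back into text (a quick check added here, not in the original notebook):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Decode the label token IDs of one training example; the French target should come back intact,\n", "# followed by the end-of-sequence token.\n", "print(tokenizer.decode(tokenized_datasets[\"train\"][1][\"labels\"]))" ] },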
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['attention_mask', 'input_ids', 'labels', 'decoder_input_ids'])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(1, 3)])\n", "batch.keys()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 577, 5891, 2, 3184, 16, 2542, 5, 1710, 0, -100,\n", " -100, -100, -100, -100, -100, -100],\n", " [ 1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817,\n", " 550, 7032, 5821, 7907, 12649, 0]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch[\"labels\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[59513, 577, 5891, 2, 3184, 16, 2542, 5, 1710, 0,\n", " 59513, 59513, 59513, 59513, 59513, 59513],\n", " [59513, 1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124,\n", " 817, 550, 7032, 5821, 7907, 12649]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch[\"decoder_input_ids\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]\n", "[1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649, 0]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for i in range(1, 3):\n", " print(tokenized_datasets[\"train\"][i][\"labels\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf_train_dataset = model.prepare_tf_dataset(\n", " tokenized_datasets[\"train\"],\n", " collate_fn=data_collator,\n", " shuffle=True,\n", " batch_size=32,\n", ")\n", "tf_eval_dataset = model.prepare_tf_dataset(\n", " tokenized_datasets[\"validation\"],\n", " collate_fn=data_collator,\n", " shuffle=False,\n", " batch_size=16,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install sacrebleu" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import evaluate\n", "\n", "metric = evaluate.load(\"sacrebleu\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'score': 46.750469682990165,\n", " 'counts': [11, 6, 4, 3],\n", " 'totals': [12, 11, 10, 9],\n", " 'precisions': [91.67, 54.54, 40.0, 33.33],\n", " 'bp': 0.9200444146293233,\n", " 'sys_len': 12,\n", " 'ref_len': 13}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = [\n", " \"This plugin lets you translate web pages between several languages automatically.\"\n", "]\n", "references = [\n", " [\n", " \"This plugin allows you to automatically translate web pages between several languages.\"\n", " ]\n", "]\n", "metric.compute(predictions=predictions, references=references)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'score': 1.683602693167689,\n", " 'counts': [1, 0, 0, 0],\n", " 'totals': [4, 3, 2, 1],\n", " 'precisions': [25.0, 16.67, 12.5, 12.5],\n", " 'bp': 0.10539922456186433,\n", " 'sys_len': 4,\n", " 'ref_len': 13}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = [\"This This This This\"]\n", "references = [\n", " [\n", " \"This plugin allows you to automatically translate web pages between several languages.\"\n", " ]\n", "]\n", "metric.compute(predictions=predictions, references=references)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'score': 0.0,\n", " 'counts': [2, 1, 0, 0],\n", " 'totals': [2, 1, 0, 0],\n", " 'precisions': [100.0, 100.0, 0.0, 0.0],\n", " 'bp': 0.004086771438464067,\n", " 'sys_len': 2,\n", " 'ref_len': 13}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = [\"This plugin\"]\n", "references = [\n", " [\n", " \"This plugin allows you to automatically translate web pages between several languages.\"\n", " ]\n", "]\n", "metric.compute(predictions=predictions, references=references)" ] },
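{ "cell_type": "markdown", "metadata": {}, "source": [ "SacreBLEU scores a whole corpus at once and supports several references per prediction, which is why `references` is a list of lists. Here is a small illustrative example with two predictions, each paired with one reference (the sentences are invented for this sketch, not taken from the dataset):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Each prediction gets its own list of references; all reference lists must have the same length.\n", "predictions = [\"the plugin translates web pages\", \"threads are expanded by default\"]\n", "references = [\n", " [\"this plugin translates web pages automatically\"],\n", " [\"threads are expanded by default\"],\n", "]\n", "metric.compute(predictions=predictions, references=references)" ] },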
"execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'score': 0.0,\n", " 'counts': [2, 1, 0, 0],\n", " 'totals': [2, 1, 0, 0],\n", " 'precisions': [100.0, 100.0, 0.0, 0.0],\n", " 'bp': 0.004086771438464067,\n", " 'sys_len': 2,\n", " 'ref_len': 13}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = [\"This plugin\"]\n", "references = [\n", " [\n", " \"This plugin allows you to automatically translate web pages between several languages.\"\n", " ]\n", "]\n", "metric.compute(predictions=predictions, references=references)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import tensorflow as tf\n", "from tqdm import tqdm\n", "\n", "generation_data_collator = DataCollatorForSeq2Seq(\n", " tokenizer, model=model, return_tensors=\"tf\", pad_to_multiple_of=128\n", ")\n", "\n", "tf_generate_dataset = model.prepare_tf_dataset(\n", " tokenized_datasets[\"validation\"],\n", " collate_fn=generation_data_collator,\n", " shuffle=False,\n", " batch_size=8,\n", ")\n", "\n", "\n", "@tf.function(jit_compile=True)\n", "def generate_with_xla(batch):\n", " return model.generate(\n", " input_ids=batch[\"input_ids\"],\n", " attention_mask=batch[\"attention_mask\"],\n", " max_new_tokens=128,\n", " )\n", "\n", "\n", "def compute_metrics():\n", " all_preds = []\n", " all_labels = []\n", "\n", " for batch, labels in tqdm(tf_generate_dataset):\n", " predictions = generate_with_xla(batch)\n", " decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)\n", " labels = labels.numpy()\n", " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n", " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n", " decoded_preds = [pred.strip() for pred in decoded_preds]\n", " decoded_labels = [[label.strip()] for label in decoded_labels]\n", " all_preds.extend(decoded_preds)\n", " all_labels.extend(decoded_labels)\n", "\n", " result = metric.compute(predictions=all_preds, references=all_labels)\n", " return {\"bleu\": result[\"score\"]}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(compute_metrics())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import create_optimizer\n", "from transformers.keras_callbacks import PushToHubCallback\n", "import tensorflow as tf\n", "\n", "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n", "# by the total number of epochs. 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import create_optimizer\n", "import tensorflow as tf\n", "\n", "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n", "# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,\n", "# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.\n", "num_epochs = 3\n", "num_train_steps = len(tf_train_dataset) * num_epochs\n", "\n", "optimizer, schedule = create_optimizer(\n", " init_lr=5e-5,\n", " num_warmup_steps=0,\n", " num_train_steps=num_train_steps,\n", " weight_decay_rate=0.01,\n", ")\n", "model.compile(optimizer=optimizer)\n", "\n", "# Train in mixed-precision float16\n", "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers.keras_callbacks import PushToHubCallback\n", "\n", "callback = PushToHubCallback(\n", " output_dir=\"marian-finetuned-kde4-en-to-fr\", tokenizer=tokenizer\n", ")\n", "\n", "model.fit(\n", " tf_train_dataset,\n", " validation_data=tf_eval_dataset,\n", " callbacks=[callback],\n", " epochs=num_epochs,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(compute_metrics())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'translation_text': 'Par défaut, développer les fils de discussion'}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "# Replace this with your own checkpoint\n", "model_checkpoint = \"huggingface-course/marian-finetuned-kde4-en-to-fr\"\n", "translator = pipeline(\"translation\", model=model_checkpoint)\n", "translator(\"Default to expanded threads\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'translation_text': \"Impossible d'importer %1 en utilisant le module externe d'importation OFX. Ce fichier n'est pas le bon format.\"}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "translator(\n", " \"Unable to import %1 using the OFX importer plugin. This file is not the correct format.\"\n", ")" ] } ], "metadata": { "colab": { "name": "Translation (TensorFlow)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }