{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "IIgKlE2DbZaP" }, "source": [ "# Préparer des données (TensorFlow)" ] }, { "cell_type": "markdown", "metadata": { "id": "Nc00b8BWbZaQ" }, "source": [ "Installez les bibliothèques 🤗 *Transformers* et 🤗 *Datasets* pour exécuter ce *notebook*." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "me9NX9X4bZaQ" }, "outputs": [], "source": [ "!pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "FFc_IGipbZaS" }, "outputs": [], "source": [ "import tensorflow as tf\n", "import numpy as np\n", "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n", "\n", "# Same as before\n", "checkpoint = \"camembert-base\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "sequences = [\n", " \"J'ai attendu un cours d'HuggingFace toute ma vie.\", \n", " \"Je déteste tellement ça !\"]\n", "batch = dict(tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\"))\n", "\n", "# This is new\n", "model.compile(optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\")\n", "labels = tf.convert_to_tensor([1, 1])\n", "model.train_on_batch(batch, labels)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mnRaEEZibZaT" }, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "raw_datasets = load_dataset(\"paws-x\", \"fr\")\n", "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HO2D7pJYbZaU" }, "outputs": [], "source": [ "raw_train_dataset = raw_datasets[\"train\"]\n", "raw_train_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "iJxImFEQbZaU" }, "outputs": [], "source": [ "raw_train_dataset.features" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mcfjYv9hbZaV" }, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = \"camembert-base\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "tokenized_sentences_1 = tokenizer(raw_datasets[\"train\"][\"sentence1\"])\n", "tokenized_sentences_2 = tokenizer(raw_datasets[\"train\"][\"sentence2\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Jd5FXWQBbZaW" }, "outputs": [], "source": [ "inputs = tokenizer(\"C'est la première phrase.\", \"C'est la deuxième.\")\n", "inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ly68WIIHbZaX" }, "outputs": [], "source": [ "tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Avts72_WbZaY" }, "outputs": [], "source": [ "tokenized_dataset = tokenizer(\n", " raw_datasets[\"train\"][\"sentence1\"],\n", " raw_datasets[\"train\"][\"sentence2\"],\n", " padding=True,\n", " truncation=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TMvS1GB-bZaZ" }, "outputs": [], "source": [ "def tokenize_function(example):\n", " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AH7OVmjAbZaa" }, "outputs": [], "source": [ "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n", "tokenized_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "B-mS1PgvbZaa" }, "outputs": [], 
"source": [ "from transformers import DataCollatorWithPadding\n", "\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "huGLcqWdbZab" }, "outputs": [], "source": [ "samples = tokenized_datasets[\"train\"][:8]\n", "samples = {k: v for k, v in samples.items() if k not in [\"idx\", \"sentence1\", \"sentence2\"]}\n", "[len(x) for x in samples[\"input_ids\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "d6g3J9rbbZab" }, "outputs": [], "source": [ "batch = data_collator(samples)\n", "{k: v.shape for k, v in batch.items()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "th3DW4MhbZab" }, "outputs": [], "source": [ "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n", " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n", " label_cols=[\"labels\"],\n", " shuffle=True,\n", " collate_fn=data_collator,\n", " batch_size=8,\n", ")\n", "\n", "tf_validation_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n", " columns=[\"attention_mask\", \"input_ids\", \"token_type_ids\"],\n", " label_cols=[\"labels\"],\n", " shuffle=False,\n", " collate_fn=data_collator,\n", " batch_size=8,\n", ")" ] } ], "metadata": { "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 1 }