{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Token classification (TensorFlow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "raw_datasets = load_dataset(\"conll2003\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 14041\n", " })\n", " validation: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 3250\n", " })\n", " test: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 3453\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets[\"train\"][0][\"tokens\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets[\"train\"][0][\"ner_tags\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], names_file=None, id=None), length=-1, id=None)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n", "ner_feature" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label_names = ner_feature.feature.names\n", "label_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'EU rejects German call to boycott British lamb .'\n", "'B-ORG O B-MISC O O O B-MISC O O'" ] }, 
"execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words = raw_datasets[\"train\"][0][\"tokens\"]\n", "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "line1 = \"\"\n", "line2 = \"\"\n", "for word, label in zip(words, labels):\n", " full_label = label_names[label]\n", " max_length = max(len(word), len(full_label))\n", " line1 += word + \" \" * (max_length - len(word) + 1)\n", " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", "\n", "print(line1)\n", "print(line2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_checkpoint = \"bert-base-cased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", "inputs.tokens()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs.word_ids()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def align_labels_with_tokens(labels, word_ids):\n", " new_labels = []\n", " current_word = None\n", " for word_id in word_ids:\n", " if word_id != current_word:\n", " # Start of a new word!\n", " current_word = word_id\n", " label = -100 if word_id is None else labels[word_id]\n", " new_labels.append(label)\n", " elif word_id is None:\n", " # Special token\n", " new_labels.append(-100)\n", " else:\n", " # Same word as previous token\n", " label = labels[word_id]\n", " # If the label is B-XXX we change it to I-XXX\n", " if label % 2 == 1:\n", " label += 1\n", " new_labels.append(label)\n", "\n", " return new_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]\n", "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "word_ids = inputs.word_ids()\n", "print(labels)\n", "print(align_labels_with_tokens(labels, word_ids))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_align_labels(examples):\n", " tokenized_inputs = tokenizer(\n", " examples[\"tokens\"], truncation=True, is_split_into_words=True\n", " )\n", " all_labels = examples[\"ner_tags\"]\n", " new_labels = []\n", " for i, labels in enumerate(all_labels):\n", " word_ids = tokenized_inputs.word_ids(i)\n", " new_labels.append(align_labels_with_tokens(labels, word_ids))\n", "\n", " tokenized_inputs[\"labels\"] = new_labels\n", " return tokenized_inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_datasets = 
raw_datasets.map(\n", " tokenize_and_align_labels,\n", " batched=True,\n", " remove_columns=raw_datasets[\"train\"].column_names,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorForTokenClassification\n", "\n", "data_collator = DataCollatorForTokenClassification(\n", " tokenizer=tokenizer, return_tensors=\"tf\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100],\n", " [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n", "batch[\"labels\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n", "[-100, 1, 2, -100]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for i in range(2):\n", " print(tokenized_datasets[\"train\"][i][\"labels\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n", " columns=[\"attention_mask\", \"input_ids\", \"labels\", \"token_type_ids\"],\n", " collate_fn=data_collator,\n", " shuffle=True,\n", " batch_size=16,\n", ")\n", "\n", "tf_eval_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n", " columns=[\"attention_mask\", \"input_ids\", \"labels\", \"token_type_ids\"],\n", " collate_fn=data_collator,\n", " shuffle=False,\n", " batch_size=16,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "id2label = {i: label for i, label in enumerate(label_names)}\n", "label2id = {v: k for k, v in id2label.items()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import TFAutoModelForTokenClassification\n", "\n", "model = TFAutoModelForTokenClassification.from_pretrained(\n", " model_checkpoint,\n", " id2label=id2label,\n", " label2id=label2id,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config.num_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import create_optimizer\n", "import tensorflow as tf\n", "\n", "# Train in mixed-precision float16\n", "# Comment this line out if you're using a GPU that will not benefit from this\n", "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")\n", "\n", "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n", "# by the total number of epochs. 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import create_optimizer\n", "import tensorflow as tf\n", "\n", "# Train in mixed-precision float16\n", "# Comment this line out if you're using a GPU that will not benefit from this\n", "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")\n", "\n", "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n", "# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,\n", "# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.\n", "num_epochs = 3\n", "num_train_steps = len(tf_train_dataset) * num_epochs\n", "\n", "optimizer, schedule = create_optimizer(\n", " init_lr=2e-5,\n", " num_warmup_steps=0,\n", " num_train_steps=num_train_steps,\n", " weight_decay_rate=0.01,\n", ")\n", "model.compile(optimizer=optimizer)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers.keras_callbacks import PushToHubCallback\n", "\n", "callback = PushToHubCallback(output_dir=\"bert-finetuned-ner\", tokenizer=tokenizer)\n", "\n", "model.fit(\n", " tf_train_dataset,\n", " validation_data=tf_eval_dataset,\n", " callbacks=[callback],\n", " epochs=num_epochs,\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install seqeval" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import evaluate\n", "\n", "metric = evaluate.load(\"seqeval\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "labels = [label_names[i] for i in labels]\n", "labels" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'MISC': {'precision': 1.0, 'recall': 0.5, 'f1': 0.67, 'number': 2},\n", " 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},\n", " 'overall_precision': 1.0,\n", " 'overall_recall': 0.67,\n", " 'overall_f1': 0.8,\n", " 'overall_accuracy': 0.89}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = labels.copy()\n", "predictions[2] = \"O\"\n", "metric.compute(predictions=[predictions], references=[labels])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'LOC': {'precision': 0.91, 'recall': 0.92, 'f1': 0.91, 'number': 1668},\n", " 'MISC': {'precision': 0.70, 'recall': 0.79, 'f1': 0.74, 'number': 702},\n", " 'ORG': {'precision': 0.85, 'recall': 0.90, 'f1': 0.88, 'number': 1661},\n", " 'PER': {'precision': 0.95, 'recall': 0.95, 'f1': 0.95, 'number': 1617},\n", " 'overall_precision': 0.87,\n", " 'overall_recall': 0.91,\n", " 'overall_f1': 0.89,\n", " 'overall_accuracy': 0.97}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "all_predictions = []\n", "all_labels = []\n", "for batch in tf_eval_dataset:\n", " logits = model.predict_on_batch(batch)[\"logits\"]\n", " labels = batch[\"labels\"]\n", " predictions = np.argmax(logits, axis=-1)\n", " for prediction, label in zip(predictions, labels):\n", " for predicted_idx, label_idx in zip(prediction, label):\n", " if label_idx == -100:\n", " continue\n", " all_predictions.append(label_names[predicted_idx])\n", " all_labels.append(label_names[label_idx])\n", "metric.compute(predictions=[all_predictions], references=[all_labels])" ] },
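{ "cell_type": "markdown", "metadata": {}, "source": [ "The `pipeline` below wraps tokenization, prediction, and entity grouping in a single call, but we can also inspect the raw per-token predictions with the model and tokenizer still in memory. A minimal sketch (the example words are our own):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "# Tokenize a pre-split example, run the fine-tuned model, and map each\n", "# token to its predicted label name\n", "example_words = [\"My\", \"name\", \"is\", \"Sylvain\", \".\"]\n", "encoded = tokenizer(example_words, is_split_into_words=True, return_tensors=\"tf\")\n", "logits = model(**encoded).logits\n", "predicted_ids = np.argmax(logits, axis=-1)[0]\n", "for token, pred_id in zip(encoded.tokens(), predicted_ids):\n", "    print(f\"{token}: {model.config.id2label[int(pred_id)]}\")" ] },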
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER', 'score': 0.9988506, 'word': 'Sylvain', 'start': 11, 'end': 18},\n", " {'entity_group': 'ORG', 'score': 0.9647625, 'word': 'Hugging Face', 'start': 33, 'end': 45},\n", " {'entity_group': 'LOC', 'score': 0.9986118, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "# Replace this with your own checkpoint\n", "model_checkpoint = \"huggingface-course/bert-finetuned-ner\"\n", "token_classifier = pipeline(\n", " \"token-classification\", model=model_checkpoint, aggregation_strategy=\"simple\"\n", ")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] } ], "metadata": { "colab": { "name": "Token classification (TensorFlow)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }