{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Token classification (TensorFlow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "raw_datasets = load_dataset(\"conll2003\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 14041\n", " })\n", " validation: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 3250\n", " })\n", " test: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 3453\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets[\"train\"][0][\"tokens\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets[\"train\"][0][\"ner_tags\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], names_file=None, id=None), length=-1, id=None)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n", "ner_feature" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label_names = ner_feature.feature.names\n", "label_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'EU rejects German call to boycott British lamb .'\n", "'B-ORG O B-MISC O O O B-MISC O O'" ] }, 
"execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words = raw_datasets[\"train\"][0][\"tokens\"]\n", "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "line1 = \"\"\n", "line2 = \"\"\n", "for word, label in zip(words, labels):\n", " full_label = label_names[label]\n", " max_length = max(len(word), len(full_label))\n", " line1 += word + \" \" * (max_length - len(word) + 1)\n", " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", "\n", "print(line1)\n", "print(line2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_checkpoint = \"bert-base-cased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", "inputs.tokens()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs.word_ids()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def align_labels_with_tokens(labels, word_ids):\n", " new_labels = []\n", " current_word = None\n", " for word_id in word_ids:\n", " if word_id != current_word:\n", " # Start of a new word!\n", " current_word = word_id\n", " label = -100 if word_id is None else labels[word_id]\n", " new_labels.append(label)\n", " elif word_id is None:\n", " # Special token\n", " new_labels.append(-100)\n", " else:\n", " # Same word as previous token\n", " label = labels[word_id]\n", " # If the label is B-XXX we change it to I-XXX\n", " if label % 2 == 1:\n", " label += 1\n", " new_labels.append(label)\n", "\n", " return new_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]\n", "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "word_ids = inputs.word_ids()\n", "print(labels)\n", "print(align_labels_with_tokens(labels, word_ids))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_align_labels(examples):\n", " tokenized_inputs = tokenizer(\n", " examples[\"tokens\"], truncation=True, is_split_into_words=True\n", " )\n", " all_labels = examples[\"ner_tags\"]\n", " new_labels = []\n", " for i, labels in enumerate(all_labels):\n", " word_ids = tokenized_inputs.word_ids(i)\n", " new_labels.append(align_labels_with_tokens(labels, word_ids))\n", "\n", " tokenized_inputs[\"labels\"] = new_labels\n", " return tokenized_inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_datasets = 
raw_datasets.map(\n", " tokenize_and_align_labels,\n", " batched=True,\n", " remove_columns=raw_datasets[\"train\"].column_names,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorForTokenClassification\n", "\n", "data_collator = DataCollatorForTokenClassification(\n", " tokenizer=tokenizer, return_tensors=\"tf\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100],\n", " [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n", "batch[\"labels\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n", "[-100, 1, 2, -100]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for i in range(2):\n", " print(tokenized_datasets[\"train\"][i][\"labels\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n", " columns=[\"attention_mask\", \"input_ids\", \"labels\", \"token_type_ids\"],\n", " collate_fn=data_collator,\n", " shuffle=True,\n", " batch_size=16,\n", ")\n", "\n", "tf_eval_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n", " columns=[\"attention_mask\", \"input_ids\", \"labels\", \"token_type_ids\"],\n", " collate_fn=data_collator,\n", " shuffle=False,\n", " batch_size=16,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "id2label = {i: label for i, label in enumerate(label_names)}\n", "label2id = {v: k for k, v in id2label.items()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import TFAutoModelForTokenClassification\n", "\n", "model = TFAutoModelForTokenClassification.from_pretrained(\n", " model_checkpoint,\n", " id2label=id2label,\n", " label2id=label2id,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config.num_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import create_optimizer\n", "import tensorflow as tf\n", "\n", "# Train in mixed-precision float16\n", "# Comment this line out if you're using a GPU that will not benefit from this\n", "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")\n", "\n", "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n", "# by the total number of epochs. 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import create_optimizer\n", "import tensorflow as tf\n", "\n", "# Train in mixed-precision float16\n", "# Comment this line out if you're using a GPU that will not benefit from this\n", "tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")\n", "\n", "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n", "# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,\n", "# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.\n", "num_epochs = 3\n", "num_train_steps = len(tf_train_dataset) * num_epochs\n", "\n", "optimizer, schedule = create_optimizer(\n", " init_lr=2e-5,\n", " num_warmup_steps=0,\n", " num_train_steps=num_train_steps,\n", " weight_decay_rate=0.01,\n", ")\n", "model.compile(optimizer=optimizer)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers.keras_callbacks import PushToHubCallback\n", "\n", "callback = PushToHubCallback(output_dir=\"bert-finetuned-ner\", tokenizer=tokenizer)\n", "\n", "model.fit(\n", " tf_train_dataset,\n", " validation_data=tf_eval_dataset,\n", " callbacks=[callback],\n", " epochs=num_epochs,\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install seqeval" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import evaluate\n", "\n", "metric = evaluate.load(\"seqeval\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "labels = [label_names[i] for i in labels]\n", "labels" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'MISC': {'precision': 1.0, 'recall': 0.5, 'f1': 0.67, 'number': 2},\n", " 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},\n", " 'overall_precision': 1.0,\n", " 'overall_recall': 0.67,\n", " 'overall_f1': 0.8,\n", " 'overall_accuracy': 0.89}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = labels.copy()\n", "predictions[2] = \"O\"\n", "metric.compute(predictions=[predictions], references=[labels])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'LOC': {'precision': 0.91, 'recall': 0.92, 'f1': 0.91, 'number': 1668},\n", " 'MISC': {'precision': 0.70, 'recall': 0.79, 'f1': 0.74, 'number': 702},\n", " 'ORG': {'precision': 0.85, 'recall': 0.90, 'f1': 0.88, 'number': 1661},\n", " 'PER': {'precision': 0.95, 'recall': 0.95, 'f1': 0.95, 'number': 1617},\n", " 'overall_precision': 0.87,\n", " 'overall_recall': 0.91,\n", " 'overall_f1': 0.89,\n", " 'overall_accuracy': 0.97}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "all_predictions = []\n", "all_labels = []\n", "for batch in tf_eval_dataset:\n", " logits = model.predict_on_batch(batch)[\"logits\"]\n", " labels = batch[\"labels\"]\n", " predictions = np.argmax(logits, axis=-1)\n", " for prediction, label in zip(predictions, labels):\n", " for predicted_idx, label_idx in zip(prediction, label):\n", " if label_idx == -100:\n", " continue\n", " all_predictions.append(label_names[predicted_idx])\n", " all_labels.append(label_names[label_idx])\n", "metric.compute(predictions=[all_predictions], references=[all_labels])" ] },
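{ "cell_type": "markdown", "metadata": {}, "source": [ "The `pipeline` below wraps tokenization, prediction, and entity grouping in a single call, but we can also inspect the raw per-token predictions with the model and tokenizer still in memory. A minimal sketch (the example words are our own):" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "# Tokenize a pre-split example, run the fine-tuned model, and map each\n", "# token to its predicted label name\n", "example_words = [\"My\", \"name\", \"is\", \"Sylvain\", \".\"]\n", "encoded = tokenizer(example_words, is_split_into_words=True, return_tensors=\"tf\")\n", "logits = model(**encoded).logits\n", "predicted_ids = np.argmax(logits, axis=-1)[0]\n", "for token, pred_id in zip(encoded.tokens(), predicted_ids):\n", "    print(f\"{token}: {model.config.id2label[int(pred_id)]}\")" ] },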
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER', 'score': 0.9988506, 'word': 'Sylvain', 'start': 11, 'end': 18},\n", " {'entity_group': 'ORG', 'score': 0.9647625, 'word': 'Hugging Face', 'start': 33, 'end': 45},\n", " {'entity_group': 'LOC', 'score': 0.9986118, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "# Replace this with your own checkpoint\n", "model_checkpoint = \"huggingface-course/bert-finetuned-ner\"\n", "token_classifier = pipeline(\n", " \"token-classification\", model=model_checkpoint, aggregation_strategy=\"simple\"\n", ")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] } ], "metadata": { "colab": { "name": "Token classification (TensorFlow)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }