{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/PrX4CjrVnNc?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#@title\n",
    "from IPython.display import HTML\n",
    "\n",
    "HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/PrX4CjrVnNc?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers and Datasets libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "token_classifier = pipeline(\"token-classification\")\n",
    "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n",
    "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, TFAutoModelForTokenClassification\n",
    "\n",
    "model_checkpoint = \"dbmdz/bert-large-cased-finetuned-conll03-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint)\n",
    "\n",
    "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n",
    "inputs = tokenizer(example, return_tensors=\"tf\")\n",
    "outputs = model(**inputs)\n",
    "print(inputs[\"input_ids\"].shape)\n",
    "print(outputs.logits.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "probabilities = tf.math.softmax(outputs.logits, axis=-1)[0]\n",
    "probabilities = probabilities.numpy().tolist()\n",
    "predictions = tf.math.argmax(predictions, axis=-1)[0]\n",
    "predictions = predictions.numpy().tolist()\n",
    "print(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.config.id2label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = []\n",
    "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n",
    "tokens = inputs_with_offsets.tokens()\n",
    "offsets = inputs_with_offsets[\"offset_mapping\"]\n",
    "\n",
    "for idx, pred in enumerate(predictions):\n",
    "    label = model.config.id2label[pred]\n",
    "    if label != \"O\":\n",
    "        start, end = offsets[idx]\n",
    "        results.append(\n",
    "            {\"entity\": label, \"score\": probabilities[idx][pred],\n",
    "             \"word\": tokens[idx], \"start\": start, \"end\": end}\n",
    "        )\n",
    "\n",
    "print(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "label_map = model.config.id2label\n",
    "results = []\n",
    "idx = 0\n",
    "while idx < len(predictions):\n",
    "    pred = predictions[idx]\n",
    "    label = label_map[pred]\n",
    "    if label != \"O\":\n",
    "        # Remove the B- or I-\n",
    "        label = label[2:]\n",
    "        start, _ = offsets[idx]\n",
    "\n",
    "        # Grab all the tokens labeled with I-label\n",
    "        all_scores = []\n",
    "        while idx < len(predictions) and label_map[predictions[idx]] == f\"I-{label}\":\n",
    "            all_scores.append(probabilities[idx][pred])\n",
    "            _, end = offsets[idx]\n",
    "            idx += 1\n",
    "\n",
    "        # The score is the mean of all the scores of the token in that grouped entity.\n",
    "        score = np.mean(all_scores).item()\n",
    "        word = example[start:end]\n",
    "        results.append(\n",
    "            {\"entity_group\": label, \"score\": score,\n",
    "             \"word\": word, \"start\": start, \"end\": end}\n",
    "        )\n",
    "    idx += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "name": "Inside the Token classification pipeline (TensorFlow)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}