{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form"
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#@title\n",
"from IPython.display import HTML\n",
"\n",
"HTML('')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import pipeline\n",
"\n",
"token_classifier = pipeline(\"token-classification\")\n",
"token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n",
"token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, TFAutoModelForTokenClassification\n",
"\n",
"model_checkpoint = \"dbmdz/bert-large-cased-finetuned-conll03-english\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint)\n",
"\n",
"example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n",
"inputs = tokenizer(example, return_tensors=\"tf\")\n",
"outputs = model(**inputs)\n",
"print(inputs[\"input_ids\"].shape)\n",
"print(outputs.logits.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"\n",
"probabilities = tf.math.softmax(outputs.logits, axis=-1)[0]\n",
"probabilities = probabilities.numpy().tolist()\n",
"predictions = tf.math.argmax(predictions, axis=-1)[0]\n",
"predictions = predictions.numpy().tolist()\n",
"print(predictions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.config.id2label"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n",
"tokens = inputs_with_offsets.tokens()\n",
"offsets = inputs_with_offsets[\"offset_mapping\"]\n",
"\n",
"for idx, pred in enumerate(predictions):\n",
" label = model.config.id2label[pred]\n",
" if label != \"O\":\n",
" start, end = offsets[idx]\n",
" results.append(\n",
" {\"entity\": label, \"score\": probabilities[idx][pred],\n",
" \"word\": tokens[idx], \"start\": start, \"end\": end}\n",
" )\n",
"\n",
"print(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"label_map = model.config.id2label\n",
"results = []\n",
"idx = 0\n",
"while idx < len(predictions):\n",
" pred = predictions[idx]\n",
" label = label_map[pred]\n",
" if label != \"O\":\n",
" # Remove the B- or I-\n",
" label = label[2:]\n",
" start, _ = offsets[idx]\n",
"\n",
" # Grab all the tokens labeled with I-label\n",
" all_scores = []\n",
" while idx < len(predictions) and label_map[predictions[idx]] == f\"I-{label}\":\n",
" all_scores.append(probabilities[idx][pred])\n",
" _, end = offsets[idx]\n",
" idx += 1\n",
"\n",
" # The score is the mean of all the scores of the token in that grouped entity.\n",
" score = np.mean(all_scores).item()\n",
" word = example[start:end]\n",
" results.append(\n",
" {\"entity_group\": label, \"score\": score,\n",
" \"word\": word, \"start\": start, \"end\": end}\n",
" )\n",
" idx += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "Inside the Token classification pipeline (TensorFlow)",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}