{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form" }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#@title\n", "from IPython.display import HTML\n", "\n", "HTML('')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers and Datasets libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "\n", "token_classifier = pipeline(\"token-classification\")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForTokenClassification\n", "\n", "model_checkpoint = \"\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)\n", "\n", "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n", "inputs = tokenizer(example, return_tensors=\"pt\")\n", "outputs = model(**inputs)\n", "\n", "print(inputs[\"input_ids\"].shape)\n", "print(outputs.logits.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()\n", "predictions = probabilities.argmax(dim=-1)[0].tolist()\n", "print(predictions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.config.id2label" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "results = []\n", "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", "tokens = inputs_with_offsets.tokens()\n", "offsets = inputs_with_offsets[\"offset_mapping\"]\n", "\n", "for idx, pred in enumerate(predictions):\n", " label = model.config.id2label[pred]\n", " if label != \"O\":\n", " start, end = offsets[idx]\n", " results.append(\n", " {\"entity\": label, \"score\": probabilities[idx][pred],\n", " \"word\": tokens[idx], \"start\": start, \"end\": end}\n", " )\n", "\n", "print(results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "label_map = model.config.id2label\n", "results = []\n", "idx = 0\n", "while idx < len(predictions):\n", " pred = predictions[idx]\n", " label = label_map[pred]\n", " if label != \"O\":\n", " # Remove the B- or I-\n", " label = label[2:]\n", " start, _ = offsets[idx]\n", "\n", " # Grab all the tokens labeled with I-label\n", " all_scores = []\n", " while idx < len(predictions) and label_map[predictions[idx]] == f\"I-{label}\":\n", " all_scores.append(probabilities[idx][pred])\n", " _, end = offsets[idx]\n", " idx += 1\n", "\n", " # The score is the mean of all the scores of the token in that grouped entity.\n", " score = np.mean(all_scores).item()\n", " word = example[start:end]\n", " results.append(\n", " {\"entity_group\": label, \"score\": score,\n", " \"word\": word, \"start\": start, \"end\": end}\n", " )\n", " idx += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "name": "Inside the Token classification pipeline (PyTorch)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }