{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ความสามารถพิเศษของตัวตัดคำแบบเร็ว (fast tokenizers) (PyTorch)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Use the %pip magic (not !pip) so the install targets the running kernel's environment\n", "%pip install datasets evaluate transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n", "encoding = tokenizer(example)\n", "print(type(encoding))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in',\n", " 'Brooklyn', '.', '[SEP]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding.tokens()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "encoding.word_ids()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sylvain" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "start, end = encoding.word_to_chars(3)\n", "example[start:end]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 'word': 'S', 'start': 11, 'end': 12},\n", " {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14},\n", " {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va', 'start': 14, 'end': 16},\n", " {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in', 'start': 16, 'end': 18},\n", " {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35},\n", " {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40},\n", " {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45},\n", " {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "token_classifier = pipeline(\"token-classification\")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18},\n", " {'entity_group': 'ORG', 'score': 0.97960204, 'word': 'Hugging Face', 'start': 33, 'end': 45},\n", " {'entity_group': 'LOC', 'score': 0.99321055, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"from transformers import pipeline\n", "\n", "token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForTokenClassification\n", "\n", "model_checkpoint = \"dbmdz/bert-large-cased-finetuned-conll03-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)\n", "\n", "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n", "inputs = tokenizer(example, return_tensors=\"pt\")\n", "outputs = model(**inputs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 19])\n", "torch.Size([1, 19, 9])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(inputs[\"input_ids\"].shape)\n", "print(outputs.logits.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "\n", "probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()\n", "predictions = outputs.logits.argmax(dim=-1)[0].tolist()\n", "print(predictions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: 'O',\n", " 1: 'B-MISC',\n", " 2: 'I-MISC',\n", " 3: 'B-PER',\n", " 4: 'I-PER',\n", " 5: 'B-ORG',\n", " 6: 'I-ORG',\n", " 7: 'B-LOC',\n", " 8: 'I-LOC'}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config.id2label" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 'word': 'S'},\n", " {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl'},\n", " {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va'},\n", " {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in'},\n", " {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu'},\n", " {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging'},\n", " {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face'},\n", " {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn'}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = []\n", "tokens = inputs.tokens()\n", "\n", "for idx, pred in enumerate(predictions):\n", " label = model.config.id2label[pred]\n", " if label != \"O\":\n", " results.append(\n", " {\"entity\": label, \"score\": probabilities[idx][pred], \"word\": tokens[idx]}\n", " )\n", "\n", "print(results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 0), (0, 2), (3, 7), (8, 10), (11, 12), (12, 14), (14, 16), (16, 18), (19, 22), (23, 24), (25, 29), (30, 32),\n", " (33, 35), (35, 40), (41, 45), (46, 48), (49, 57), (57, 58), (0, 0)]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", "inputs_with_offsets[\"offset_mapping\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "yl" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example[12:14]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 
'word': 'S', 'start': 11, 'end': 12},\n", " {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14},\n", " {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va', 'start': 14, 'end': 16},\n", " {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in', 'start': 16, 'end': 18},\n", " {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35},\n", " {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40},\n", " {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45},\n", " {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = []\n", "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", "tokens = inputs_with_offsets.tokens()\n", "offsets = inputs_with_offsets[\"offset_mapping\"]\n", "\n", "for idx, pred in enumerate(predictions):\n", " label = model.config.id2label[pred]\n", " if label != \"O\":\n", " start, end = offsets[idx]\n", " results.append(\n", " {\n", " \"entity\": label,\n", " \"score\": probabilities[idx][pred],\n", " \"word\": tokens[idx],\n", " \"start\": start,\n", " \"end\": end,\n", " }\n", " )\n", "\n", "print(results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Hugging Face" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example[33:45]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18},\n", " {'entity_group': 'ORG', 'score': 0.97960204, 'word': 'Hugging Face', 'start': 33, 'end': 45},\n", " {'entity_group': 'LOC', 'score': 0.99321055, 'word': 'Brooklyn', 
'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "results = []\n", "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", "tokens = inputs_with_offsets.tokens()\n", "offsets = inputs_with_offsets[\"offset_mapping\"]\n", "\n", "idx = 0\n", "while idx < len(predictions):\n", "    pred = predictions[idx]\n", "    label = model.config.id2label[pred]\n", "    if label != \"O\":\n", "        # Remove the B- or I- prefix to get the bare entity type\n", "        label = label[2:]\n", "        start, _ = offsets[idx]\n", "\n", "        # Always consume the current token (so a group never ends up empty,\n", "        # even if it starts with a B- tag), then grab all the following\n", "        # tokens labeled with I-label. Each token's score uses its OWN\n", "        # predicted class, not the first token's.\n", "        all_scores = [probabilities[idx][predictions[idx]]]\n", "        _, end = offsets[idx]\n", "        idx += 1\n", "        while (\n", "            idx < len(predictions)\n", "            and model.config.id2label[predictions[idx]] == f\"I-{label}\"\n", "        ):\n", "            all_scores.append(probabilities[idx][predictions[idx]])\n", "            _, end = offsets[idx]\n", "            idx += 1\n", "\n", "        # The score is the mean of all the scores of the tokens in that grouped entity\n", "        score = np.mean(all_scores).item()\n", "        word = example[start:end]\n", "        results.append(\n", "            {\n", "                \"entity_group\": label,\n", "                \"score\": score,\n", "                \"word\": word,\n", "                \"start\": start,\n", "                \"end\": end,\n", "            }\n", "        )\n", "    else:\n", "        # Only advance past non-entity tokens here: the branch above already\n", "        # moved idx past the grouped entity, and the token that ended the\n", "        # group may itself begin a new entity, so it must be re-examined\n", "        # instead of being skipped by an unconditional increment.\n", "        idx += 1\n", "\n", "print(results)" ] } ], "metadata": { "colab": { "name": "ความสามารถพิเศษของตัวตัดคำแบบเร็ว (fast tokenizers) (PyTorch)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }