{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ความสามารถพิเศษของตัวตัดคำแบบเร็ว (fast tokenizers) (PyTorch)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Use the %pip magic (not !pip) so the install targets the running kernel's environment\n", "%pip install datasets evaluate transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n", "encoding = tokenizer(example)\n", "print(type(encoding))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in',\n", " 'Brooklyn', '.', '[SEP]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding.tokens()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "encoding.word_ids()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sylvain" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "start, end = encoding.word_to_chars(3)\n", "example[start:end]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 'word': 'S', 'start': 11, 'end': 12},\n", " {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14},\n", " {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va', 'start': 14, 'end': 16},\n", " {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in', 'start': 16, 'end': 18},\n", " {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35},\n", " {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40},\n", " {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45},\n", " {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "token_classifier = pipeline(\"token-classification\")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18},\n", " {'entity_group': 'ORG', 'score': 0.97960204, 'word': 'Hugging Face', 'start': 33, 'end': 45},\n", " {'entity_group': 'LOC', 'score': 0.99321055, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"from transformers import pipeline\n", "\n", "token_classifier = pipeline(\"token-classification\", aggregation_strategy=\"simple\")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForTokenClassification\n", "\n", "model_checkpoint = \"dbmdz/bert-large-cased-finetuned-conll03-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", "model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)\n", "\n", "example = \"My name is Sylvain and I work at Hugging Face in Brooklyn.\"\n", "inputs = tokenizer(example, return_tensors=\"pt\")\n", "outputs = model(**inputs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 19])\n", "torch.Size([1, 19, 9])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(inputs[\"input_ids\"].shape)\n", "print(outputs.logits.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "\n", "probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()\n", "predictions = outputs.logits.argmax(dim=-1)[0].tolist()\n", "print(predictions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: 'O',\n", " 1: 'B-MISC',\n", " 2: 'I-MISC',\n", " 3: 'B-PER',\n", " 4: 'I-PER',\n", " 5: 'B-ORG',\n", " 6: 'I-ORG',\n", " 7: 'B-LOC',\n", " 8: 'I-LOC'}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config.id2label" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 'word': 'S'},\n", " {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl'},\n", " {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va'},\n", " {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in'},\n", " {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu'},\n", " {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging'},\n", " {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face'},\n", " {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn'}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = []\n", "tokens = inputs.tokens()\n", "\n", "for idx, pred in enumerate(predictions):\n", " label = model.config.id2label[pred]\n", " if label != \"O\":\n", " results.append(\n", " {\"entity\": label, \"score\": probabilities[idx][pred], \"word\": tokens[idx]}\n", " )\n", "\n", "print(results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 0), (0, 2), (3, 7), (8, 10), (11, 12), (12, 14), (14, 16), (16, 18), (19, 22), (23, 24), (25, 29), (30, 32),\n", " (33, 35), (35, 40), (41, 45), (46, 48), (49, 57), (57, 58), (0, 0)]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", "inputs_with_offsets[\"offset_mapping\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "yl" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example[12:14]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 
'word': 'S', 'start': 11, 'end': 12},\n", " {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14},\n", " {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va', 'start': 14, 'end': 16},\n", " {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in', 'start': 16, 'end': 18},\n", " {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35},\n", " {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40},\n", " {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45},\n", " {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = []\n", "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", "tokens = inputs_with_offsets.tokens()\n", "offsets = inputs_with_offsets[\"offset_mapping\"]\n", "\n", "for idx, pred in enumerate(predictions):\n", " label = model.config.id2label[pred]\n", " if label != \"O\":\n", " start, end = offsets[idx]\n", " results.append(\n", " {\n", " \"entity\": label,\n", " \"score\": probabilities[idx][pred],\n", " \"word\": tokens[idx],\n", " \"start\": start,\n", " \"end\": end,\n", " }\n", " )\n", "\n", "print(results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Hugging Face" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example[33:45]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18},\n", " {'entity_group': 'ORG', 'score': 0.97960204, 'word': 'Hugging Face', 'start': 33, 'end': 45},\n", " {'entity_group': 'LOC', 'score': 0.99321055, 'word': 'Brooklyn', 
'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "results = []\n", "inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)\n", "tokens = inputs_with_offsets.tokens()\n", "offsets = inputs_with_offsets[\"offset_mapping\"]\n", "\n", "idx = 0\n", "while idx < len(predictions):\n", "    pred = predictions[idx]\n", "    label = model.config.id2label[pred]\n", "    if label != \"O\":\n", "        # Remove the B- or I- prefix to get the bare entity type\n", "        label = label[2:]\n", "        start, _ = offsets[idx]\n", "\n", "        # Always consume the current token (so a group never ends up empty,\n", "        # even if it starts with a B- tag), then grab all the following\n", "        # tokens labeled with I-label. Each token's score uses its OWN\n", "        # predicted class, not the first token's.\n", "        all_scores = [probabilities[idx][predictions[idx]]]\n", "        _, end = offsets[idx]\n", "        idx += 1\n", "        while (\n", "            idx < len(predictions)\n", "            and model.config.id2label[predictions[idx]] == f\"I-{label}\"\n", "        ):\n", "            all_scores.append(probabilities[idx][predictions[idx]])\n", "            _, end = offsets[idx]\n", "            idx += 1\n", "\n", "        # The score is the mean of all the scores of the tokens in that grouped entity\n", "        score = np.mean(all_scores).item()\n", "        word = example[start:end]\n", "        results.append(\n", "            {\n", "                \"entity_group\": label,\n", "                \"score\": score,\n", "                \"word\": word,\n", "                \"start\": start,\n", "                \"end\": end,\n", "            }\n", "        )\n", "    else:\n", "        # Only advance past non-entity tokens here: the branch above already\n", "        # moved idx past the grouped entity, and the token that ended the\n", "        # group may itself begin a new entity, so it must be re-examined\n", "        # instead of being skipped by an unconditional increment.\n", "        idx += 1\n", "\n", "print(results)" ] } ], "metadata": { "colab": { "name": "ความสามารถพิเศษของตัวตัดคำแบบเร็ว (fast tokenizers) (PyTorch)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }