{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# トークン分類 (PyTorch)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!pip install accelerate\n", "# To run the training on TPU, you will need to uncomment the following line:\n", "# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "raw_datasets = load_dataset(\"conll2003\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 14041\n", " })\n", " validation: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 3250\n", " })\n", " test: Dataset({\n", " features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],\n", " num_rows: 3453\n", " })\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets[\"train\"][0][\"tokens\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets[\"train\"][0][\"ner_tags\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], names_file=None, id=None), length=-1, id=None)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n", "ner_feature" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label_names = 
ner_feature.feature.names\n", "label_names" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'EU rejects German call to boycott British lamb .'\n", "'B-ORG O B-MISC O O O B-MISC O O'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words = raw_datasets[\"train\"][0][\"tokens\"]\n", "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "line1 = \"\"\n", "line2 = \"\"\n", "for word, label in zip(words, labels):\n", " full_label = label_names[label]\n", " max_length = max(len(word), len(full_label))\n", " line1 += word + \" \" * (max_length - len(word) + 1)\n", " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", "\n", "print(line1)\n", "print(line2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "model_checkpoint = \"bert-base-cased\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.is_fast" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", "inputs.tokens()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs.word_ids()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def align_labels_with_tokens(labels, word_ids):\n", " new_labels = []\n", " current_word = None\n", " for word_id in word_ids:\n", " if word_id != current_word:\n", " # Start of a new word!\n", " current_word = word_id\n", " label = -100 if word_id is None else labels[word_id]\n", " new_labels.append(label)\n", " elif word_id is None:\n", " # Special token\n", " new_labels.append(-100)\n", " else:\n", " # Same word as previous token\n", " label = labels[word_id]\n", " # If the label is B-XXX we change it to I-XXX\n", " if label % 2 == 1:\n", " label += 1\n", " new_labels.append(label)\n", "\n", " return new_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3, 0, 7, 0, 0, 0, 7, 0, 0]\n", "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "word_ids = inputs.word_ids()\n", "print(labels)\n", "print(align_labels_with_tokens(labels, word_ids))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize_and_align_labels(examples):\n", " tokenized_inputs = tokenizer(\n", " examples[\"tokens\"], truncation=True, is_split_into_words=True\n", " )\n", " all_labels = examples[\"ner_tags\"]\n", " new_labels = []\n", " for i, labels in enumerate(all_labels):\n", " word_ids = tokenized_inputs.word_ids(i)\n", " 
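# word_ids(i) maps each token of example i back to its word (None for special tokens)\n", " 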
new_labels.append(align_labels_with_tokens(labels, word_ids))\n", "\n", " tokenized_inputs[\"labels\"] = new_labels\n", " return tokenized_inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_datasets = raw_datasets.map(\n", " tokenize_and_align_labels,\n", " batched=True,\n", " remove_columns=raw_datasets[\"train\"].column_names,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorForTokenClassification\n", "\n", "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100],\n", " [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n", "batch[\"labels\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n", "[-100, 1, 2, -100]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for i in range(2):\n", " print(tokenized_datasets[\"train\"][i][\"labels\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install seqeval" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import evaluate\n", "\n", "metric = evaluate.load(\"seqeval\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", "labels = [label_names[i] for i in labels]\n", "labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'MISC': {'precision': 1.0, 'recall': 0.5, 'f1': 0.67, 'number': 2},\n", " 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},\n", " 'overall_precision': 1.0,\n", " 'overall_recall': 0.67,\n", " 'overall_f1': 0.8,\n", " 'overall_accuracy': 0.89}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions = labels.copy()\n", "predictions[2] = \"O\"\n", "metric.compute(predictions=[predictions], references=[labels])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "\n", "def compute_metrics(eval_preds):\n", " logits, labels = eval_preds\n", " predictions = np.argmax(logits, axis=-1)\n", "\n", " # Remove ignored index (special tokens) and convert to labels\n", " true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n", " true_predictions = [\n", " [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n", " for prediction, label in zip(predictions, labels)\n", " ]\n", " all_metrics = metric.compute(predictions=true_predictions, references=true_labels)\n", " return {\n", " \"precision\": all_metrics[\"overall_precision\"],\n", " \"recall\": all_metrics[\"overall_recall\"],\n", " \"f1\": all_metrics[\"overall_f1\"],\n", " \"accuracy\": 
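\n", " # overall_accuracy is token-level; the precision/recall/f1 above are entity-level\n", " 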
all_metrics[\"overall_accuracy\"],\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "id2label = {i: label for i, label in enumerate(label_names)}\n", "label2id = {v: k for k, v in id2label.items()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForTokenClassification\n", "\n", "model = AutoModelForTokenClassification.from_pretrained(\n", " model_checkpoint,\n", " id2label=id2label,\n", " label2id=label2id,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.config.num_labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import TrainingArguments\n", "\n", "args = TrainingArguments(\n", " \"bert-finetuned-ner\",\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " learning_rate=2e-5,\n", " num_train_epochs=3,\n", " weight_decay=0.01,\n", " push_to_hub=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import Trainer\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"validation\"],\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics,\n", " tokenizer=tokenizer,\n", ")\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://huggingface.co/sgugger/bert-finetuned-ner/commit/26ab21e5b1568f9afeccdaed2d8715f571d786ed'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.push_to_hub(commit_message=\"Training complete\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "\n", "train_dataloader = DataLoader(\n", " tokenized_datasets[\"train\"],\n", " shuffle=True,\n", " collate_fn=data_collator,\n", " batch_size=8,\n", ")\n", "eval_dataloader = DataLoader(\n", " tokenized_datasets[\"validation\"], collate_fn=data_collator, batch_size=8\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = AutoModelForTokenClassification.from_pretrained(\n", " model_checkpoint,\n", " id2label=id2label,\n", " label2id=label2id,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from torch.optim import AdamW\n", "\n", "optimizer = AdamW(model.parameters(), lr=2e-5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from accelerate import Accelerator\n", "\n", "accelerator = Accelerator()\n", "model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(\n", " model, optimizer, train_dataloader, eval_dataloader\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import get_scheduler\n", "\n", "num_train_epochs = 3\n", "num_update_steps_per_epoch = len(train_dataloader)\n", "num_training_steps = 
num_train_epochs * num_update_steps_per_epoch\n", "\n", "lr_scheduler = get_scheduler(\n", " \"linear\",\n", " optimizer=optimizer,\n", " num_warmup_steps=0,\n", " num_training_steps=num_training_steps,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'sgugger/bert-finetuned-ner-accelerate'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from huggingface_hub import Repository, get_full_repo_name\n", "\n", "model_name = \"bert-finetuned-ner-accelerate\"\n", "repo_name = get_full_repo_name(model_name)\n", "repo_name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "output_dir = \"bert-finetuned-ner-accelerate\"\n", "repo = Repository(output_dir, clone_from=repo_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def postprocess(predictions, labels):\n", " predictions = predictions.detach().cpu().clone().numpy()\n", " labels = labels.detach().cpu().clone().numpy()\n", "\n", " # Remove ignored index (special tokens) and convert to labels\n", " true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n", " true_predictions = [\n", " [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n", " for prediction, label in zip(predictions, labels)\n", " ]\n", " return true_labels, true_predictions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tqdm.auto import tqdm\n", "import torch\n", "\n", "progress_bar = tqdm(range(num_training_steps))\n", "\n", "for epoch in range(num_train_epochs):\n", " # Training\n", " model.train()\n", " for batch in train_dataloader:\n", " outputs = model(**batch)\n", " loss = outputs.loss\n", " accelerator.backward(loss)\n", "\n", " optimizer.step()\n", " lr_scheduler.step()\n", " optimizer.zero_grad()\n", " progress_bar.update(1)\n", "\n", " # Evaluation\n", " model.eval()\n", " for batch in eval_dataloader:\n", " with torch.no_grad():\n", " outputs = model(**batch)\n", "\n", " predictions = outputs.logits.argmax(dim=-1)\n", " labels = batch[\"labels\"]\n", "\n", " # Necessary to pad predictions and labels for being gathered\n", " predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)\n", " labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)\n", "\n", " predictions_gathered = accelerator.gather(predictions)\n", " labels_gathered = accelerator.gather(labels)\n", "\n", " true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)\n", " metric.add_batch(predictions=true_predictions, references=true_labels)\n", "\n", " results = metric.compute()\n", " print(\n", " f\"epoch {epoch}:\",\n", " {\n", " key: results[f\"overall_{key}\"]\n", " for key in [\"precision\", \"recall\", \"f1\", \"accuracy\"]\n", " },\n", " )\n", "\n", " # Save and upload\n", " accelerator.wait_for_everyone()\n", " unwrapped_model = accelerator.unwrap_model(model)\n", " unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)\n", " if accelerator.is_main_process:\n", " tokenizer.save_pretrained(output_dir)\n", " repo.push_to_hub(\n", " commit_message=f\"Training in progress epoch {epoch}\", blocking=False\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accelerator.wait_for_everyone()\n", "unwrapped_model = accelerator.unwrap_model(model)\n", 
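"# save_pretrained writes the config and weights; passing accelerator.save as the\n", "# save_function keeps the write safe if training used several processes.\n", "unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As a quick check, we can reload the checkpoint we just saved and confirm the label mapping was stored with it. A minimal sketch (it assumes the training cells above have run, so `output_dir` exists on disk):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Reload the checkpoint that was just written to output_dir.\n", "reloaded_model = AutoModelForTokenClassification.from_pretrained(output_dir)\n", "# The id2label mapping we set earlier is stored in the config.\n", "print(reloaded_model.config.id2label)" ] }, 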
"unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER', 'score': 0.9988506, 'word': 'Sylvain', 'start': 11, 'end': 18},\n", " {'entity_group': 'ORG', 'score': 0.9647625, 'word': 'Hugging Face', 'start': 33, 'end': 45},\n", " {'entity_group': 'LOC', 'score': 0.9986118, 'word': 'Brooklyn', 'start': 49, 'end': 57}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "# Replace this with your own checkpoint\n", "model_checkpoint = \"huggingface-course/bert-finetuned-ner\"\n", "token_classifier = pipeline(\n", " \"token-classification\", model=model_checkpoint, aggregation_strategy=\"simple\"\n", ")\n", "token_classifier(\"My name is Sylvain and I work at Hugging Face in Brooklyn.\")" ] } ], "metadata": { "colab": { "name": "トークン分類 (PyTorch)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }