{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Building a tokenizer, block by block" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"wikitext\", name=\"wikitext-2-raw-v1\", split=\"train\")\n", "\n", "\n", "def get_training_corpus():\n", " for i in range(0, len(dataset), 1000):\n", " yield dataset[i : i + 1000][\"text\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"wikitext-2.txt\", \"w\", encoding=\"utf-8\") as f:\n", " for i in range(len(dataset)):\n", " f.write(dataset[i][\"text\"] + \"\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tokenizers import (\n", " decoders,\n", " models,\n", " normalizers,\n", " pre_tokenizers,\n", " processors,\n", " trainers,\n", " Tokenizer,\n", ")\n", "\n", "tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.normalizer = normalizers.Sequence(\n", " [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "hello how are u?" 
] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(tokenizer.normalizer.normalize_str(\"Héllò hôw are ü?\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Let', (0, 3)), (\"'\", (3, 4)), ('s', (4, 5)), ('test', (6, 10)), ('my', (11, 13)), ('pre', (14, 17)),\n", " ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.pre_tokenizer.pre_tokenize_str(\"Let's test my pre-tokenizer.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(\"Let's\", (0, 5)), ('test', (6, 10)), ('my', (11, 13)), ('pre-tokenizer.', (14, 28))]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pre_tokenizer = pre_tokenizers.WhitespaceSplit()\n", "pre_tokenizer.pre_tokenize_str(\"Let's test my pre-tokenizer.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Let', (0, 3)), (\"'\", (3, 4)), ('s', (4, 5)), ('test', (6, 10)), ('my', (11, 13)), ('pre', (14, 17)),\n", " ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pre_tokenizer = pre_tokenizers.Sequence(\n", " [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]\n", ")\n", "pre_tokenizer.pre_tokenize_str(\"Let's test my pre-tokenizer.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "special_tokens = [\"[UNK]\", \"[PAD]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n", "trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.model = models.WordPiece(unk_token=\"[UNK]\")\n", "tokenizer.train([\"wikitext-2.txt\"], trainer=trainer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['let', \"'\", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = tokenizer.encode(\"Let's test this tokenizer.\")\n", "print(encoding.tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2, 3)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n", "sep_token_id = tokenizer.token_to_id(\"[SEP]\")\n", "print(cls_token_id, sep_token_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.post_processor = processors.TemplateProcessing(\n", " single=f\"[CLS]:0 $A:0 [SEP]:0\",\n", " pair=f\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n", " special_tokens=[(\"[CLS]\", 
cls_token_id), (\"[SEP]\", sep_token_id)],\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'let', \"'\", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = tokenizer.encode(\"Let's test this tokenizer.\")\n", "print(encoding.tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[CLS]', 'let', \"'\", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']\n", "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = tokenizer.encode(\"Let's test this tokenizer...\", \"on a pair of sentences.\")\n", "print(encoding.tokens)\n", "print(encoding.type_ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.decoder = decoders.WordPiece(prefix=\"##\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"let's test this tokenizer... on a pair of sentences.\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(encoding.ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.save(\"tokenizer.json\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "new_tokenizer = Tokenizer.from_file(\"tokenizer.json\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import PreTrainedTokenizerFast\n", "\n", "wrapped_tokenizer = PreTrainedTokenizerFast(\n", " tokenizer_object=tokenizer,\n", " # tokenizer_file=\"tokenizer.json\", # You can load from the tokenizer file, alternatively\n", " unk_token=\"[UNK]\",\n", " pad_token=\"[PAD]\",\n", " cls_token=\"[CLS]\",\n", " sep_token=\"[SEP]\",\n", " mask_token=\"[MASK]\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import BertTokenizerFast\n", "\n", "wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer(models.BPE())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Let', (0, 3)), (\"'s\", (3, 5)), ('Ġtest', (5, 10)), ('Ġpre', (10, 14)), ('-', (14, 15)),\n", " ('tokenization', (15, 27)), ('!', (27, 28))]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.pre_tokenizer.pre_tokenize_str(\"Let's test pre-tokenization!\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=[\"<|endoftext|>\"])\n", "tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.model = models.BPE()\n", 
"tokenizer.train([\"wikitext-2.txt\"], trainer=trainer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['L', 'et', \"'\", 's', 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'izer', '.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = tokenizer.encode(\"Let's test this tokenizer.\")\n", "print(encoding.tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' test'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sentence = \"Let's test this tokenizer.\"\n", "encoding = tokenizer.encode(sentence)\n", "start, end = encoding.offsets[4]\n", "sentence[start:end]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.decoder = decoders.ByteLevel()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"Let's test this tokenizer.\"" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(encoding.ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import PreTrainedTokenizerFast\n", "\n", "wrapped_tokenizer = PreTrainedTokenizerFast(\n", " tokenizer_object=tokenizer,\n", " bos_token=\"<|endoftext|>\",\n", " eos_token=\"<|endoftext|>\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import GPT2TokenizerFast\n", "\n", "wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer(models.Unigram())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tokenizers import Regex\n", "\n", "tokenizer.normalizer = normalizers.Sequence(\n", " [\n", " normalizers.Replace(\"``\", '\"'),\n", " normalizers.Replace(\"''\", '\"'),\n", " normalizers.NFKD(),\n", " normalizers.StripAccents(),\n", " normalizers.Replace(Regex(\" {2,}\"), \" \"),\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(\"▁Let's\", (0, 5)), ('▁test', (5, 10)), ('▁the', (10, 14)), ('▁pre-tokenizer!', (14, 29))]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.pre_tokenizer.pre_tokenize_str(\"Let's test the pre-tokenizer!\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "special_tokens = [\"\", \"\", \"\", \"\", \"\", \"\", \"\"]\n", "trainer = trainers.UnigramTrainer(\n", " vocab_size=25000, special_tokens=special_tokens, unk_token=\"\"\n", ")\n", "tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.model = models.Unigram()\n", "tokenizer.train([\"wikitext-2.txt\"], trainer=trainer)" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['▁Let', \"'\", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = tokenizer.encode(\"Let's test this tokenizer.\")\n", "print(encoding.tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cls_token_id = tokenizer.token_to_id(\"\")\n", "sep_token_id = tokenizer.token_to_id(\"\")\n", "print(cls_token_id, sep_token_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.post_processor = processors.TemplateProcessing(\n", " single=\"$A:0 :0 :2\",\n", " pair=\"$A:0 :0 $B:1 :1 :2\",\n", " special_tokens=[(\"\", sep_token_id), (\"\", cls_token_id)],\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['▁Let', \"'\", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '.', '.', '', '▁', 'on', '▁', 'a', '▁pair', \n", " '▁of', '▁sentence', 's', '!', '', '']\n", "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = tokenizer.encode(\"Let's test this tokenizer...\", \"on a pair of sentences!\")\n", "print(encoding.tokens)\n", "print(encoding.type_ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.decoder = decoders.Metaspace()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import PreTrainedTokenizerFast\n", "\n", "wrapped_tokenizer = PreTrainedTokenizerFast(\n", " tokenizer_object=tokenizer,\n", " bos_token=\"\",\n", " eos_token=\"\",\n", " unk_token=\"\",\n", " pad_token=\"\",\n", " cls_token=\"\",\n", " sep_token=\"\",\n", " mask_token=\"\",\n", " padding_side=\"left\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import XLNetTokenizerFast\n", "\n", "wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)" ] } ], "metadata": { "colab": { "name": "Building a tokenizer, block by block", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }