{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# WordPiece tokenization" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus = [\n", "    \"This is the Hugging Face Course.\",\n", "    \"This chapter is about tokenization.\",\n", "    \"This section shows several tokenizer algorithms.\",\n", "    \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n", "]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "defaultdict(\n", "    int, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course': 1, '.': 4, 'chapter': 1, 'about': 1,\n", "          'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1,\n", "          ',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1,\n", "          'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import defaultdict\n", "\n", "word_freqs = defaultdict(int)\n", "for text in corpus:\n", "    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n", "    new_words = [word for word, offset in words_with_offsets]\n", "    for word in new_words:\n", "        word_freqs[word] += 1\n", "\n", "word_freqs" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s',\n", " '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u',\n", " 'w', 'y']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alphabet = []\n", "for word in word_freqs.keys():\n", "    if word[0] not in alphabet:\n", "        alphabet.append(word[0])\n", "    for letter in word[1:]:\n", "        if f\"##{letter}\" not in alphabet:\n", "            alphabet.append(f\"##{letter}\")\n", "\n", "alphabet.sort()\n", "alphabet" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab = [\"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"] + alphabet.copy()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "splits = {\n", "    word: [c if i == 0 else f\"##{c}\" for i, c in enumerate(word)]\n", "    for word in word_freqs.keys()\n", "}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def compute_pair_scores(splits):\n", "    letter_freqs = defaultdict(int)\n", "    pair_freqs = defaultdict(int)\n", "    for word, freq in word_freqs.items():\n", "        split = splits[word]\n", "        if len(split) == 1:\n", "            letter_freqs[split[0]] += freq\n", "            continue\n", "        for i in range(len(split) - 1):\n", "            pair = (split[i], split[i + 1])\n", "            letter_freqs[split[i]] += freq\n", "            pair_freqs[pair] += freq\n", "        letter_freqs[split[-1]] += freq\n", "\n", "    # WordPiece score: freq_of_pair / (freq_of_first_element * freq_of_second_element)\n", "    scores = {\n", "        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])\n", "        for pair, freq in pair_freqs.items()\n", "    }\n", "    return scores" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('T', '##h'): 0.125\n", "('##h', '##i'): 0.03409090909090909\n", "('##i', '##s'): 0.02727272727272727\n", "('i', '##s'): 0.1\n", "('t', '##h'): 0.03571428571428571\n", "('##h', '##e'): 0.011904761904761904" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pair_scores = compute_pair_scores(splits)\n", "for i, key in enumerate(pair_scores.keys()):\n", "    print(f\"{key}: {pair_scores[key]}\")\n", "    if i >= 5:\n", "        break" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('a', '##b') 0.2" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "best_pair = \"\"\n", "max_score = None\n", "for pair, score in pair_scores.items():\n", "    if max_score is None or max_score < score:\n", "        best_pair = pair\n", "        max_score = score\n", "\n", "print(best_pair, max_score)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab.append(\"ab\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def merge_pair(a, b, splits):\n", "    for word in word_freqs:\n", "        split = splits[word]\n", "        if len(split) == 1:\n", "            continue\n", "        i = 0\n", "        while i < len(split) - 1:\n", "            if split[i] == a and split[i + 1] == b:\n", "                # Drop the \"##\" prefix of the second element when gluing the pair together\n", "                merge = a + b[2:] if b.startswith(\"##\") else a + b\n", "                split = split[:i] + [merge] + split[i + 2 :]\n", "            else:\n", "                i += 1\n", "        splits[word] = split\n", "    return splits" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ab', '##o', '##u', '##t']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "splits = merge_pair(\"a\", \"##b\", splits)\n", "splits[\"about\"]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab_size = 70\n", "# Keep merging the best-scoring pair until the vocabulary reaches the target size\n", "while len(vocab) < vocab_size:\n", "    scores = compute_pair_scores(splits)\n", "    best_pair, max_score = \"\", None\n", "    for pair, score in scores.items():\n", "        if max_score is None or max_score < score:\n", "            best_pair = pair\n", "            max_score = score\n", "    splits = merge_pair(*best_pair, splits)\n", "    new_token = (\n", "        best_pair[0] + best_pair[1][2:]\n", "        if best_pair[1].startswith(\"##\")\n", "        else best_pair[0] + best_pair[1]\n", "    )\n", "    vocab.append(new_token)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k',\n", " '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H',\n", " 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully',\n", " 'Th', 'ch', '##hm', 'cha', 'chap', 'chapt', '##thm', 'Hu', 'Hug', 'Hugg', 'sh', 'th', 'is', '##thms', '##za', '##zat',\n", " '##ut']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(vocab)" ] },
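{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick, purely illustrative sanity check of the training loop, we can look at the state it left behind: the vocabulary should have reached the requested size, and `splits` holds the fully merged segmentation of every corpus word." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative check: vocabulary size and the final segmentation of two corpus words\n", "print(len(vocab))\n", "print(splits[\"Hugging\"])\n", "print(splits[\"tokenization\"])" ] },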
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def encode_word(word):\n", "    tokens = []\n", "    while len(word) > 0:\n", "        # Find the longest prefix of the remaining word that is in the vocabulary\n", "        i = len(word)\n", "        while i > 0 and word[:i] not in vocab:\n", "            i -= 1\n", "        if i == 0:\n", "            return [\"[UNK]\"]\n", "        tokens.append(word[:i])\n", "        word = word[i:]\n", "        # Whatever is left is a continuation, so it gets the \"##\" prefix\n", "        if len(word) > 0:\n", "            word = f\"##{word}\"\n", "    return tokens" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Hugg', '##i', '##n', '##g']\n", "['[UNK]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(encode_word(\"Hugging\"))\n", "print(encode_word(\"HOgging\"))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize(text):\n", "    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n", "    pre_tokenized_text = [word for word, offset in pre_tokenize_result]\n", "    encoded_words = [encode_word(word) for word in pre_tokenized_text]\n", "    return sum(encoded_words, [])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s',\n", " '##e', '[UNK]']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenize(\"This is the Hugging Face course!\")" ] },
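{ "cell_type": "markdown", "metadata": {}, "source": [ "As a rough cross-check, the same kind of model can be trained with the 🤗 Tokenizers library. This is only a sketch for comparison: the library's `WordPieceTrainer` does not follow exactly the same scoring details as the loop above, so the resulting vocabulary and tokenization may differ slightly, and the name `wp_tokenizer` is introduced here just to avoid overwriting the `tokenizer` loaded earlier." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tokenizers import Tokenizer, models, pre_tokenizers, trainers\n", "\n", "# Sketch: train a 70-token WordPiece model on the same toy corpus with the tokenizers library.\n", "# Its trainer differs in some details from the step-by-step implementation above,\n", "# so the learned vocabulary is not guaranteed to match token for token.\n", "wp_tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))\n", "wp_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()\n", "trainer = trainers.WordPieceTrainer(\n", "    vocab_size=70, special_tokens=[\"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n", ")\n", "wp_tokenizer.train_from_iterator(corpus, trainer=trainer)\n", "\n", "print(wp_tokenizer.encode(\"This is the Hugging Face course!\").tokens)" ] }
], "metadata": { "colab": { "name": "WordPiece tokenization", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }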