{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Byte-Pair Encoding tokenization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus = [\n", " \"This is the Hugging Face course.\",\n", " \"This chapter is about tokenization.\",\n", " \"This section shows several tokenizer algorithms.\",\n", " \"Hopefully, you will be able to understand how they are trained and generate tokens.\",\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "defaultdict(int, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1,\n", " 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1,\n", " 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1,\n", " 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import defaultdict\n", "\n", "word_freqs = defaultdict(int)\n", "\n", "for text in corpus:\n", " words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n", " new_words = [word for word, offset in words_with_offsets]\n", " for word in new_words:\n", " word_freqs[word] += 1\n", "\n", "print(word_freqs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[ ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's',\n", " 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alphabet = []\n", "\n", "for word in word_freqs.keys():\n", " for letter in word:\n", " if letter not in alphabet:\n", " alphabet.append(letter)\n", "alphabet.sort()\n", "\n", "print(alphabet)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab = [\"<|endoftext|>\"] + alphabet.copy()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "splits = {word: [c for c in word] for word in word_freqs.keys()}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def compute_pair_freqs(splits):\n", " pair_freqs = defaultdict(int)\n", " for word, freq in word_freqs.items():\n", " split = splits[word]\n", " if len(split) == 1:\n", " continue\n", " for i in range(len(split) - 1):\n", " pair = (split[i], split[i + 1])\n", " pair_freqs[pair] += freq\n", " return pair_freqs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('T', 'h'): 3\n", "('h', 'i'): 3\n", "('i', 's'): 5\n", "('Ġ', 'i'): 2\n", "('Ġ', 't'): 7\n", "('t', 'h'): 3" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"pair_freqs = compute_pair_freqs(splits)\n", "\n", "for i, key in enumerate(pair_freqs.keys()):\n", " print(f\"{key}: {pair_freqs[key]}\")\n", " if i >= 5:\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('Ġ', 't') 7" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "best_pair = \"\"\n", "max_freq = None\n", "\n", "for pair, freq in pair_freqs.items():\n", " if max_freq is None or max_freq < freq:\n", " best_pair = pair\n", " max_freq = freq\n", "\n", "print(best_pair, max_freq)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "merges = {(\"Ġ\", \"t\"): \"Ġt\"}\n", "vocab.append(\"Ġt\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def merge_pair(a, b, splits):\n", " for word in word_freqs:\n", " split = splits[word]\n", " if len(split) == 1:\n", " continue\n", "\n", " i = 0\n", " while i < len(split) - 1:\n", " if split[i] == a and split[i + 1] == b:\n", " split = split[:i] + [a + b] + split[i + 2 :]\n", " else:\n", " i += 1\n", " splits[word] = split\n", " return splits" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "splits = merge_pair(\"Ġ\", \"t\", splits)\n", "print(splits[\"Ġtrained\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab_size = 50\n", "\n", "while len(vocab) < vocab_size:\n", " pair_freqs = compute_pair_freqs(splits)\n", " best_pair = \"\"\n", " max_freq = None\n", " for pair, freq in pair_freqs.items():\n", " if max_freq is None or max_freq < freq:\n", " best_pair = pair\n", " max_freq = freq\n", " splits = merge_pair(*best_pair, splits)\n", " merges[best_pair] = best_pair[0] + best_pair[1]\n", " vocab.append(best_pair[0] + best_pair[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('Ġ', 't'): 'Ġt', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en',\n", " ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('Ġto', 'k'): 'Ġtok',\n", " ('Ġtok', 'en'): 'Ġtoken', ('n', 'd'): 'nd', ('Ġ', 'is'): 'Ġis', ('Ġt', 'h'): 'Ġth', ('Ġth', 'e'): 'Ġthe',\n", " ('i', 'n'): 'in', ('Ġa', 'b'): 'Ġab', ('Ġtoken', 'i'): 'Ġtokeni'}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(merges)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o',\n", " 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se',\n", " 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġab', 'Ġtokeni']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(vocab)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize(text):\n", " pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)\n", " pre_tokenized_text = [word for word, offset in pre_tokenize_result]\n", " splits = [[l for l in word] for word in 
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "merges = {(\"Ġ\", \"t\"): \"Ġt\"}\n",
  "vocab.append(\"Ġt\")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def merge_pair(a, b, splits):\n",
  "    for word in word_freqs:\n",
  "        split = splits[word]\n",
  "        if len(split) == 1:\n",
  "            continue\n",
  "\n",
  "        i = 0\n",
  "        while i < len(split) - 1:\n",
  "            # Replace every occurrence of the pair (a, b) with the merged symbol\n",
  "            if split[i] == a and split[i + 1] == b:\n",
  "                split = split[:i] + [a + b] + split[i + 2 :]\n",
  "            else:\n",
  "                i += 1\n",
  "        splits[word] = split\n",
  "    return splits"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "splits = merge_pair(\"Ġ\", \"t\", splits)\n",
  "print(splits[\"Ġtrained\"])"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "vocab_size = 50\n",
  "\n",
  "# Keep merging the most frequent pair until we reach the target vocabulary size\n",
  "while len(vocab) < vocab_size:\n",
  "    pair_freqs = compute_pair_freqs(splits)\n",
  "    best_pair = \"\"\n",
  "    max_freq = None\n",
  "    for pair, freq in pair_freqs.items():\n",
  "        if max_freq is None or max_freq < freq:\n",
  "            best_pair = pair\n",
  "            max_freq = freq\n",
  "    splits = merge_pair(*best_pair, splits)\n",
  "    merges[best_pair] = best_pair[0] + best_pair[1]\n",
  "    vocab.append(best_pair[0] + best_pair[1])"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [
  "{('Ġ', 't'): 'Ġt', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en',\n",
  " ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('Ġto', 'k'): 'Ġtok',\n",
  " ('Ġtok', 'en'): 'Ġtoken', ('n', 'd'): 'nd', ('Ġ', 'is'): 'Ġis', ('Ġt', 'h'): 'Ġth', ('Ġth', 'e'): 'Ġthe',\n",
  " ('i', 'n'): 'in', ('Ġa', 'b'): 'Ġab', ('Ġtoken', 'i'): 'Ġtokeni'}"
 ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(merges)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [
  "['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o',\n",
  " 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se',\n",
  " 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġab', 'Ġtokeni']"
 ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(vocab)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def tokenize(text):\n",
  "    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)\n",
  "    pre_tokenized_text = [word for word, offset in pre_tokenize_result]\n",
  "    splits = [[l for l in word] for word in pre_tokenized_text]\n",
  "    # Apply the merge rules in the order they were learned\n",
  "    for pair, merge in merges.items():\n",
  "        for idx, split in enumerate(splits):\n",
  "            i = 0\n",
  "            while i < len(split) - 1:\n",
  "                if split[i] == pair[0] and split[i + 1] == pair[1]:\n",
  "                    split = split[:i] + [merge] + split[i + 2 :]\n",
  "                else:\n",
  "                    i += 1\n",
  "            splits[idx] = split\n",
  "\n",
  "    # Flatten the per-word token lists into a single list\n",
  "    return sum(splits, [])"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenize(\"This is not a token.\")" ] }
 ],
 "metadata": { "colab": { "name": "Byte-Pair Encoding tokenization", "provenance": [] } },
 "nbformat": 4,
 "nbformat_minor": 4
}