{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Normalization and pre-tokenization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<class 'tokenizers.Tokenizer'>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "# Load the tokenizer for the `bert-base-uncased` checkpoint and print the type\n",
    "# of the object it exposes as `backend_tokenizer`.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "print(type(tokenizer.backend_tokenizer))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello how are u?'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run only the normalizer of the backend tokenizer on a string with upper-case\n",
    "# and accented characters; the recorded output is lower-cased with accents removed.\n",
    "print(tokenizer.backend_tokenizer.normalizer.normalize_str(\"Héllò hôw are ü?\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run only the pre-tokenizer; each result is a (piece, (start, end)) pair.\n",
    "# The two consecutive spaces between `are` and `you` are intentional: the\n",
    "# recorded offsets jump from 14 to 16 across them.\n",
    "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are  you?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Hello', (0, 5)), (',', (5, 6)), ('Ġhow', (6, 10)), ('Ġare', (10, 14)), ('Ġ', (14, 15)), ('Ġyou', (15, 19)),\n",
       " ('?', (19, 20))]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebind `tokenizer` to the `gpt2` checkpoint and pre-tokenize the same string\n",
    "# (double space kept); in the recorded output the extra space is its own piece.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
    "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are  you?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('▁Hello,', (0, 6)), ('▁how', (7, 10)), ('▁are', (11, 14)), ('▁you?', (16, 20))]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebind `tokenizer` to the `t5-small` checkpoint and pre-tokenize the same string\n",
    "# (double space kept); the recorded output has no separate piece for the extra space.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"t5-small\")\n",
    "tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(\"Hello, how are  you?\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Normalization and pre-tokenization",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}