{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Tokenizers (TensorFlow)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Jim', 'Henson', 'was', 'a', 'puppeteer']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_text = \"Jim Henson was a puppeteer\".split()\n", "print(tokenized_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import BertTokenizer\n", "\n", "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],\n", " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer(\"Using a Transformer network is simple\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.save_pretrained(\"directory_on_my_computer\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", "\n", "sequence = \"Using a Transformer network is simple\"\n", "tokens = tokenizer.tokenize(sequence)\n", "\n", "print(tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[7993, 170, 11303, 1200, 2443, 1110, 3014]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ids = tokenizer.convert_tokens_to_ids(tokens)\n", "\n", "print(ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Using a Transformer network is simple'" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])\n", "print(decoded_string)" ] } ], "metadata": { "colab": { "name": "Tokenizers (TensorFlow)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }