{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Putting it all together (TensorFlow)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the packages are installed into this kernel's environment\n",
    "%pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "model_inputs = tokenizer(sequence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "model_inputs = tokenizer(sequence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "model_inputs = tokenizer(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Will pad the sequences up to the maximum sequence length\n",
    "model_inputs = tokenizer(sequences, padding=\"longest\")\n",
    "\n",
    "# Will pad the sequences up to the model max length\n",
    "# (512 for BERT or DistilBERT)\n",
    "model_inputs = tokenizer(sequences, padding=\"max_length\")\n",
    "\n",
    "# Will pad the sequences up to the specified max length\n",
    "model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "# Will truncate the sequences that are longer than the model max length\n",
    "# (512 for BERT or DistilBERT)\n",
    "model_inputs = tokenizer(sequences, truncation=True)\n",
    "\n",
    "# Will truncate the sequences that are longer than the specified max length\n",
    "model_inputs = tokenizer(sequences, max_length=8, truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "# Returns PyTorch tensors\n",
    "# NOTE: this line requires the `torch` package, which is not installed by the\n",
    "# pip cell above — skip it (or install torch) when running a TF-only environment\n",
    "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n",
    "\n",
    "# Returns TensorFlow tensors\n",
    "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n",
    "\n",
    "# Returns NumPy arrays\n",
    "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n",
       "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "model_inputs = tokenizer(sequence)\n",
    "print(model_inputs[\"input_ids\"])\n",
    "\n",
    "tokens = tokenizer.tokenize(sequence)\n",
    "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "print(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"[CLS] i've been waiting for a huggingface course my whole life. [SEP]\"\n",
       "\"i've been waiting for a huggingface course my whole life.\""
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n",
    "print(tokenizer.decode(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")\n",
    "output = model(**tokens)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Putting it all together (TensorFlow)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}