{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Putting it all together (TensorFlow)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use %pip (not !pip) so the packages are installed into this kernel's environment\n",
    "%pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "model_inputs = tokenizer(sequence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "model_inputs = tokenizer(sequence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "model_inputs = tokenizer(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Will pad the sequences up to the maximum sequence length\n",
    "model_inputs = tokenizer(sequences, padding=\"longest\")\n",
    "\n",
    "# Will pad the sequences up to the model max length\n",
    "# (512 for BERT or DistilBERT)\n",
    "model_inputs = tokenizer(sequences, padding=\"max_length\")\n",
    "\n",
    "# Will pad the sequences up to the specified max length\n",
    "model_inputs = tokenizer(sequences, padding=\"max_length\", max_length=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "# Will truncate the sequences that are longer than the model max length\n",
    "# (512 for BERT or DistilBERT)\n",
    "model_inputs = tokenizer(sequences, truncation=True)\n",
    "\n",
    "# Will truncate the sequences that are longer than the specified max length\n",
    "model_inputs = tokenizer(sequences, max_length=8, truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "# Returns PyTorch tensors\n",
    "# NOTE: this line requires the `torch` package, which is not installed by the\n",
    "# pip cell above — skip it (or install torch) when running a TF-only environment\n",
    "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"pt\")\n",
    "\n",
    "# Returns TensorFlow tensors\n",
    "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"tf\")\n",
    "\n",
    "# Returns NumPy arrays\n",
    "model_inputs = tokenizer(sequences, padding=True, return_tensors=\"np\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]\n",
       "[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "model_inputs = tokenizer(sequence)\n",
    "print(model_inputs[\"input_ids\"])\n",
    "\n",
    "tokens = tokenizer.tokenize(sequence)\n",
    "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "print(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"[CLS] i've been waiting for a huggingface course my whole life. [SEP]\"\n",
       "\"i've been waiting for a huggingface course my whole life.\""
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(tokenizer.decode(model_inputs[\"input_ids\"]))\n",
    "print(tokenizer.decode(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "sequences = [\"I've been waiting for a HuggingFace course my whole life.\", \"So have I!\"]\n",
    "\n",
    "tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"tf\")\n",
    "output = model(**tokens)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Putting it all together (TensorFlow)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}