{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Handling multiple sequences (TensorFlow)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "InvalidArgumentError: Input to reshape is a tensor with 14 values, but the requested shape has 196 [Op:Reshape]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "\n",
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "tokens = tokenizer.tokenize(sequence)\n",
    "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "input_ids = tf.constant(ids)\n",
    "# This line will fail.\n",
    "model(input_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tf.Tensor: shape=(1, 16), dtype=int32, numpy=\n",
       "array([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,\n",
       "        12172,  2607,  2026,  2878,  2166,  1012,   102]], dtype=int32)>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_inputs = tokenizer(sequence, return_tensors=\"tf\")\n",
    "print(tokenized_inputs[\"input_ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Input IDs: tf.Tensor(\n",
       "[[ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878\n",
       "   2166  1012]], shape=(1, 14), dtype=int32)\n",
       "Logits: tf.Tensor([[-2.7276208  2.8789377]], shape=(1, 2), dtype=float32)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n",
    "\n",
    "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "\n",
    "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n",
    "\n",
    "tokens = tokenizer.tokenize(sequence)\n",
    "ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "\n",
    "input_ids = tf.constant([ids])\n",
    "print(\"Input IDs:\", input_ids)\n",
    "\n",
    "output = model(input_ids)\n",
    "print(\"Logits:\", output.logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "padding_id = 100\n",
    "\n",
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200, padding_id],\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tf.Tensor([[ 1.5693678 -1.3894581]], shape=(1, 2), dtype=float32)\n",
       "tf.Tensor([[ 0.5803005  -0.41252428]], shape=(1, 2), dtype=float32)\n",
       "tf.Tensor(\n",
       "[[ 1.5693681 -1.3894582]\n",
       " [ 1.3373486 -1.2163193]], shape=(2, 2), dtype=float32)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
    "\n",
    "sequence1_ids = [[200, 200, 200]]\n",
    "sequence2_ids = [[200, 200]]\n",
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200, tokenizer.pad_token_id],\n",
    "]\n",
    "\n",
    "print(model(tf.constant(sequence1_ids)).logits)\n",
    "print(model(tf.constant(sequence2_ids)).logits)\n",
    "print(model(tf.constant(batched_ids)).logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tf.Tensor(\n",
       "[[ 1.5693681  -1.3894582 ]\n",
       " [ 0.5803021  -0.41252586]], shape=(2, 2), dtype=float32)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batched_ids = [\n",
    "    [200, 200, 200],\n",
    "    [200, 200, tokenizer.pad_token_id],\n",
    "]\n",
    "\n",
    "attention_mask = [\n",
    "    [1, 1, 1],\n",
    "    [1, 1, 0],\n",
    "]\n",
    "\n",
    "outputs = model(tf.constant(batched_ids), attention_mask=tf.constant(attention_mask))\n",
    "print(outputs.logits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequence = sequence[:max_sequence_length]"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Handling multiple sequences (TensorFlow)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}