{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Handling multiple sequences (PyTorch)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", "\n", "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "\n", "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", "\n", "tokens = tokenizer.tokenize(sequence)\n", "ids = tokenizer.convert_tokens_to_ids(tokens)\n", "input_ids = torch.tensor(ids)\n", "# This line will fail.\n", "model(input_ids)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172,\n", " 2607, 2026, 2878, 2166, 1012, 102]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_inputs = tokenizer(sequence, return_tensors=\"pt\")\n", "print(tokenized_inputs[\"input_ids\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Input IDs: [[ 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]\n", "Logits: [[-2.7276, 2.8789]]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", "\n", "checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "\n", "sequence = \"I've been waiting for a HuggingFace course my whole life.\"\n", "\n", "tokens = tokenizer.tokenize(sequence)\n", "ids = tokenizer.convert_tokens_to_ids(tokens)\n", "\n", "input_ids = torch.tensor([ids])\n", "print(\"Input IDs:\", input_ids)\n", "\n", "output = model(input_ids)\n", "print(\"Logits:\", output.logits)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "batched_ids = [\n", " [200, 200, 200],\n", " [200, 200]\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "padding_id = 100\n", "\n", "batched_ids = [\n", " [200, 200, 200],\n", " [200, 200, padding_id],\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 1.5694, -1.3895]], grad_fn=)\n", "tensor([[ 0.5803, -0.4125]], grad_fn=)\n", "tensor([[ 1.5694, -1.3895],\n", " [ 1.3373, -1.2163]], grad_fn=)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "\n", "sequence1_ids = [[200, 200, 200]]\n", "sequence2_ids = [[200, 200]]\n", "batched_ids = [\n", " [200, 200, 200],\n", " [200, 200, tokenizer.pad_token_id],\n", "]\n", "\n", "print(model(torch.tensor(sequence1_ids)).logits)\n", "print(model(torch.tensor(sequence2_ids)).logits)\n", "print(model(torch.tensor(batched_ids)).logits)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 1.5694, -1.3895],\n", " [ 0.5803, -0.4125]], grad_fn=)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batched_ids = [\n", " [200, 200, 200],\n", " [200, 200, tokenizer.pad_token_id],\n", "]\n", "\n", "attention_mask = [\n", " [1, 1, 1],\n", " [1, 1, 0],\n", "]\n", "\n", "outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))\n", "print(outputs.logits)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sequence = sequence[:max_sequence_length]" ] } ], "metadata": { "colab": { "name": "Handling multiple sequences (PyTorch)", "provenance": [] } }, "nbformat": 4, "nbformat_minor": 4 }