{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook regroups the code sample of the video below, which is a part of the [Hugging Face course](https://huggingface.co/course)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the Transformers and Datasets libraries to run this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install datasets transformers[sentencepiece]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]\n",
"[1045, 5223, 2023, 1012]\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"sentences = [\n",
" \"I've been waiting for a HuggingFace course my whole life.\",\n",
" \"I hate this.\",\n",
"]\n",
"tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n",
"ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n",
"print(ids[0])\n",
"print(ids[1])"
]
},
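{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "The two lists of IDs have different lengths (14 and 4), so they cannot be stacked into a rectangular tensor as they are. The next cell shows the error this raises:"
  ]
},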
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Can't convert non-rectangular Python sequence to Tensor.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m [1045, 5223, 2023, 1012]]\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0minput_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconstant\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.pyenv/versions/3.7.9/envs/base/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py\u001b[0m in \u001b[0;36mconstant\u001b[0;34m(value, dtype, shape, name)\u001b[0m\n\u001b[1;32m 263\u001b[0m \"\"\"\n\u001b[1;32m 264\u001b[0m return _constant_impl(value, dtype, shape, name, verify_shape=False,\n\u001b[0;32m--> 265\u001b[0;31m allow_broadcast=True)\n\u001b[0m\u001b[1;32m 266\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.7.9/envs/base/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py\u001b[0m in \u001b[0;36m_constant_impl\u001b[0;34m(value, dtype, shape, name, verify_shape, allow_broadcast)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTrace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"tf.constant\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_constant_eager_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mctx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_shape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_constant_eager_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mctx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_shape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0mg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_default_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.7.9/envs/base/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py\u001b[0m in \u001b[0;36m_constant_eager_impl\u001b[0;34m(ctx, value, dtype, shape, verify_shape)\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_constant_eager_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mctx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_shape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[0;34m\"\"\"Implementation of eager constant.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 301\u001b[0;31m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconvert_to_eager_tensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mctx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 302\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mshape\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.pyenv/versions/3.7.9/envs/base/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py\u001b[0m in \u001b[0;36mconvert_to_eager_tensor\u001b[0;34m(value, ctx, dtype)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_datatype_enum\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0mctx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mensure_initialized\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 98\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEagerTensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mctx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: Can't convert non-rectangular Python sequence to Tensor."
]
}
],
"source": [
"import tensorflow as tf\n",
"\n",
"ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012]]\n",
"\n",
"input_ids = tf.constant(ids)"
]
},
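{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "Padding the shorter sequence up to the length of the longer one makes the batch rectangular:"
  ]
},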
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"\n",
"ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
"\n",
"input_ids = tf.constant(ids)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"tokenizer.pad_token_id"
]
},
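{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "The padding value of 0 used above is exactly `tokenizer.pad_token_id`. As a minimal sketch reusing the ID lists from earlier, the padded batch can be built programmatically instead of hardcoding the zeros:"
  ]
},
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "# Sketch: pad every sequence to the length of the longest one in the batch\n",
    "raw_ids = [\n",
    "    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
    "    [1045, 5223, 2023, 1012],\n",
    "]\n",
    "\n",
    "max_len = max(len(seq) for seq in raw_ids)\n",
    "padded_ids = [seq + [tokenizer.pad_token_id] * (max_len - len(seq)) for seq in raw_ids]\n",
    "input_ids = tf.constant(padded_ids)"
  ]
},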
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.\n",
"\n",
"All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor([[-2.7276204 2.8789372]], shape=(1, 2), dtype=float32)\n",
"tf.Tensor([[ 3.9497483 -3.1357408]], shape=(1, 2), dtype=float32)\n",
"tf.Tensor(\n",
"[[-2.7276206 2.878937 ]\n",
" [ 1.5444432 -1.3998369]], shape=(2, 2), dtype=float32)\n"
]
}
],
"source": [
"from transformers import TFAutoModelForSequenceClassification\n",
"\n",
"ids1 = tf.constant(\n",
" [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]\n",
")\n",
"ids2 = tf.constant([[1045, 5223, 2023, 1012]])\n",
"all_ids = tf.constant(\n",
" [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
")\n",
"\n",
"model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
"print(model(ids1).logits)\n",
"print(model(ids2).logits)\n",
"print(model(all_ids).logits)"
]
},
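{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "The logits for \"I hate this.\" in the padded batch differ from the ones obtained on the sentence alone: the attention layers attend to the padding tokens, which changes the contextual representations. To recover the same results, the model must be told to ignore the padding with an attention mask (1 for real tokens, 0 for padding):"
  ]
},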
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_ids = tf.constant(\n",
" [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],\n",
" [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
")\n",
"attention_mask = tf.constant(\n",
" [[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
" [ 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
")"
]
},
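{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "Rather than writing the mask out by hand, it can be derived from the IDs (a sketch assuming, as above, that the padding value is `tokenizer.pad_token_id`):"
  ]
},
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "# 1 where the token is real, 0 where it is padding\n",
    "attention_mask = tf.cast(tf.math.not_equal(all_ids, tokenizer.pad_token_id), tf.int32)"
  ]
},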
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']\n",
"- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_39']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor([[-2.7276204 2.8789372]], shape=(1, 2), dtype=float32)\n",
"tf.Tensor([[ 3.9497483 -3.1357408]], shape=(1, 2), dtype=float32)\n"
]
}
],
"source": [
"model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
"output1 = model(ids1)\n",
"output2 = model(ids2)\n",
"print(output1.logits)\n",
"print(output2.logits)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[[-2.7276206 2.878937 ]\n",
" [ 3.9497476 -3.1357408]], shape=(2, 2), dtype=float32)\n"
]
}
],
"source": [
"output = model(all_ids, attention_mask=attention_mask)\n",
"print(output.logits)"
]
},
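{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "With the attention mask, the logits for the padded sentence now match the ones computed on it alone. In practice, you rarely build the padding and the mask by hand: the tokenizer does both when called with `padding=True`:"
  ]
},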
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"checkpoint = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"sentences = [\n",
" \"I've been waiting for a HuggingFace course my whole life.\",\n",
" \"I hate this.\",\n",
"]\n",
"print(tokenizer(sentences, padding=True))"
]
},
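{
  "cell_type": "markdown",
  "metadata": {},
  "source": [
    "Finally, a minimal sketch tying everything together (it assumes the `model` loaded above is still in scope): ask the tokenizer for TensorFlow tensors and feed the whole batch, IDs and attention mask, to the model."
  ]
},
{
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
    "batch = tokenizer(sentences, padding=True, return_tensors=\"tf\")\n",
    "outputs = model(**batch)\n",
    "print(outputs.logits)"
  ]
},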
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "Batching inputs together (TensorFlow)",
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 4
}