In [1]:
! pip install tokenizer sentencepiece
! nvidia-smi

Collecting tokenizer
 Downloading tokenizer-3.4.2-py2.py3-none-any.whl (79 kB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
 Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizer, sentencepiece
Successfully installed sentencepiece-0.1.97 tokenizer-3.4.2
You should consider upgrading via the '/usr/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0mSat Oct 29 19:53:58 2022 
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.85.02 Driver Version: 510.85.02 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr.

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import time
import torch._dynamo as torchdynamo
import torch
from kernl.model_optimization import optimize_model

In [3]:
# Generative models produce many graphs; raise TorchDynamo's default
# cache size limit so that all of them can be stored.
torchdynamo.config.cache_size_limit = 512

model_name = "t5-small"
# Load the seq2seq checkpoint, switch it to eval mode and move it to the GPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).eval().cuda()

# Tokenizer matching the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading: 0%| | 0.00/1.20k [00:00<?, ?B/s]

Downloading: 0%| | 0.00/242M [00:00<?, ?B/s]

Downloading: 0%| | 0.00/792k [00:00<?, ?B/s]

Downloading: 0%| | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
# Encode the translation prompt as PyTorch tensors, padded to a multiple of 8.
# NOTE: despite its name, `input_ids` holds the whole tokenizer output; the
# token ids themselves are read later via input_ids["input_ids"].
input_ids = tokenizer(
    "translate English to French: The house in the woods is wonderful, can we buy it ?",
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)
# Move the encoded inputs to the GPU.
input_ids = input_ids.to("cuda")

In [5]:
# Baseline latency (measured before optimize_model is applied):
# 10 warmup generations, then a single timed generation.
generate_kwargs = {
    "inputs": input_ids["input_ids"],
    "min_length": 22,
    "max_length": 22,
}

with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16, cache_enabled=True):
    # Warmup runs; their results are discarded.
    for _ in range(10):
        model.generate(**generate_kwargs)

    # Synchronize around the timed call so the clock brackets the GPU work.
    torch.cuda.synchronize()
    start = time.perf_counter()
    output = model.generate(**generate_kwargs)
    torch.cuda.synchronize()
    latency_baseline = time.perf_counter() - start

    print(latency_baseline)
    print(tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

0.16655227100000047
La maison dans les bois est merveilleuse, pouvons-nous l'acheter? 


In [6]:
# Apply kernl's optimize_model to the encoder first, then to the decoder.
# The return value is not used, as in the original two calls.
for _submodule in (model.encoder, model.decoder):
    optimize_model(_submodule)

In [8]:
# Warmup: time the first generate() call after optimize_model().
# (IRL, encoder and decoder should be warmed each on their own.)
with torch.inference_mode(), torch.autocast(dtype=torch.float16, cache_enabled=True, device_type="cuda"):
    # CUDA work is launched asynchronously: synchronize before starting and
    # before stopping the clock, like the benchmark cells do, so the printed
    # duration covers the GPU work and not only the time spent queuing it.
    torch.cuda.synchronize()
    start = time.perf_counter()
    model.generate(inputs=input_ids["input_ids"], min_length=22, max_length=22)
    torch.cuda.synchronize()
    print(time.perf_counter() - start)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
# Latency after optimize_model: 10 warmup generations, then a single timed
# generation, compared against `latency_baseline` from the baseline cell.
generate_kwargs = {
    "inputs": input_ids["input_ids"],
    "min_length": 22,
    "max_length": 22,
}

with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16, cache_enabled=True):
    # Warmup runs; their results are discarded.
    for _ in range(10):
        model.generate(**generate_kwargs)

    # Synchronize around the timed call so the clock brackets the GPU work.
    torch.cuda.synchronize()
    start = time.perf_counter()
    output = model.generate(**generate_kwargs)
    torch.cuda.synchronize()
    latency_optimized = time.perf_counter() - start

    print(latency_optimized)
    print(f"{latency_baseline/latency_optimized:.1f}x speedup")
    print(tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

0.02287006200003816
7.3x speedup
La maison dans les bois est merveilleuse, pouvons-nous l'acheter? 
