In [1]:
! pip install tokenizer datasets sentencepiece protobuf==3.20.0
! nvidia-smi

Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m451.7/451.7 KB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.11.0-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.5/139.5 KB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.0/212.0 KB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyarrow>=6.0.0
  Downloading pyarrow-10.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.9 MB)
[2K     [90m━━━━

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import time
import torch



In [3]:
# Hub identifier of the checkpoint shared by both model copies and the tokenizer.
model_name = "BaptisteDoyen/camembert-base-xnli"

# Baseline copy: loaded, switched to eval mode and moved to the GPU.
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval().cuda()

# Second, independent copy of the same checkpoint; this is the one handed to
# `optimize_model` in the warm-up cell, so the baseline above stays untouched.
model_opt = AutoModelForSequenceClassification.from_pretrained(model_name).eval().cuda()

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# French configuration of XNLI; the benchmark iterates over its "test" split.
dataset = load_dataset(path="xnli", name="fr")

Downloading:   0%|          | 0.00/882 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/299 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/36.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

Downloading and preparing dataset xnli/fr to /root/.cache/huggingface/datasets/xnli/fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/466M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.9M [00:00<?, ?B/s]

  

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Dataset xnli downloaded and prepared to /root/.cache/huggingface/datasets/xnli/fr/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Below we run a warm-up pass, which builds the Triton kernels optimized for each input shape.

In [4]:
from kernl.model_optimization import optimize_model

# Apply the kernl optimizations to `model_opt`. The return value is not used,
# so the model object is presumably patched in place; `model` is left as the
# baseline.
optimize_model(model_opt)
start = time.perf_counter()
# Warm-up shapes: batch size 1, sequence lengths 8, 16, ..., 128. These are the
# multiples of 8 that the benchmark cell pads its inputs to.
shapes = [(1, w) for w in range(8, 128 + 8, 8)]
with torch.inference_mode(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=True):
    for s in shapes:
        # Dummy batch of ones: only the shape matters for the warm-up.
        inputs = {
            "input_ids": torch.ones(s, device="cuda", dtype=torch.long),
            "attention_mask": torch.ones(s, device="cuda", dtype=torch.long),
        }
        # Run both models once per shape so neither pays first-call costs
        # during the timed benchmark.
        _ = model_opt(**inputs)
        _ = model(**inputs)

# CUDA work is queued asynchronously; wait for it to finish before reading the
# clock so the reported warm-up time includes all launched kernels.
torch.cuda.synchronize()
print(f"{time.perf_counter() - start:.0f}s")

370s


In [5]:
def _timed_forward(net, batch):
    """Run ``net(**batch)`` and return ``(output, elapsed_seconds)``.

    The device is synchronized immediately before the clock starts and again
    after the forward pass, so the measured interval covers exactly this call
    and both models are timed the same way.
    """
    torch.cuda.synchronize()
    begin = time.perf_counter()
    output = net(**batch)
    torch.cuda.synchronize()
    return output, time.perf_counter() - begin


# Accumulators: total forward time (seconds) and number of correct predictions
# for each model, plus the number of examples where the two models disagree.
complete_time_baseline = 0
score_baseline = 0
complete_time_optimized = 0
score_optimize = 0
nb_examples = len(dataset["test"])
nb_disagree = 0

with torch.inference_mode(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=True):
    for content in dataset["test"]:
        # Read the fields by name instead of relying on the column order.
        premise = content["premise"]
        hypothesis = content["hypothesis"]
        label = content["label"]

        # One example per batch, padded to a multiple of 8 tokens.
        inputs = tokenizer(premise, hypothesis, return_tensors="pt", pad_to_multiple_of=8, padding=True)
        inputs = dict(inputs.to("cuda"))

        output_original, elapsed = _timed_forward(model, inputs)
        complete_time_baseline += elapsed

        choice_baseline = torch.argmax(output_original.logits, dim=1)
        score_baseline += label == choice_baseline.item()

        output_optimized, elapsed = _timed_forward(model_opt, inputs)
        complete_time_optimized += elapsed

        choice_optimize = torch.argmax(output_optimized.logits, dim=1)
        score_optimize += label == choice_optimize.item()

        # The optimized model must stay numerically close to the baseline.
        assert torch.allclose(
            output_original.logits, output_optimized.logits, atol=1e-1
        ), f"logits don't match:\n{output_original}\n{output_optimized}"
        # Compare plain Python ints rather than one-element tensors.
        if choice_baseline.item() != choice_optimize.item():
            nb_disagree += 1

print(f"{complete_time_baseline=:.2f}s")
print(f"{complete_time_optimized=:.2f}s")
print(f"{nb_disagree=}")
print(f"score baseline: {score_baseline / nb_examples:.2f}")
print(f"score optimize: {score_optimize / nb_examples:.2f}")

complete_time_baseline=38.08s
complete_time_optimized=5.25s
nb_disagree=1
score baseline: 0.82
score optimize: 0.82
