scripts/transformers/run_inference.py
from pathlib import Path
from time import perf_counter
from typing import Optional

import numpy as np
import torch
import typer
from datasets import Dataset, load_dataset
from transformers import Pipeline, pipeline

RESULTS_PATH = Path("results")
RESULTS_PATH.mkdir(parents=True, exist_ok=True)

# Run on the first GPU if available, otherwise fall back to CPU
device_id = 0 if torch.cuda.is_available() else -1

def time_pipeline(pipe: Pipeline, dataset: Dataset):
    """Measure per-example latency and total wall-clock time of a pipeline over a dataset."""
    latencies = []
    # Warm up
    for _ in range(10):
        _ = pipe("Warming up the pipeline :)")
    # Timed run
    total_start_time = perf_counter()
    for row in dataset:
        start_time = perf_counter()
        _ = pipe(row["text"])
        latency = perf_counter() - start_time
        latencies.append(latency)
    total_time_ms = (perf_counter() - total_start_time) * 1_000
    # Compute run statistics
    time_avg_ms = 1_000 * np.mean(latencies)
    time_std_ms = 1_000 * np.std(latencies)
    time_p95_ms = 1_000 * np.percentile(latencies, 95)
    print(
        f"P95 latency (ms) - {time_p95_ms:.2f}; Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f};",
        f"Total time (ms) - {total_time_ms:.2f}",
    )

def main(
    model_id: str = "distilbert-base-uncased__sst2__train-16-4",
    dataset_id: str = "sst2",
    num_samples: Optional[int] = None,
):
    # Load dataset
    dataset = load_dataset(f"SetFit/{dataset_id}", split="test")
    if num_samples is not None:
        dataset = dataset.shuffle(seed=42).select(range(num_samples))
    # Load pipeline
    pipe = pipeline("text-classification", model=f"SetFit/{model_id}", device=device_id)
    # Time it!
    time_pipeline(pipe, dataset)

if __name__ == "__main__":
    typer.run(main)
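
# Example invocation (a sketch, not part of the original script): Typer exposes the
# parameters of main() as kebab-case CLI flags; the sample count of 1000 below is an
# arbitrary illustrative value.
#
#   python scripts/transformers/run_inference.py \
#       --model-id distilbert-base-uncased__sst2__train-16-4 \
#       --dataset-id sst2 \
#       --num-samples 1000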