scripts/benchmark_pipelines.py
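"""
Benchmark Hugging Face text-generation pipelines.

Runs either the Optimum-Nvidia (TensorRT-LLM) `pipeline` or, with
`--use-transformers`, the vanilla `transformers` pipeline as a baseline,
and reports either time-to-first-token latency (ms) or end-to-end
throughput (tokens/s), measured with CUDA events.
"""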

from argparse import ArgumentParser, Namespace
from typing import List, Union

import numpy as np
import torch
from huggingface_hub import login
from tqdm import trange
from transformers import pipeline as raw_pipeline

from optimum.nvidia.pipelines import pipeline


def get_transformers_pipeline(args: Namespace):
    if "dtype" in args:
        assert args.dtype in {"float16", "bfloat16", "float32"}

    return raw_pipeline(
        model=args.model,
        torch_dtype=args.dtype,
        model_kwargs={
            "device_map": "balanced",
            "max_memory": {0: "20GiB", "cpu": "64GiB"},
        },
    )


def get_trtllm_pipeline(args: Namespace):
    return pipeline(
        model=args.model,
        use_fp8=args.use_fp8,
        use_cuda_graph=args.use_cuda_graph,
        max_batch_size=args.batch_size,
        max_prompt_length=args.prompt_length,
        max_new_tokens=args.max_new_tokens,
        tp=args.tp,
        pp=args.pp,
        gpus_per_node=args.gpus_per_node,
        world_size=args.world_size,
        dtype=args.dtype,
    )


def create_prompt_for_length(batch: int, length: int) -> Union[str, List[str]]:
    # Build a synthetic prompt of roughly `length` whitespace-separated tokens,
    # repeated `batch` times when batching.
    tokens = ["I"] * length
    tokens = " ".join(tokens)

    if batch == 1:
        return tokens
    return [tokens] * batch


if __name__ == "__main__":
    parser = ArgumentParser("Hugging Face Optimum-Nvidia Pipelines Benchmarking tool")
    parser.add_argument(
        "--token", type=str, help="Hugging Face Hub token to authenticate the request."
    )
    parser.add_argument(
        "--warmup",
        type=int,
        default=10,
        help="Number of warmup runs before collecting metrics.",
    )
    parser.add_argument(
        "--repeat", type=int, default=20, help="Number of runs collecting metrics."
    )
    parser.add_argument(
        "--batch-size", type=int, required=True, help="Size of the batch."
    )
    parser.add_argument(
        "--prompt-length", type=int, required=True, help="Size of the prompt to use."
    )
    parser.add_argument(
        "--output-length",
        type=int,
        help="Size of the desired output (prompt included). "
        "Required unless --time-to-first-token is set.",
    )
    parser.add_argument(
        "--use-transformers",
        action="store_true",
        help="Use the transformers pipeline as baseline.",
    )
    parser.add_argument(
        "--use-cuda-graph", action="store_true", help="Turn on CUDA Graph."
    )
    parser.add_argument(
        "--use-fp8",
        action="store_true",
        help="Attempt to benchmark in float8 precision.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="float16",
        help="Specify the precision for the model.",
    )
    parser.add_argument(
        "--tp", type=int, default=1, help="Degree of tensor parallelism to apply."
    )
    parser.add_argument(
        "--pp", type=int, default=1, help="Degree of pipeline parallelism to apply."
    )
    parser.add_argument(
        "--gpus-per-node", type=int, default=1, help="Number of GPUs per node."
    )
    parser.add_argument(
        "--world-size", type=int, help="Total number of GPUs over all the nodes."
    )
    parser.add_argument(
        "--time-to-first-token",
        action="store_true",
        help="Indicate we will only generate a single token.",
    )
    parser.add_argument("model", type=str, help="Model's id to use for the benchmark.")
    args = parser.parse_args()

    if not args.world_size:
        args.world_size = args.gpus_per_node

    if args.token:
        login(args.token)

    # Check use case: either time a single generated token, or a full sequence.
    if args.time_to_first_token:
        args.max_new_tokens = 1
        args.min_length = args.prompt_length + 1
        args.output_length = args.prompt_length + 1
    else:
        args.min_length = args.output_length
        args.max_new_tokens = args.output_length - args.prompt_length

    prompt = create_prompt_for_length(args.batch_size, args.prompt_length)
    pipe = (
        get_transformers_pipeline(args)
        if args.use_transformers
        else get_trtllm_pipeline(args)
    )

    # Warm up
    for _ in trange(args.warmup, desc="Warming up..."):
        _ = pipe(
            prompt,
            max_new_tokens=args.max_new_tokens,
            min_length=args.min_length,
            use_cache=True,
        )

    # CUDA events give GPU-side timing for each end-to-end generation.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Benchmark
    latencies = []
    for _ in trange(args.repeat, desc="Benchmarking..."):
        start.record()
        _ = pipe(
            prompt,
            max_new_tokens=args.max_new_tokens,
            min_length=args.min_length,
            use_cache=True,
        )
        end.record()

        torch.cuda.synchronize()
        latencies.append(start.elapsed_time(end))

    latencies = np.array(latencies)

    if args.time_to_first_token:
        print(
            "Time-To-First-Token Latency: "
            f"{latencies.mean().astype(np.uint64)} ms "
            f"(+/- {latencies.std().astype(np.uint64)})"
        )
    else:
        num_tokens = args.batch_size * args.output_length
        tokens_per_sec = num_tokens / (latencies / 1e3)
        print(
            "Throughput: "
            f"{tokens_per_sec.mean().astype(np.uint64)} tokens/s "
            f"(+/- {tokens_per_sec.std().astype(np.uint64)})"
        )
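# Illustrative invocation; the model id below is a placeholder, substitute any
# text-generation checkpoint you have access to:
#
#   python scripts/benchmark_pipelines.py \
#       --batch-size 8 \
#       --prompt-length 128 \
#       --output-length 256 \
#       --dtype float16 \
#       meta-llama/Llama-2-7b-chat-hf
#
# Add --time-to-first-token to measure single-token latency instead of throughput,
# or --use-transformers to benchmark the plain transformers pipeline as a baseline.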