benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
def print_and_save_result(args: argparse.Namespace, benchmark_duration, total_requests, model, request_latencies, ttfts, errors):
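  """Prints a per-model summary of benchmark results and optionally saves them.

  Reports error counts, request and token throughput, and latency
  distributions; TTFT stats, cost per 1k output tokens, scraped server
  metrics, and JSON output are added when the corresponding flags are set.
  """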
  benchmark_result = {}
  print(f"====Result for Model: {model}====")
  print(f"Errors: {errors}")
  print(f"Total time: {benchmark_duration:.2f} s")
  print(f"Successful/total requests: {len(request_latencies)}/{total_requests}")
  print(f"Requests/min: {60 * total_requests / benchmark_duration:.2f}")
  benchmark_result["num_prompts_attempted"] = total_requests
  benchmark_result["num_prompts_succeeded"] = len(request_latencies)
  benchmark_result['benchmark_time'] = benchmark_duration
  # Requests per second, based on args.num_prompts.
  benchmark_result['throughput_rps'] = (args.num_prompts / benchmark_duration)
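  # Token accounting: totals and per-minute rates for output, input, and combined tokens.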
  total_output_tokens = np.sum([output_len for _, output_len, _ in request_latencies])
  output_tokens_per_second = total_output_tokens / benchmark_duration
  benchmark_result['throughput'] = output_tokens_per_second
  output_tokens_per_min = 60 * output_tokens_per_second
  print(f"Output_tokens/min: {output_tokens_per_min:.2f}")
  benchmark_result['total_output_token'] = int(total_output_tokens)
  benchmark_result['output_tokens_per_min'] = output_tokens_per_min
  total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in request_latencies])
  input_tokens_per_min = 60 * total_input_tokens / benchmark_duration
  print(f"Input_tokens/min: {input_tokens_per_min:.2f}")
  benchmark_result['total_input_tokens'] = int(total_input_tokens)
  benchmark_result['input_tokens_per_min'] = input_tokens_per_min
  total_tokens = total_input_tokens + total_output_tokens
  tokens_per_min = 60 * total_tokens / benchmark_duration
  print(f"Tokens/min: {tokens_per_min:.2f}")
  benchmark_result['total_tokens'] = int(total_tokens)
  benchmark_result['tokens_per_min'] = tokens_per_min
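  # TTFT stats are only collected when responses were streamed (args.stream_request).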
  ttft_stats = {}
  if args.stream_request:
    ttft_stats = get_stats_for_set("TTFT", "Time to First Token (s)", ttfts)
  if args.machine_cost:
    # machine_cost is assumed to be the hourly machine price, so
    # $/1k tokens = hourly cost * 1000 tokens / (output tokens per hour).
    print(
        "Cost $/1k tokens:"
        f" {args.machine_cost * 1000 / (60 * output_tokens_per_min)}"
    )
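  # Merge latency and length distributions into the result.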
  benchmark_result = {
      **benchmark_result,
      **(get_stats_for_set("per_token_latency", "seconds/token (includes waiting time on server)", [
          latency / (prompt_len + output_len)
          for prompt_len, output_len, latency in request_latencies
      ])),
      **ttft_stats,
      # NOTE: The latencies below include request waiting time on the server
      # side, so they are not comparable with model inference latency at
      # batch size 1.
      **(get_stats_for_set("latency", "milliseconds/request (includes waiting time on server)",
                           [1000 * latency for _, _, latency in request_latencies])),
      **(get_stats_for_set("per_output_token_latency", "milliseconds/output_token (includes waiting time on server)",
                           [1000 * latency / output_len for _, output_len, latency in request_latencies])),
      **(get_stats_for_set("input_len", "input length",
                           [float(prompt_len) for prompt_len, _, _ in request_latencies])),
      **(get_stats_for_set("output_len", "output length",
                           [float(output_len) for _, output_len, _ in request_latencies])),
  }
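  # Optionally scrape server-side metrics for the backend and write everything to JSON.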
  server_metrics = {}
  if args.scrape_server_metrics:
    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.backend)
  if args.save_json_results:
    save_json_results(args, benchmark_result, server_metrics, model, errors)
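
# Illustrative usage (not part of the original file): a minimal sketch of how
# this function might be driven after a benchmark run, assuming
# request_latencies holds (prompt_len, output_len, latency_seconds) tuples,
# ttfts holds per-request time-to-first-token values in seconds, and errors
# maps error names to counts. The Namespace below only sets the attributes
# this function reads for this configuration.
#
#   args = argparse.Namespace(
#       num_prompts=2,
#       stream_request=True,
#       machine_cost=None,
#       scrape_server_metrics=False,
#       save_json_results=False,
#   )
#   print_and_save_result(
#       args,
#       benchmark_duration=12.0,
#       total_requests=2,
#       model="example-model",
#       request_latencies=[(32, 128, 1.9), (40, 256, 3.4)],
#       ttfts=[0.21, 0.35],
#       errors={},
#   )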