benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
def print_and_save_result(args: argparse.Namespace, benchmark_duration, total_requests, model, request_latencies, ttfts, errors):
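  """Prints a per-model summary of benchmark results and optionally saves them.

  Reports error counts, request and token throughput, and latency
  distributions; TTFT stats, cost per 1k output tokens, scraped server
  metrics, and JSON output are added when the corresponding flags are set.
  """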
  benchmark_result = {}
  print(f"====Result for Model: {model}====")
  print(f"Errors: {errors}")
  print(f"Total time: {benchmark_duration:.2f} s")
  print(f"Successful/total requests: {len(request_latencies)}/{total_requests}")
  print(f"Requests/min: {60 * total_requests / benchmark_duration:.2f}")
  benchmark_result["num_prompts_attempted"] = total_requests
  benchmark_result["num_prompts_succeeded"] = len(request_latencies)
  benchmark_result['benchmark_time'] = benchmark_duration
  # Requests per second, based on args.num_prompts.
  benchmark_result['throughput_rps'] = (args.num_prompts / benchmark_duration)
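  # Token accounting: totals and per-minute rates for output, input, and combined tokens.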
  total_output_tokens = np.sum([output_len for _, output_len, _ in request_latencies])
  output_tokens_per_second = total_output_tokens / benchmark_duration
  benchmark_result['throughput'] = output_tokens_per_second
  output_tokens_per_min = 60 * output_tokens_per_second
  print(f"Output_tokens/min: {output_tokens_per_min:.2f}")
  benchmark_result['total_output_token'] = int(total_output_tokens)
  benchmark_result['output_tokens_per_min'] = output_tokens_per_min
  total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in request_latencies])
  input_tokens_per_min = 60 * total_input_tokens / benchmark_duration
  print(f"Input_tokens/min: {input_tokens_per_min:.2f}")
  benchmark_result['total_input_tokens'] = int(total_input_tokens)
  benchmark_result['input_tokens_per_min'] = input_tokens_per_min
  total_tokens = total_input_tokens + total_output_tokens
  tokens_per_min = 60 * total_tokens / benchmark_duration
  print(f"Tokens/min: {tokens_per_min:.2f}")
  benchmark_result['total_tokens'] = int(total_tokens)
  benchmark_result['tokens_per_min'] = tokens_per_min
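  # TTFT stats are only collected when responses were streamed (args.stream_request).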
  ttft_stats = {}
  if args.stream_request:
    ttft_stats = get_stats_for_set("TTFT", "Time to First Token (s)", ttfts)
  if args.machine_cost:
    # machine_cost is assumed to be the hourly machine price, so
    # $/1k tokens = hourly cost * 1000 tokens / (output tokens per hour).
    print(
        "Cost $/1k tokens:"
        f" {args.machine_cost * 1000 / (60 * output_tokens_per_min)}"
    )
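  # Merge latency and length distributions into the result.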
  benchmark_result = {
      **benchmark_result,
      **(get_stats_for_set("per_token_latency", "seconds/token (includes waiting time on server)", [
          latency / (prompt_len + output_len)
          for prompt_len, output_len, latency in request_latencies
      ])),
      **ttft_stats,
      # NOTE: The latencies below include request waiting time on the server
      # side, so they are not comparable with model inference latency at
      # batch size 1.
      **(get_stats_for_set("latency", "milliseconds/request (includes waiting time on server)",
                           [1000 * latency for _, _, latency in request_latencies])),
      **(get_stats_for_set("per_output_token_latency", "milliseconds/output_token (includes waiting time on server)",
                           [1000 * latency / output_len for _, output_len, latency in request_latencies])),
      **(get_stats_for_set("input_len", "input length",
                           [float(prompt_len) for prompt_len, _, _ in request_latencies])),
      **(get_stats_for_set("output_len", "output length",
                           [float(output_len) for _, output_len, _ in request_latencies])),
  }
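  # Optionally scrape server-side metrics for the backend and write everything to JSON.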
  server_metrics = {}
  if args.scrape_server_metrics:
    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.backend)
  if args.save_json_results:
    save_json_results(args, benchmark_result, server_metrics, model, errors)
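
# Illustrative usage (not part of the original file): a minimal sketch of how
# this function might be driven after a benchmark run, assuming
# request_latencies holds (prompt_len, output_len, latency_seconds) tuples,
# ttfts holds per-request time-to-first-token values in seconds, and errors
# maps error names to counts. The Namespace below only sets the attributes
# this function reads for this configuration.
#
#   args = argparse.Namespace(
#       num_prompts=2,
#       stream_request=True,
#       machine_cost=None,
#       scrape_server_metrics=False,
#       save_json_results=False,
#   )
#   print_and_save_result(
#       args,
#       benchmark_duration=12.0,
#       total_requests=2,
#       model="example-model",
#       request_latencies=[(32, 128, 1.9), (40, 256, 3.4)],
#       ttfts=[0.21, 0.35],
#       errors={},
#   )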