in benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py [0:0]
def _distribution_stats(benchmark_result, metric):
  """Collect the mean/median/sd/min/max/p90/p99 aggregates for one metric.

  Args:
    benchmark_result: dict holding keys of the form "avg_<metric>",
      "median_<metric>", "sd_<metric>", "min_<metric>", "max_<metric>",
      "p90_<metric>", "p99_<metric>".
    metric: metric family suffix, e.g. "latency" or "input_len".

  Returns:
    Dict with keys mean/median/sd/min/max/p90/p99 in that order.
  """
  return {
      "mean": benchmark_result[f"avg_{metric}"],
      "median": benchmark_result[f"median_{metric}"],
      "sd": benchmark_result[f"sd_{metric}"],
      "min": benchmark_result[f"min_{metric}"],
      "max": benchmark_result[f"max_{metric}"],
      "p90": benchmark_result[f"p90_{metric}"],
      "p99": benchmark_result[f"p99_{metric}"],
  }

def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics, model, errors):
  """Write the benchmark run's results to a local JSON file, then best-effort
  upload it to the configured GCS bucket.

  Args:
    args: parsed CLI flags; reads start_datetime, request_rate, backend,
      tokenizer, models, additional_metadata_metrics_to_save, file_prefix,
      output_bucket and output_bucket_filepath.
    benchmark_result: dict of numeric results; must contain
      num_prompts_attempted, num_prompts_succeeded, throughput and the
      avg/median/sd/min/max/p90/p99 aggregates for latency, input_len,
      output_len and per_output_token_latency.
    server_metrics: dict of per-metric model-server stats; merged into
      "metrics" and flattened into "summary_stats".
    model: model id being benchmarked (may contain '/', sanitized for the
      file name).
    errors: dict of error counters, merged into the "metrics" section.
  """
  # Setup: protobuf-style timestamp for the benchmark start time.
  start_dt_proto = Timestamp()
  start_dt_proto.FromDatetime(args.start_datetime)

  final_json = {
      # metrics values are numerical
      "metrics": {
          # Traffic
          "num_prompts_attempted": benchmark_result['num_prompts_attempted'],
          "num_prompts_succeeded": benchmark_result['num_prompts_succeeded'],
          "request_rate": args.request_rate,
          'server_metrics': {
              **server_metrics
          },
          **benchmark_result,
          **errors,
      },
      # dimensions values are strings
      "dimensions": {
          "date": args.start_datetime.strftime('%Y%m%d-%H%M%S'),
          "backend": args.backend,
          "model_id": model,
          "tokenizer_id": args.tokenizer,
          # Caller-supplied extra dimensions arrive as a JSON string flag.
          **(json.loads(args.additional_metadata_metrics_to_save)
             if args.additional_metadata_metrics_to_save else {})
      },
      "config": {
          "model": model,
          "num_models": len(args.models.split(',')),
          "model_server": args.backend,
          "start_time": {
              "seconds": start_dt_proto.seconds,
              "nanos": start_dt_proto.nanos
          }
      },
      "summary_stats": {
          "stats": [{
              "request_rate": args.request_rate,
              "request_latency": _distribution_stats(benchmark_result, "latency"),
              "throughput": {
                  "mean": benchmark_result['throughput']
              },
              "input_length": _distribution_stats(benchmark_result, "input_len"),
              "output_length": _distribution_stats(benchmark_result, "output_len"),
              "tpot": _distribution_stats(benchmark_result, "per_output_token_latency"),
              "model_server_metrics": [
                  {"Name": name, **metrics}
                  for name, metrics in server_metrics.items()
              ],
          }]
      }
  }

  # Save to file; '/' in model ids would otherwise create directories.
  model_without_slash = model.replace("/", "-")
  file_name = (
      f"{args.file_prefix}-{args.backend}-{args.request_rate}qps-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json"
  )
  with open(file_name, "w", encoding="utf-8") as outfile:
    json.dump(final_json, outfile)

  # Best-effort upload: gcs_bucket is None when no output bucket was
  # configured; a missing bucket is reported but does not fail the run.
  if gcs_bucket is not None:
    try:
      gcs_bucket.blob(f"{args.output_bucket_filepath}/{file_name}").upload_from_filename(file_name)
      print(f"File {file_name} uploaded to gs://{args.output_bucket}/{args.output_bucket_filepath}")
    except google.cloud.exceptions.NotFound:
      print(f"GS Bucket (gs://{args.output_bucket}) does not exist")