in optimum/executorch/stats.py
def print_report(self):
"""Print a report of the stats, similar to the C++ implementation."""
print(
"\n⚠️ DISCLAIMER: Python-based perf measurements are approximate and may not "
"match absolute speeds on Android/iOS apps. They are intended for relative "
"comparisons—-e.g. SDPA vs. custom SDPA, FP16 vs. FP32—-so you can gauge "
"performance improvements from each optimization step. For end-to-end, "
"platform-accurate benchmarks, please use the official ExecuTorch apps:\n"
" • iOS: https://github.com/pytorch/executorch/tree/main/extension/benchmark/apple/Benchmark\n"
" • Android: https://github.com/pytorch/executorch/tree/main/extension/benchmark/android/benchmark\n"
)
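    # All *_ms fields are millisecond timestamps/durations; dividing by
    # SCALING_FACTOR_UNITS_PER_SECOND (assumed to be 1000) converts them to
    # seconds, and throughputs are reported in tokens/second.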
print(f"PyTorchObserver {self.to_json_string()}")
print(f"\tPrompt Tokens: {self.num_prompt_tokens} Generated Tokens: {self.num_generated_tokens}")
model_load_time = (self.model_load_end_ms - self.model_load_start_ms) / self.SCALING_FACTOR_UNITS_PER_SECOND
print(f"\tModel Load Time:\t\t{model_load_time:.6f} (seconds)")
inference_time_ms = self.inference_end_ms - self.inference_start_ms
inference_time = inference_time_ms / self.SCALING_FACTOR_UNITS_PER_SECOND
if inference_time_ms > 0 and self.num_generated_tokens > 0:
inference_rate = (self.num_generated_tokens / inference_time_ms) * self.SCALING_FACTOR_UNITS_PER_SECOND
else:
inference_rate = 0
print(
f"\tTotal inference time:\t\t{inference_time:.6f} (seconds)\t\t Rate: \t{inference_rate:.6f} (tokens/second)"
)
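    # Prompt evaluation (prefill): time from the start of inference until the
    # prompt has been processed, with throughput over the prompt tokens.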
    prompt_eval_time = (self.prompt_eval_end_ms - self.inference_start_ms) / self.SCALING_FACTOR_UNITS_PER_SECOND
    if (self.prompt_eval_end_ms - self.inference_start_ms) > 0 and self.num_prompt_tokens > 0:
        prompt_eval_rate = (
            self.num_prompt_tokens / (self.prompt_eval_end_ms - self.inference_start_ms)
        ) * self.SCALING_FACTOR_UNITS_PER_SECOND
    else:
        prompt_eval_rate = 0
    print(
        f"\t\tPrompt evaluation:\t{prompt_eval_time:.6f} (seconds)\t\t Rate: \t{prompt_eval_rate:.6f} (tokens/second)"
    )
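    # Token generation (decode): time from the end of prompt evaluation until
    # inference finished, with throughput over the generated tokens.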
    eval_time = (self.inference_end_ms - self.prompt_eval_end_ms) / self.SCALING_FACTOR_UNITS_PER_SECOND
    if (self.inference_end_ms - self.prompt_eval_end_ms) > 0 and self.num_generated_tokens > 0:
        eval_rate = (
            self.num_generated_tokens / (self.inference_end_ms - self.prompt_eval_end_ms)
        ) * self.SCALING_FACTOR_UNITS_PER_SECOND
    else:
        eval_rate = 0
    print(
        f"\t\tGenerated {self.num_generated_tokens} tokens:\t{eval_time:.6f} (seconds)\t\t Rate: \t{eval_rate:.6f} (tokens/second)"
    )
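    # Latency until the first generated token, plus total time spent in sampling.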
    time_to_first_token = (self.first_token_ms - self.inference_start_ms) / self.SCALING_FACTOR_UNITS_PER_SECOND
    print(f"\tTime to first generated token:\t{time_to_first_token:.6f} (seconds)")
    sampling_time = self.aggregate_sampling_time_ms / self.SCALING_FACTOR_UNITS_PER_SECOND
    print(
        f"\tSampling time over {self.num_prompt_tokens + self.num_generated_tokens} tokens:\t{sampling_time:.6f} (seconds)"
    )
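
# Illustrative usage (a sketch, not part of the library): the class name Stats
# and the way its timing fields are populated are assumptions here; only the
# attributes read in print_report() above are known from this file. Assuming
# SCALING_FACTOR_UNITS_PER_SECOND == 1000 (milliseconds -> seconds):
#
#     stats = Stats()
#     stats.num_prompt_tokens = 16
#     stats.num_generated_tokens = 50
#     stats.model_load_start_ms = 0
#     stats.model_load_end_ms = 1200       # 1.2 s model load
#     stats.inference_start_ms = 0
#     stats.prompt_eval_end_ms = 500       # prefill: 16 tokens / 500 ms  -> 32 tokens/second
#     stats.first_token_ms = 520           # time to first token: 0.52 s
#     stats.inference_end_ms = 3000        # decode: 50 tokens / 2500 ms -> 20 tokens/second
#     stats.aggregate_sampling_time_ms = 30
#     stats.print_report()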