in optimum_benchmark/trackers/latency.py [0:0]
def get_generate_latency(self) -> Latency:
    """Return the end-to-end generation latency (prefill start -> decode end).

    Pairs each prefill start event with the corresponding decode end event so
    that every latency value covers a full generate call (prefill + decode).

    Returns:
        Latency: latencies in seconds (LATENCY_UNIT), one per generate call.

    Raises:
        AssertionError: if event lists are empty, mismatched, or if any
            computed latency is negative (e.g. clock skew / too few warmups).
    """
    # Each recorded start must have a matching end, and there must be at least one pair.
    assert len(self.prefill_start_events) == len(self.prefill_end_events) > 0
    assert len(self.decode_start_events) == len(self.decode_end_events) > 0
    # The zip below pairs prefill starts with decode ends; guard against a
    # count mismatch that would otherwise silently truncate the pairing.
    assert len(self.prefill_start_events) == len(self.decode_end_events)

    if self.is_pytorch_cuda:
        # CUDA events are recorded asynchronously; synchronize before reading
        # elapsed times. elapsed_time returns milliseconds, hence / 1e3.
        torch.cuda.synchronize()
        latencies = [
            start_event.elapsed_time(end_event) / 1e3
            for start_event, end_event in zip(self.prefill_start_events, self.decode_end_events)
        ]
    else:
        # Non-CUDA backends store plain timestamps (seconds); subtract directly.
        latencies = [
            (end_event - start_event)
            for start_event, end_event in zip(self.prefill_start_events, self.decode_end_events)
        ]

    assert all(latency >= 0 for latency in latencies), (
        "Found some negative latencies while performing subtraction. "
        "Please increase the dimensions of your benchmark or the number of warmup runs."
    )

    return Latency.from_values(latencies, unit=LATENCY_UNIT)