in optimum_benchmark/scenarios/inference/scenario.py [0:0]
def run_text_generation_latency_tracking(self):
    self.logger.info("\t+ Running Text Generation latency tracking")

    # prefill overrides restrict generation so that only the prefill step is measured
    prefill_kwargs = {**self.config.generate_kwargs, **TEXT_GENERATION_PREFILL_OVERRIDES}

    with self.latency_tracker.session():
        # keep measuring until both the duration and the iteration minimums are met
        while (
            self.latency_tracker.elapsed() < self.config.duration
            or self.latency_tracker.count() < self.config.iterations
        ):
            with self.latency_tracker.track():
                self.backend.prefill(self.inputs, prefill_kwargs)

    prefill_latency = self.latency_tracker.get_latency()
    self.report.prefill.latency = prefill_latency
    self.report.prefill.throughput = Throughput.from_latency(
        prefill_latency, self.atomic_prefill_volume, unit=PREFILL_THROUGHPUT_UNIT
    )

    with self.latency_tracker.session():
        # same stopping rule for the full generation (prefill + decode)
        while (
            self.latency_tracker.elapsed() < self.config.duration
            or self.latency_tracker.count() < self.config.iterations
        ):
            with self.latency_tracker.track():
                self.backend.generate(self.inputs, self.config.generate_kwargs)

    generate_latency = self.latency_tracker.get_latency()
    self.report.generate.latency = generate_latency
    self.report.generate.throughput = Throughput.from_latency(
        generate_latency, self.atomic_generate_volume, unit=GENERATE_THROUGHPUT_UNIT
    )

    # decode latency is derived, not timed directly: generate = prefill + decode
    decode_latency = generate_latency - prefill_latency
    self.report.decode.latency = decode_latency
    self.report.decode.throughput = Throughput.from_latency(
        decode_latency, self.atomic_decode_volume, unit=DECODE_THROUGHPUT_UNIT
    )
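
The method times prefill and full generation separately, keeps looping until both the duration and the iteration minimums are satisfied, and derives the decode phase by subtracting the prefill measurements from the generate measurements. The sketch below is a minimal, self-contained illustration of that logic under simplifying assumptions: the `measure` helper, the `time.sleep` stand-ins for `backend.prefill`/`backend.generate`, and the plain-float arithmetic are hypothetical and do not use the library's `LatencyTracker`/`Throughput` objects.

```python
# Minimal sketch of the measurement logic above (hypothetical helper names,
# plain floats instead of the library's LatencyTracker/Throughput API).
import time
from statistics import mean


def measure(fn, duration_s: float, min_iterations: int) -> float:
    """Call `fn` until BOTH the duration and the iteration minimums are met
    (the loop continues while either one is still unmet), then return the
    mean per-call latency in seconds."""
    latencies, start = [], time.perf_counter()
    while (time.perf_counter() - start) < duration_s or len(latencies) < min_iterations:
        t0 = time.perf_counter()
        fn()
        latencies.append(time.perf_counter() - t0)
    return mean(latencies)


# Hypothetical stand-ins for backend.prefill / backend.generate.
prefill_latency = measure(lambda: time.sleep(0.01), duration_s=0.1, min_iterations=5)
generate_latency = measure(lambda: time.sleep(0.05), duration_s=0.1, min_iterations=5)

# Decode is derived rather than timed directly: generate = prefill + decode,
# and throughput is the token volume divided by the corresponding latency.
decode_latency = generate_latency - prefill_latency
decode_tokens = 128  # assumed number of newly decoded tokens per call
print(f"decode throughput ~= {decode_tokens / decode_latency:.1f} tokens/s")
```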