def run_per_token_text_generation_latency_tracking()

in optimum_benchmark/scenarios/inference/scenario.py


    def run_per_token_text_generation_latency_tracking(self):
        self.logger.info("\t+ Running Per-Token Text Generation latency tracking")

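        # call generate() repeatedly inside one tracker session, until both the configured
        # duration and the configured number of iterations have been reached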
        with self.per_token_latency_tracker.session():
            while (
                self.per_token_latency_tracker.elapsed() < self.config.duration
                or self.per_token_latency_tracker.count() < self.config.iterations
            ):
                with self.per_token_latency_tracker.track():
                    self.backend.generate(self.inputs, self.config.generate_kwargs)

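        # collect the latency distributions recorded during the session: per-token (inter-token),
        # the whole generate call, prefill (up to the first generated token) and decode (the rest)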
        per_token_latency = self.per_token_latency_tracker.get_per_token_latency()
        generate_latency = self.per_token_latency_tracker.get_generate_latency()
        prefill_latency = self.per_token_latency_tracker.get_prefill_latency()
        decode_latency = self.per_token_latency_tracker.get_decode_latency()

        self.report.per_token.latency = per_token_latency
        self.report.generate.latency = generate_latency
        self.report.prefill.latency = prefill_latency
        self.report.decode.latency = decode_latency

        # we don't register a per-token throughput: it's a confusing metric
        # and carries the same signal as the decode throughput
        self.report.generate.throughput = Throughput.from_latency(
            generate_latency, self.atomic_decode_volume, unit=GENERATE_THROUGHPUT_UNIT
        )
        self.report.prefill.throughput = Throughput.from_latency(
            prefill_latency, self.atomic_prefill_volume, unit=PREFILL_THROUGHPUT_UNIT
        )
        self.report.decode.throughput = Throughput.from_latency(
            decode_latency, self.atomic_decode_volume, unit=DECODE_THROUGHPUT_UNIT
        )
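
For context, the session()/track() calls above follow a plain context-manager timing pattern. Below is a minimal sketch of such a tracker, assuming wall-clock timing with time.perf_counter; it is an illustration only, not the library's actual implementation (the real per-token tracker additionally records per-token timings during generation, which this sketch omits).

    import time
    from contextlib import contextmanager


    class LatencyTrackerSketch:
        """Hypothetical, simplified tracker: it times whole calls only and has no per-token split."""

        def __init__(self):
            self.session_start = None
            self.latencies = []

        @contextmanager
        def session(self):
            # reset state and start the session clock
            self.session_start = time.perf_counter()
            self.latencies = []
            yield

        @contextmanager
        def track(self):
            # time one tracked call (here: one backend.generate call)
            start = time.perf_counter()
            yield
            self.latencies.append(time.perf_counter() - start)

        def elapsed(self) -> float:
            return time.perf_counter() - self.session_start

        def count(self) -> int:
            return len(self.latencies)

Throughput.from_latency then turns a latency measurement plus a token volume into a rate. A plausible reading, sketched below with hypothetical Latency/Throughput stand-ins (the values/mean fields and the exact signature are assumptions, not the library's verified API), is volume divided by mean latency:

    from dataclasses import dataclass
    from typing import List


    @dataclass
    class LatencySketch:
        values: List[float]  # measured latencies, in seconds (hypothetical shape)

        @property
        def mean(self) -> float:
            return sum(self.values) / len(self.values)


    @dataclass
    class ThroughputSketch:
        value: float
        unit: str

        @classmethod
        def from_latency(cls, latency: LatencySketch, volume: int, unit: str) -> "ThroughputSketch":
            # volume processed per second of mean latency, e.g. decoded tokens per second
            return cls(value=volume / latency.mean, unit=unit)


    # example: 100 decoded tokens per call at a mean decode latency of 2.0 s -> 50 tokens/s
    print(ThroughputSketch.from_latency(LatencySketch([1.9, 2.1]), volume=100, unit="tokens/s"))

Note that the generate throughput reuses the decode volume: it reports decoded tokens per second over the full generate call, prefill included.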