in debug.py [0:0]
def __init__(self, message: str, p: processor.Processor) -> None:
    """Wraps a processor to provide performance messaging.

    Should only be used for processors that consume their entire input before
    producing output (such as non-streaming or unidirectional/single streaming
    model calls). The TTFT (time to first token) is estimated by first waiting
    until the input stream has been completely sent to the processor (the
    `start` time is then set). When the processor outputs its first token, the
    duration since `start` is reported.

    In a bidirectional streaming setup, the TTFT will not be reported at all.

    Args:
        message: header of the status chunk that will be returned. It is used
            to identify different calls to this function.
        p: processor for which we need to compute ttft.
    """
    self._message = message
    self._p = p
    # No measurement exists at construction time: both the start timestamp
    # and the computed TTFT begin as None.
    self._start = None
    self._ttft = None
    # A newly created asyncio.Event starts in the unset state, so no explicit
    # clear() call is needed here.
    # NOTE(review): presumably set when the wrapped model call begins --
    # confirm against the methods that use it.
    self._model_call_event = asyncio.Event()