in text-generation-inference/server/text_generation_server/jetstream_pt_support/generator.py [0:0]
def _post_generate(self, slot: Slot, next_token: int, generations: List[Generation]) -> None:
"""Post-generate a slot after the generation has been completed.
This will check if the slot is finished and append the generated text to the response.
Args:
slot (`Slot`):
The slot to post-generate.
next_token (`int`):
The next token generated by the model.
generations (`List[Generation]`):
The list of generations to append the slot to.
"""
    # prepare the generation response
    next_token_text = slot.append(next_token)
    generated_text = None
    finish_reason = None
    if next_token == self.tokenizer.eos_token_id:
        finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
    elif slot.stopped:
        if slot.generated_tokens == slot.max_new_tokens:
            finish_reason = FinishReason.FINISH_REASON_LENGTH
        else:
            finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
    request_id = slot.request_id
    if finish_reason is not None:
        # We must include the generated text for each finished sequence in the response
        generated_text = GeneratedText(
            text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
        )
        logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
        # This slot is now empty; it will be removed from the list of active slots.
        slot.clear()
    generations.append(
        Generation(
            request_id=request_id,
            prefill_tokens=None,
            tokens=Tokens(
                ids=[next_token],
                logprobs=[0],
                texts=[next_token_text],
                is_special=[next_token in self.special_tokens],
            ),
            generated_text=generated_text,
        )
    )
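
# A minimal usage sketch (an assumption, not part of the upstream file): in the decode
# loop, _post_generate would typically be called once per active slot with that slot's
# sampled token, accumulating one Generation per slot for the current step, e.g.:
#
#     generations: List[Generation] = []
#     for slot in active_slots:                          # `active_slots` is a hypothetical name
#         next_token = int(next_token_ids[slot.index])   # `next_token_ids`, `slot.index`: assumptions
#         self._post_generate(slot, next_token, generations)
#
# Finished slots are cleared inside _post_generate; every slot still contributes a
# Generation for this step, and `generated_text` is set only for the finished ones.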