# fastchat/serve/huggingface_api_worker.py
def generate_stream_gate(self, params):
    """Stream generated text from a HuggingFace Inference API endpoint.

    Yields null-byte-terminated JSON payloads of the form
    ``{"text": ..., "error_code": 0, "finish_reason": ...}``. On any
    failure a single payload carrying ``ErrorCode.INTERNAL_ERROR`` is
    yielded instead of raising, so the HTTP layer can forward the error.

    Args:
        params: Request dict; must contain "prompt" plus whatever
            generation options ``get_gen_kwargs`` consumes.
    """
    self.call_ct += 1

    prompt = params["prompt"]
    gen_kwargs = get_gen_kwargs(params, seed=self.seed)
    stop = gen_kwargs["stop_sequences"]
    # Falcon chat models need extra conversation-delimiter stop strings.
    if "falcon" in self.model_path and "chat" in self.model_path:
        stop.extend(["\nUser:", "<|endoftext|>", " User:", "###"])
        stop = list(set(stop))
        gen_kwargs["stop_sequences"] = stop
    logger.info(f"prompt: {prompt}")
    logger.info(f"gen_kwargs: {gen_kwargs}")

    try:
        # An empty model_path means the api_base already addresses a
        # concrete deployed endpoint; otherwise append the model id.
        if self.model_path == "":
            url = f"{self.api_base}"
        else:
            url = f"{self.api_base}/{self.model_path}"
        client = InferenceClient(url, token=self.token)
        res = client.text_generation(
            prompt, stream=True, details=True, **gen_kwargs
        )

        reason = None
        text = ""
        for chunk in res:
            if chunk.token.special:
                continue
            text += chunk.token.text

            # A completed stop sequence: strip it and end the stream.
            s = next((x for x in stop if text.endswith(x)), None)
            if s is not None:
                text = text[: -len(s)]
                reason = "stop"
                break

            # Hold back output that might still grow into a stop string.
            if could_be_stop(text, stop):
                continue

            if (
                chunk.details is not None
                and chunk.details.finish_reason is not None
            ):
                reason = chunk.details.finish_reason
            if reason not in ["stop", "length"]:
                reason = None
            ret = {
                "text": text,
                "error_code": 0,
                "finish_reason": reason,
            }
            yield json.dumps(ret).encode() + b"\0"

        # BUG FIX: previously nothing was emitted after the loop, so when
        # a stop sequence triggered `break` (or the last chunk was held
        # back by could_be_stop) the truncated text and its finish_reason
        # never reached the client. Always emit one final payload.
        ret = {
            "text": text,
            "error_code": 0,
            "finish_reason": reason,
        }
        yield json.dumps(ret).encode() + b"\0"
    except Exception as e:
        ret = {
            "text": f"{SERVER_ERROR_MSG}\n\n({e})",
            "error_code": ErrorCode.INTERNAL_ERROR,
        }
        yield json.dumps(ret).encode() + b"\0"