in benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
def generate_request(prompt):
    """Generates the request payload for the configured model server backend."""
    global model_params
    backend = model_params["backend"]
    best_of = model_params["best_of"]
    output_len = model_params["max_output_len"]
    use_beam_search = model_params["use_beam_search"]
    sax_model = model_params["sax_model"]
    if backend == "vllm":
        # vLLM native API payload.
        pload = {
            "prompt": prompt,
            "n": 1,
            "best_of": best_of,
            "use_beam_search": use_beam_search,
            "temperature": 0.0 if use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": output_len,
            "ignore_eos": False,
            "stream": False,
        }
    elif backend == "tgi":
        # Hugging Face Text Generation Inference (TGI) payload.
        params = {
            "best_of": best_of,
            "max_new_tokens": output_len,
            "do_sample": True,
        }
        pload = {
            "inputs": prompt,
            "parameters": params,
        }
    elif backend == "tensorrt_llm_triton":
        # TensorRT-LLM served through Triton Inference Server.
        pload = {
            "text_input": prompt,
            "max_tokens": output_len,
            "beam_width": 1 if not use_beam_search else best_of,
            "temperature": 0.0 if use_beam_search else 1.0,
            "top_p": 1.0,
            "bad_words": "",
            "stop_words": "",
            "stream": False,
        }
    elif backend == "sax":
        # Saxml (SAX) model server payload.
        pload = {
            "model": sax_model,
            "prompt": prompt,
            "n": 1,
            "best_of": best_of,
            "use_beam_search": use_beam_search,
            "temperature": 0.0 if use_beam_search else 1.0,
            "top_p": 1.0,
            "top_k": 50,
            "max_tokens": output_len,
            "stream": False,
        }
    elif backend == "jetstream":
        # JetStream payload.
        pload = {
            "prompt": prompt,
            "max_tokens": output_len,
        }
    else:
        raise ValueError(f"Unknown backend: {backend}")
    return pload
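

# --- Illustrative usage sketch (not part of the original file) ---
# A minimal example of how generate_request() could be driven from a Locust
# task. The model_params values, the BenchmarkUser class, and the "/generate"
# endpoint below are assumptions for illustration only; the real tasks.py
# populates model_params from CLI flags and targets the endpoint appropriate
# to each backend.
from locust import HttpUser, task

model_params = {
    "backend": "vllm",          # assumed backend for this sketch
    "best_of": 1,
    "max_output_len": 256,
    "use_beam_search": False,
    "sax_model": "",
}

class BenchmarkUser(HttpUser):
    @task
    def send_request(self):
        pload = generate_request("Explain quantization in one sentence.")
        # Hypothetical route: vLLM's native server exposes /generate, but
        # other backends use different paths.
        self.client.post("/generate", json=pload)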