in load_tests/benchmarks.py [0:0]
def main(sha, results_file):
    """Run TGI benchmarks for each configured model and aggregate the results.

    Parameters
    ----------
    sha : str
        Git commit SHA recorded in the benchmark metadata (``extra-meta``).
    results_file : str
        Path of the parquet file the aggregated results are written to.

    Side effects: starts/stops Docker containers (TGI server + benchmark
    runner), writes per-model JSON results under ``./results/`` next to this
    script, and exits the process with status 1 if any benchmark failed.
    """
    # Anchor the results directory to this script's location so behavior
    # does not depend on the current working directory.
    results_dir = os.path.join(os.path.dirname(__file__), "results")
    logger.info("Starting benchmark")
    # (model_id, tensor-parallel degree) pairs to benchmark.
    models = [
        ("meta-llama/Llama-3.1-8B-Instruct", 1),
        # ('meta-llama/Llama-3.1-70B-Instruct', 4),
        # ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
    ]
    success = True
    for model in models:
        tgi_runner = TGIDockerRunner(model[0])
        # One results sub-directory per model; '/' and '.' in model ids are
        # not filesystem-safe, so replace them with '_'.
        model_dir = os.path.join(
            results_dir, f'{model[0].replace("/", "_").replace(".", "_")}'
        )
        os.makedirs(model_dir, exist_ok=True)
        runner = BenchmarkRunner(
            volumes=[(model_dir, "/opt/text-generation-inference-benchmark/results")]
        )
        try:
            tgi_runner.run([("max-concurrent-requests", 512)], gpus=model[1])
            logger.info(f"TGI started for model {model[0]}")
            parameters = [
                ("tokenizer-name", model[0]),
                ("max-vus", 800),
                ("url", "http://localhost:8080"),
                ("duration", "120s"),
                ("warmup", "30s"),
                ("benchmark-kind", "rate"),
                (
                    "prompt-options",
                    "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
                ),
                (
                    "decode-options",
                    "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
                ),
                (
                    "extra-meta",
                    f'"engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}"',
                ),
                ("no-console", None),
            ]
            # Sweep request rates 0.8 .. 24.0 req/s in steps of 0.8.
            rates = [("rates", f"{r / 10.}") for r in range(8, 248, 8)]
            parameters.extend(rates)
            runner.run(parameters, f"container:{tgi_runner.container.id}")
        except Exception as e:
            logger.error(f"Error running benchmark for model {model[0]}: {e}")
            # Keep the full stack trace for debugging, then continue with the
            # remaining models instead of aborting the whole run.
            print(traceback.format_exc())
            success = False
        finally:
            # Always tear both containers down, even when the benchmark failed.
            tgi_runner.stop()
            runner.stop()
    if not success:
        logger.error("Some benchmarks failed")
        # raise SystemExit instead of the interactive-only builtin exit().
        raise SystemExit(1)
    df = pd.DataFrame()
    # Each sub-directory of results_dir holds one model's JSON result files.
    directories = [
        os.path.join(results_dir, d)
        for d in os.listdir(results_dir)
        if os.path.isdir(os.path.join(results_dir, d))
    ]
    logger.info(f"Found result directories: {directories}")
    for directory in directories:
        data_files = {}
        for filename in os.listdir(directory):
            if filename.endswith(".json"):
                # Key is the last dotted component before '.json'
                # (e.g. 'summary' for 'run.summary.json').
                # FIX: the original mapped every entry to the literal path
                # ".../(unknown)" instead of the actual result file.
                data_files[filename.split(".")[-2]] = os.path.join(
                    directory, filename
                )
        logger.info(f"Processing directory {directory}")
        df = pd.concat([df, build_df(os.path.basename(directory), data_files)])
    df["device"] = get_gpu_name()
    # Failed requests as a percentage of all requests.
    df["error_rate"] = (
        df["failed_requests"]
        / (df["failed_requests"] + df["successful_requests"])
        * 100.0
    )
    df.to_parquet(results_file)