# extra/slurm/benchmark.py (50 lines of code) (raw):
import os
import subprocess
# CPU cores requested per GPU; multiplied by the GPU count to obtain the
# value passed to sbatch's --cpus-per-task.
CPUS_PER_GPU = 20
# Memory requested per CPU, in gigabytes; passed to sbatch's --mem-per-cpu
# with a 'G' suffix appended.
MEM_PER_CPU_GB = 11
def _sbatch_command(job_name, gpus, engine):
    """Return the ``sbatch`` argv for one benchmark submission.

    The command describes a Slurm heterogeneous job: two resource
    specifications separated by a literal ``':'`` argument, followed by the
    batch script ``<engine>.slurm``.

    Args:
        job_name: Value passed to ``--job-name``.
        gpus: Number of GPUs requested for the first component; its CPU
            count is ``gpus * CPUS_PER_GPU``.
        engine: Engine name; selects the ``<engine>.slurm`` script.

    Returns:
        A list of strings suitable for ``subprocess.run``.
    """
    mem_per_cpu = f'{MEM_PER_CPU_GB}G'
    return [
        'sbatch',
        '--job-name', job_name,
        # %u / %x / %j are sbatch filename patterns, expanded by Slurm
        # rather than by Python, so this is a plain string and not an f-string.
        '--output', '/fsx/%u/logs/%x-%j.log',
        '--time', '1:50:00',
        '--qos', 'normal',
        '--partition', 'hopper-prod',
        # First component: sized by the model's GPU count.
        '--gpus', str(gpus),
        '--ntasks', '1',
        '--cpus-per-task', str(gpus * CPUS_PER_GPU),
        '--mem-per-cpu', mem_per_cpu,
        '--nodes', '1',
        ':',
        # Second component: a single GPU's worth of resources
        # (presumably for the load generator -- confirm in the .slurm scripts).
        '--gpus', '1',
        '--ntasks', '1',
        '--cpus-per-task', str(CPUS_PER_GPU),
        '--mem-per-cpu', mem_per_cpu,
        '--nodes', '1',
        f'{engine}.slurm',
    ]


def main():
    """Submit one Slurm benchmark job per (pass, model, engine) combination.

    For every model/GPU-count pair and every engine, ``sbatch`` is invoked
    with ``MODEL`` and ``TP`` added to a copy of the current environment.
    The captured stdout and stderr of each ``sbatch`` call are printed.

    Raises:
        SystemExit: With status 1 as soon as an ``sbatch`` invocation
            returns a non-zero exit code; remaining jobs are not submitted.
    """
    # (model id, number of GPUs); the GPU count is also exported as TP.
    models = [
        ('meta-llama/Llama-3.1-8B-Instruct', 1),
        ('meta-llama/Llama-3.1-70B-Instruct', 4),
        ('meta-llama/Llama-3.1-70B-Instruct', 2),
        ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
    ]
    num_passes = 1
    engines = ['tgi', 'vllm']

    for pass_index in range(num_passes):
        for model_id, gpus in models:
            print(f"PASS {pass_index} - Submitting job for {model_id}")

            # The environment depends only on the model, so build it once
            # and reuse it for every engine.
            env = os.environ.copy()
            env['MODEL'] = model_id
            env['TP'] = str(gpus)

            for engine in engines:
                # NOTE(review): the job name does not include the GPU count,
                # so the two 70B entries share a name; their logs differ
                # only by the %j component of the output path.
                job_name = (
                    f'bench_{model_id.replace("/", "_")}_{engine}'
                    f'_pass_{pass_index}'
                )
                args = _sbatch_command(job_name, gpus, engine)
                process = subprocess.run(args, capture_output=True, env=env)
                print(process.stdout.decode())
                print(process.stderr.decode())
                if process.returncode != 0:
                    print(f'Error while submitting :: {args}')
                    # The bare exit() builtin is injected by the site module
                    # for interactive use and may be absent (e.g. python -S);
                    # raising SystemExit is always available.
                    raise SystemExit(1)
# Submit the jobs only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()