in Benchmarks/AMD/LLMBenchmark.py [0:0]
def run_benchmark(self):
    for model_name in self.config['models']:
        model_cfg = self.config['models'][model_name]
        # Only benchmark models that are enabled and targeted at AMD GPUs.
        if not (model_cfg['use_model'] and model_cfg['type'] == "amd"):
            continue
        for tp_size in model_cfg['tp_sizes']:
            for max_num_seq in model_cfg['max_num_seqs']:
                # input_length and output_length are parallel lists: entry i gives one
                # prompt-length/completion-length pair to benchmark.
                for input_size, output_size in zip(model_cfg['input_length'], model_cfg['output_length']):
                    for request in model_cfg['num_requests']:
                        print(f"Benchmarking {model_name} | TP Size: {tp_size} | Input Size: {input_size} | Output Size: {output_size}")
                        run_benchmark_command = f'''
                            /bin/bash -c \
                            "python /app/vllm/benchmarks/benchmark_throughput.py \
                            --model amd/{model_name} \
                            --quantization fp8 \
                            --kv-cache-dtype fp8 \
                            --dtype half \
                            --gpu-memory-utilization 0.90 \
                            --distributed-executor-backend mp \
                            --num-scheduler-steps 10 \
                            --tensor-parallel-size {tp_size} \
                            --enable-chunked-prefill false \
                            --max-seq-len-to-capture 131072 \
                            --max-num-batched-tokens 131072 \
                            --max-model-len 8192 \
                            --max-num-seqs {max_num_seq} \
                            --num-prompts {request} \
                            --input-len {input_size} \
                            --output-len {output_size}"
                        '''
                        # Run the vLLM throughput benchmark inside the container and log its full output.
                        rb1 = self.container.exec_run(run_benchmark_command)
                        output = rb1.output.decode('utf-8')
                        tools.write_log(output)

                        # Extract the tokens/sec figure from the "Throughput: ..." summary line.
                        result = None
                        for line in output.split('\n'):
                            if "Throughput: " in line:
                                result = line.split(' ')[6]

                        # Print a per-run summary table and append the row to the per-machine CSV.
                        table1 = PrettyTable()
                        table1.add_row(['Model Name', model_name])
                        table1.add_row(['Input/Output lengths', str(input_size) + "/" + str(output_size)])
                        table1.add_row(['World Size (TP size)', str(tp_size)])
                        table1.add_row(['Throughput (tokens/sec)', str(result)])
                        print(table1.get_string(header=False))
                        self.save_data([model_name, str(input_size), str(output_size), str(tp_size), str(result)],
                                       'Outputs/LLMBenchmark_' + self.machine + '.csv')

    # Tear down the benchmark container once every configured run has finished.
    self.container.kill()
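
For reference, below is a minimal sketch of the `self.config['models']` shape that run_benchmark() dereferences. Only the keys are taken from the code above; the model name and every value are illustrative assumptions, not part of the repository's actual configuration.

# Hypothetical example of self.config, sketched only from the keys read in run_benchmark().
# All values here (model name, sizes, request counts) are assumptions for illustration.
example_config = {
    'models': {
        'Llama-3.1-70B-Instruct-FP8-KV': {      # passed to the benchmark as --model amd/<name>
            'use_model': True,                  # False skips the model entirely
            'type': 'amd',                      # only entries of type "amd" are benchmarked here
            'tp_sizes': [4, 8],                 # swept as --tensor-parallel-size
            'max_num_seqs': [256],              # swept as --max-num-seqs
            'input_length': [128, 2048],        # parallel lists: input_length[i] is paired
            'output_length': [128, 2048],       #   with output_length[i]
            'num_requests': [1000],             # swept as --num-prompts
        }
    }
}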