Benchmarks/AMD/LLMBenchmark.py (93 lines of code):

import csv
import json
import os

import docker
from prettytable import PrettyTable

from Infra import tools


class LLMBenchmark:
    def __init__(self, config_path: str, dir_path: str, machine: str):
        self.name = "LLMBenchmark"
        self.config = self.get_config(config_path)
        self.dir_path = dir_path
        self.precision = "half"
        self.container = None
        self.machine = machine

    def get_config(self, path: str):
        # Load the JSON config and return the section belonging to this benchmark
        with open(path) as file:
            data = json.load(file)
        try:
            return data[self.name]
        except KeyError:
            raise KeyError(f"No '{self.name}' section found in config file: {path}")

    def create_container(self):
        client = docker.from_env()

        # Define the Docker run options
        docker_run_options = {
            'ipc_mode': 'host',
            'network': 'host',
            'entrypoint': '/bin/bash',
            'group_add': ['render'],
            'privileged': True,
            'security_opt': ['seccomp=unconfined'],
            'cap_add': ['CAP_SYS_ADMIN', 'SYS_PTRACE'],
            'devices': ['/dev/kfd', '/dev/dri', '/dev/mem'],
            'volumes': {str(self.dir_path): {'bind': str(self.dir_path), 'mode': 'rw'}},
            'environment': {'HF_HOME': str(self.dir_path)},
            'tty': True,
            'detach': True
        }

        # Pull the image and start a new detached container
        print("Pulling docker container rocm/vllm-dev:20241121-tuned")
        self.container = client.containers.run('rocm/vllm-dev:20241121-tuned', **docker_run_options)
        print(f"Docker Container ID: {self.container.id}")

    def run_benchmark(self):
        for model_name in self.config['models']:
            model_cfg = self.config['models'][model_name]
            if model_cfg['use_model'] and model_cfg['type'] == "amd":
                for tp_size in model_cfg['tp_sizes']:
                    for max_num_seq in model_cfg['max_num_seqs']:
                        for i in range(len(model_cfg['input_length'])):
                            for request in model_cfg['num_requests']:
                                input_size = model_cfg['input_length'][i]
                                output_size = model_cfg['output_length'][i]
                                print(f"Benchmarking {model_name} | TP Size: {tp_size} | Input Size: {input_size} | Output Size: {output_size}")

                                run_benchmark_command = f'''
                                /bin/bash -c \
                                "python /app/vllm/benchmarks/benchmark_throughput.py \
                                --model amd/{model_name} \
                                --quantization fp8 \
                                --kv-cache-dtype fp8 \
                                --dtype half \
                                --gpu-memory-utilization 0.90 \
                                --distributed-executor-backend mp \
                                --num-scheduler-steps 10 \
                                --tensor-parallel-size {tp_size} \
                                --enable-chunked-prefill false \
                                --max-seq-len-to-capture 131072 \
                                --max-num-batched-tokens 131072 \
                                --max-model-len 8192 \
                                --max-num-seqs {max_num_seq} \
                                --num-prompts {request} \
                                --input-len {input_size} \
                                --output-len {output_size}"
                                '''

                                # Run the benchmark inside the container and log its full output
                                rb1 = self.container.exec_run(run_benchmark_command)
                                tools.write_log(rb1.output.decode('utf-8'))

                                # Parse the throughput figure (tokens/sec) from the summary line
                                result = None
                                for line in rb1.output.decode('utf-8').split('\n'):
                                    if "Throughput: " in line:
                                        result = line.split(' ')[6]

                                table1 = PrettyTable()
                                table1.add_row(['Model Name', model_name])
                                table1.add_row(['Input/Output lengths', str(input_size) + "/" + str(output_size)])
                                table1.add_row(['World Size (TP size)', str(tp_size)])
                                table1.add_row(['Throughput (tokens/sec)', str(result)])
                                print(table1.get_string(header=False))

                                self.save_data([model_name, str(input_size), str(output_size), str(tp_size), str(result)],
                                               'Outputs/LLMBenchmark_' + self.machine + '.csv')
        self.container.kill()

    def save_data(self, data, file_path):
        file_exists = os.path.exists(file_path)

        # Open the file in append mode if it exists, otherwise create it
        with open(file_path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if not file_exists:
                writer.writerow(["Model_name", "Input_length", "Output_length", "TP_size", "Tokens per sec"])
            writer.writerow(data)
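
Below is a minimal usage sketch, not part of the file above. It assumes the import path implied by the file's location (Benchmarks.AMD.LLMBenchmark); the config path, cache directory, and machine label are placeholders.

# Hypothetical driver script; the config file, cache directory, and machine
# label are placeholders, not defined by the file above.
from Benchmarks.AMD.LLMBenchmark import LLMBenchmark

benchmark = LLMBenchmark(
    config_path="config.json",    # JSON file containing an "LLMBenchmark" section
    dir_path="/data/hf_cache",    # mounted into the container and exported as HF_HOME
    machine="MI300X",             # suffix for Outputs/LLMBenchmark_<machine>.csv
)
benchmark.create_container()      # pulls rocm/vllm-dev:20241121-tuned and starts it
benchmark.run_benchmark()         # runs benchmark_throughput.py per configured model, appends results to CSV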