Benchmarks/AMD/RCCLBandwidth.py (105 lines of code) (raw):

import csv
import json
import os
from itertools import zip_longest

import docker
from prettytable import PrettyTable

from Infra import tools


class RCCLBandwidth:
    """Run the rccl-tests ``all_reduce_perf`` benchmark inside a ROCm container.

    The benchmark is executed once per entry in ``ALGORITHMS`` (passed to the
    binary through the ``NCCL_ALGO`` environment variable). One value per
    message size is collected for each run, printed as a table, and written
    to ``Outputs/RCCLBandwidth_<machine>.csv``.

    Typical call order, as implied by the methods' use of ``self.container``:
    ``create_container()`` -> ``build()`` -> ``run()``.
    """

    # Docker image the benchmark container is started from.
    IMAGE = 'rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0_triton_llvm_reg_issue'

    # Values passed as NCCL_ALGO, in the order their columns are reported.
    ALGORITHMS = ("Tree", "Ring", "NVLS", "NVLSTree")

    # Row labels for the sweep requested by "-b 8 -e 8G -f 2" in run().
    MESSAGE_SIZES = (
        "8 ", "16 ", "32 ", "64 ", "128 ", "256 ", "512 ", "1K", "2K", "4K",
        "8K", "16K", "32K", "65K", "132K", "256K", "524K", "1M", "2M", "4M",
        "8M", "16M", "33M", "67M", "134M", "268M", "536M", "1G", "2G", "4G",
        "8G",
    )

    # Column headers shared by the printed table and the CSV file.
    HEADERS = ("Message Size",) + ALGORITHMS

    def __init__(self, config_path: str, dir_path: str, machine: str):
        """Load the benchmark configuration and initialise empty run state.

        Args:
            config_path: Path to a JSON file containing an ``RCCLBandwidth``
                section.
            dir_path: Directory that is bind-mounted into the container at the
                same path and used as the checkout/build location.
            machine: Machine name, used only in the output CSV file name.

        Raises:
            KeyError: If the config file has no ``RCCLBandwidth`` section or
                the section lacks one of the expected ``inputs`` keys.
        """
        self.name = 'RCCLBandwidth'
        self.machine_name = machine
        config = self.get_config(config_path)
        # NOTE(review): start/end/num_gpus are read from the config but the
        # command built in run() hard-codes "-b 8 -e 8G -g 8" -- confirm
        # whether these values are meant to drive the command line.
        self.start, self.end, self.num_gpus = self.config_conversion(config)
        self.dir_path = dir_path
        self.container = None
        self.buffer = []

    def get_config(self, path: str):
        """Return the section of the JSON file at *path* keyed by ``self.name``.

        Raises:
            KeyError: If the file has no entry for ``self.name``.
        """
        # Context manager so the handle is closed even if json.load() raises.
        with open(path) as file:
            data = json.load(file)
        try:
            return data[self.name]
        except KeyError as err:
            raise KeyError("no value found") from err

    def parse_json(self, config):
        """Return ``(start, end, num_gpus)`` from ``config['inputs']``."""
        inputs = config['inputs']
        return inputs['start'], inputs['end'], inputs['num_gpus']

    def config_conversion(self, config) -> tuple[list, list, list]:
        """Convert the raw config section into ``(start, end, num_gpus)``."""
        return self.parse_json(config)

    def create_container(self):
        """Start a detached container from ``IMAGE`` and store it on ``self``.

        ``self.dir_path`` is bind-mounted read-write at the same path inside
        the container, so paths built from it are valid on both sides.
        """
        client = docker.from_env()

        # Define the Docker run options
        docker_run_options = {
            'ipc_mode': 'host',
            'entrypoint': '/bin/bash',
            'network': 'host',
            'group_add': ['render'],
            'privileged': True,
            'security_opt': ['seccomp=unconfined'],
            'cap_add': ['CAP_SYS_ADMIN', 'SYS_PTRACE'],
            'devices': ['/dev/kfd', '/dev/dri', '/dev/mem'],
            'volumes': {str(self.dir_path): {'bind': str(self.dir_path), 'mode': 'rw'}},
            'tty': True,
            'detach': True
        }

        # Creates new Docker container from https://hub.docker.com/r/rocm/pytorch/tags
        print(f"Pulling docker container {self.IMAGE}...")
        self.container = client.containers.run(self.IMAGE, **docker_run_options)
        print(f"Docker Container ID: {self.container.id}")

    def _exec_logged(self, command):
        """Run *command* in the container; log its output on a non-zero exit.

        Returns the result object from ``exec_run`` so callers can inspect
        ``exit_code`` and ``output``.
        """
        results = self.container.exec_run(command, stderr=True)
        if results.exit_code != 0:
            tools.write_log(results.output.decode('utf-8'))
        return results

    def build(self):
        """Clone and build rccl and rccl-tests under ``self.dir_path``.

        Each component is skipped when its directory already exists. Failures
        are logged through ``tools.write_log`` and do not abort the remaining
        steps.
        """
        # The checkout lives in dir_path, which is bind-mounted at the same
        # path on the host, so test for it there rather than relative to the
        # current working directory of this process.
        rccl_dir = os.path.join(str(self.dir_path), 'rccl')
        if not os.path.isdir(rccl_dir):
            print("Building RCCL Library...")
            self._exec_logged(
                "git clone https://github.com/ROCm/rccl.git " + self.dir_path + "/rccl"
            )
            self._exec_logged(
                f'/bin/sh -c "cd {self.dir_path}/rccl && cmake . && make"'
            )
            # A trailing `cd ..` exec was dropped here: every exec_run starts
            # a fresh shell, so changing directory had no lasting effect.

        tests_dir = os.path.join(str(self.dir_path), 'rccl-tests')
        if not os.path.isdir(tests_dir):
            print("Building RCCL Tests...")
            self._exec_logged(
                "git clone https://github.com/ROCm/rccl-tests.git " + self.dir_path + "/rccl-tests"
            )
            self._exec_logged(
                f'/bin/sh -c "cd {self.dir_path}/rccl-tests && make HIP_HOME=/opt/rocm NCCL_HOME={self.dir_path}/rccl CUSTOM_RCCL_LIB={self.dir_path}/rccl/librccl.so && make MPI=1 MPI_HOME=/opt/ompi HIP_HOME=/opt/rocm NCCL_HOME={self.dir_path}/rccl"'
            )

    def _collect_columns(self):
        """Run the benchmark once per algorithm and gather the result columns.

        Returns:
            A list of columns (message-size labels first, then one column per
            entry in ``ALGORITHMS``), padded with ``""`` to equal length; or
            ``None`` if any benchmark command exits non-zero (its output is
            logged first).
        """
        columns = [list(self.MESSAGE_SIZES)]
        for algo in self.ALGORITHMS:
            run_cmd = "NCCL_ALGO=" + algo + " " + self.dir_path + "/rccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 8 -n 40 | grep float"
            run_cmd = '/bin/sh -c "' + run_cmd + '"'
            results = self._exec_logged(run_cmd)
            if results.exit_code != 0:
                return None

            # Keep field 11 of every 13-field line that grep let through.
            # NOTE(review): presumably the in-place bus bandwidth column of
            # the rccl-tests report -- confirm against the tool's output.
            fields_per_line = (
                line.split() for line in results.output.decode('utf-8').split('\n')
            )
            columns.append(
                [fields[11] for fields in fields_per_line if len(fields) == 13]
            )

        # A run that reports fewer (or more) rows than expected would make
        # PrettyTable.add_column raise and save() index out of range, so pad
        # every column to the longest one.
        depth = max(len(column) for column in columns)
        for column in columns:
            column.extend([""] * (depth - len(column)))
        return columns

    def run(self):
        """Execute the all-reduce sweep, print the table and save the CSV.

        The container is killed when the sweep finishes, including when a
        benchmark command fails or an exception is raised. On a failed
        benchmark command nothing is printed or saved and ``self.buffer`` is
        left unchanged.
        """
        print("Running RCCL AllReduce...")
        try:
            columns = self._collect_columns()
            if columns is None:
                return

            table = PrettyTable()
            for header, column in zip(self.HEADERS, columns):
                table.add_column(header, column)
            print(table)
            self.buffer = columns
        finally:
            # Always release the container; previously the early return on a
            # failed run left it running.
            self.container.kill()
        self.save()

    def save(self):
        """Write ``self.buffer`` to ``Outputs/RCCLBandwidth_<machine>.csv``.

        ``self.buffer`` is a list of columns; each CSV row holds the i-th
        entry of every column. Columns of unequal length are padded with
        empty strings, and an empty buffer produces a header-only file.
        """
        os.makedirs('Outputs', exist_ok=True)
        out_path = 'Outputs/RCCLBandwidth_' + self.machine_name + '.csv'
        # newline='' is what the csv module asks for so that the writer alone
        # controls line endings.
        with open(out_path, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(["Message Size", "Tree", "Ring", "NVLS", "NVLSTree"])
            writer.writerows(zip_longest(*self.buffer, fillvalue=""))