Benchmarks/NVIDIA/NCCLBandwidth.py (84 lines of code) (raw):

import csv
import json
import os
import subprocess

from prettytable import PrettyTable

from Infra import tools


class NCCLBandwidth:
    """Build NCCL and nccl-tests from source and record AllReduce bandwidth.

    Typical use is ``build()`` followed by ``run()``. ``build()`` leaves the
    process working directory inside ``nccl-tests``; ``run()`` relies on that
    because it invokes ``./build/all_reduce_perf`` and writes its CSV to
    ``../Outputs`` using relative paths.
    """

    def __init__(self, path: str, machine: str):
        """Load this benchmark's section from the JSON config at *path*.

        Args:
            path: Path to a JSON file containing a ``"NCCLBandwidth"`` entry.
            machine: Machine label used in the name of the output CSV file.

        Raises:
            KeyError: If the config has no ``"NCCLBandwidth"`` entry or the
                entry lacks ``inputs.start`` / ``inputs.end`` /
                ``inputs.num_gpus``.
        """
        self.name = 'NCCLBandwidth'
        self.machine_name = machine
        config = self.get_config(path)
        # NOTE: these three values are read from the config but are not
        # consulted by run() in this file, which uses fixed sizes (8 B .. 8 G)
        # and the GPU count reported by nvidia-smi.
        self.start, self.end, self.num_gpus = self.config_conversion(config)
        # After run(): [message-size labels, bandwidth values].
        self.buffer = []
        self.algo = "NVLS"

    def get_config(self, path: str):
        """Return the config section keyed by ``self.name`` from JSON *path*.

        Raises:
            KeyError: If the file has no entry for ``self.name``.
        """
        # The context manager closes the file even when json.load() raises.
        with open(path) as file:
            data = json.load(file)
        try:
            return data[self.name]
        except KeyError as err:
            raise KeyError("no value found") from err

    def parse_json(self, config):
        """Return ``(start, end, num_gpus)`` from ``config['inputs']``."""
        inputs = config['inputs']
        return inputs['start'], inputs['end'], inputs['num_gpus']

    def config_conversion(self, config) -> tuple[list, list, list]:
        """Convert a config section to ``(start, end, num_gpus)``."""
        return self.parse_json(config)

    @staticmethod
    def _export_nccl_env(nccl_build: str) -> None:
        """Expose the local NCCL build to this process and its children.

        Running ``export VAR=...`` through ``subprocess.run(shell=True)`` only
        changes a short-lived subshell, so the variables are set through
        ``os.environ`` instead; child processes started later inherit them.
        Nothing is exported when the build has not produced a ``lib``
        directory.
        """
        lib_dir = os.path.join(nccl_build, 'lib')
        if not os.path.isdir(lib_dir):
            return
        os.environ['NCCL_HOME'] = nccl_build
        existing = os.environ.get('LD_LIBRARY_PATH', '')
        if lib_dir in existing.split(os.pathsep):
            return  # already exported by an earlier build() call
        # Avoid a trailing separator: an empty entry in LD_LIBRARY_PATH is
        # treated as the current directory by the dynamic loader.
        os.environ['LD_LIBRARY_PATH'] = (
            lib_dir + os.pathsep + existing if existing else lib_dir
        )

    def build(self):
        """Clone and build NCCL and nccl-tests under the current directory.

        Each checkout is only cloned and built when its directory does not
        exist yet. On return the process working directory is the
        ``nccl-tests`` checkout (``run()`` depends on this).

        Raises:
            FileNotFoundError: If a clone failed and the checkout directory
                is therefore missing.
        """
        current = os.getcwd()

        nccl_dir = os.path.join(current, 'nccl')
        if not os.path.isdir(nccl_dir):
            print("Building NCCL Library...")
            subprocess.run(
                ['git', 'clone', 'https://github.com/NVIDIA/nccl.git', nccl_dir],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # cwd= keeps this process in `current`, so nothing has to be
            # restored if the build step fails.
            results = subprocess.run(
                'make -j src.build', shell=True, cwd=nccl_dir,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            tools.write_log(tools.check_error(results))

        # Must happen before nccl-tests is built or run so that both pick up
        # the locally built library.
        self._export_nccl_env(os.path.join(current, 'nccl', 'build'))

        tests_dir = os.path.join(current, 'nccl-tests')
        needs_build = not os.path.isdir(tests_dir)
        if needs_build:
            print("Building NCCL Test..")
            subprocess.run(
                ['git', 'clone', 'https://github.com/NVIDIA/nccl-tests.git', tests_dir],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Deliberately stay in nccl-tests afterwards; see the docstring.
        os.chdir(tests_dir)
        if needs_build:
            results = subprocess.run(
                ['make'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            tools.write_log(tools.check_error(results))

    def run(self):
        """Run ``all_reduce_perf`` on every visible GPU and save the results.

        Must be called with the working directory set to the ``nccl-tests``
        checkout (as ``build()`` leaves it). Prints a results table, stores
        ``[size labels, bandwidth values]`` in ``self.buffer`` and writes the
        CSV through ``save()``. If the benchmark reports fewer rows than
        expected, the table is truncated to the rows that were produced and a
        warning is printed.
        """
        current = os.getcwd()
        # Labels for the 31 message sizes produced by `-b 8 -e 8G -f 2`.
        sizes = ["8 ", "16 ", "32 ", "64 ", "128 ", "256 ", "512 ", "1K", "2K",
                 "4K", "8K", "16K", "32K", "65K", "132K", "256K", "524K", "1M",
                 "2M", "4M", "8M", "16M", "33M", "67M", "134M", "268M", "536M",
                 "1G", "2G", "4G", "8G"]

        # One line of nvidia-smi output per GPU; `wc -l` yields the count.
        num_gpus = subprocess.run(
            "nvidia-smi --query-gpu=name --format=csv,noheader | wc -l",
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        ).stdout.decode('utf-8').strip()
        if num_gpus == '4':
            self.algo = "Ring"

        print("Running NCCL AllReduce on " + num_gpus + " GPUs")
        results = subprocess.run(
            'NCCL_ALGO=' + self.algo + ' ./build/all_reduce_perf -b 8 -e 8G -f 2 -g '
            + num_gpus + ' -n 40 | grep float',
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        tools.write_log(tools.check_error(results))

        # Keep field 11 of every 13-field result row.
        # NOTE(review): presumably the in-place bus bandwidth column of the
        # nccl-tests output -- confirm against the nccl-tests documentation.
        log = []
        for line in results.stdout.decode('utf-8').split('\n'):
            fields = line.split()
            if len(fields) == 13:
                log.append(fields[11])

        if len(log) != len(sizes):
            print("Warning: expected " + str(len(sizes)) + " result rows but parsed "
                  + str(len(log)) + "; output is truncated to the rows available")
            rows = min(len(sizes), len(log))
            sizes, log = sizes[:rows], log[:rows]

        table1 = PrettyTable()
        table1.add_column("Message Size", sizes)
        table1.add_column("Bandwidth (" + self.algo + ")", log)
        print(table1)

        self.buffer = [sizes, log]
        self.save()
        # This method never changes directory itself; kept for parity with the
        # other steps that restore the starting directory.
        os.chdir(current)

    def save(self):
        """Write ``self.buffer`` to ``../Outputs/NCCLBandwidth_<machine>.csv``.

        The path is relative to the current working directory. Only the
        header row is written when no results have been collected yet.
        """
        os.makedirs('../Outputs', exist_ok=True)
        # newline='' lets the csv module control line endings itself.
        with open('../Outputs/NCCLBandwidth_' + self.machine_name + '.csv', 'w',
                  newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow(["Message Size", "Bandwidth (" + self.algo + ")"])
            if len(self.buffer) >= 2:
                # zip() stops at the shorter column instead of raising.
                writer.writerows(zip(self.buffer[0], self.buffer[1]))