# perfkitbenchmarker/linux_benchmarks/nccl_tcpxo_benchmark.py
"""Run NCCL benchmarks with GPUDirect-TCPXO framework."""
import time
from absl import flags
from perfkitbenchmarker import configs
from perfkitbenchmarker import errors
from perfkitbenchmarker.linux_benchmarks import nccl_benchmark
from perfkitbenchmarker.linux_packages import optimize_gpu
from perfkitbenchmarker.linux_packages import slurm
FLAGS = flags.FLAGS
BENCHMARK_NAME = 'nccl_tcpxo'
BENCHMARK_CONFIG = """
nccl_tcpxo:
  description: Runs NCCL Benchmark with TCPXO.
  vm_groups:
    default:
      vm_count: null
      disk_spec: *default_500_gb  # Store enroot images
      os_type: debian12_dl
      vm_spec:
        GCP:
          machine_type: a3-megagpu-8g
          zone: us-central1-a
          # User should build their own image following:
          # https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml
          image_family: slurm-a3mega
          boot_disk_size: 500
  flags:
    placement_group_style: closest_supported
    gcloud_scopes: cloud-platform
    scratch_dir: /mnt/localssd
"""
_IMAGE = 'docker://nvcr.io#nvidia/pytorch:24.04-py3'
# enroot import names the squashed image after the container reference
# ('/' and ':' replaced with '+'), which yields the file below.
_SQSH_IMAGE = './nvidia+pytorch+24.04-py3.sqsh'
_GPUS_PER_NODE = 8  # This benchmark works specifically for a3-megagpu-8g.


def CheckPrerequisites(_):
  """Perform flag checks."""
  if not FLAGS.image_project:
    raise errors.Benchmarks.UnsupportedConfigError(
        '--image_project is required. Please follow'
        ' https://cloud.google.com/cluster-toolkit/docs/deploy/deploy-a3-mega-cluster'
        ' to build your own image.'
    )


def GetConfig(user_config):
  """Load and return benchmark config.

  Args:
    user_config: user supplied configuration (flags and config file)

  Returns:
    loaded benchmark configuration
  """
  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)


def Prepare(benchmark_spec):
  """Install and set up NCCL on the cluster VMs.

  Args:
    benchmark_spec: The benchmark specification
  """
  benchmark_spec.always_call_cleanup = True
  nnodes = len(benchmark_spec.vms)
  for vm in benchmark_spec.vms:
    vm.AuthenticateVm()
    vm.Install('optimize_gpu')
  slurm.ConfigureSlurm(benchmark_spec.vms)
  controller = benchmark_spec.vms[0]
  # Import the NGC PyTorch container on every node; enroot writes the squashed
  # image (_SQSH_IMAGE) into the working directory.
  controller.RemoteCommand(f'srun -N {nnodes} enroot import {_IMAGE}')
  controller.RemoteCommand(
      f'srun -N {nnodes} git clone https://github.com/NVIDIA/nccl-tests.git'
  )
  # Build nccl-tests with MPI support inside the container so the binaries
  # link against the container's CUDA, NCCL and MPI toolchain.
  controller.RemoteCommand(
      f'srun -N {nnodes} '
      '--container-mounts="$PWD:/nccl" '
      f'--container-image="{_SQSH_IMAGE}" '
      'bash -c "cd /nccl/nccl-tests/ && MPI=1 CC=mpicc CXX=mpicxx make -j"'
  )
  optimize_gpu.BuildHostFile(controller, len(benchmark_spec.vms))


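# Run() depends on the artifacts left by Prepare(): the imported .sqsh image,
# the nccl-tests binaries under ./nccl-tests/build, and a Slurm hostfile
# (assumed to be written by optimize_gpu.BuildHostFile to /var/tmp/hostfile,
# the path exported as SLURM_HOSTFILE below).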
def Run(benchmark_spec):
  """Run NCCL on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  controller = benchmark_spec.vms[0]
  samples = []
  # Mount the working directory (for the nccl-tests binaries) along with any
  # additional mounts optimize_gpu requires inside the container.
  mount_path = ','.join(
      ['$PWD:/nccl'] + optimize_gpu.GetContainerMounts(benchmark_spec.vms[0])
  )
  for operation in FLAGS.nccl_operations:
    # Launch one task per GPU on every node and run the matching nccl-tests
    # binary (e.g. all_reduce_perf) inside the container.
    stdout, _ = controller.RemoteCommand(
        'export SLURM_HOSTFILE=/var/tmp/hostfile; '
        'srun '
        '--mpi=pmi2 '
        f'--ntasks-per-node={_GPUS_PER_NODE} '
        f'--container-image="{_SQSH_IMAGE}" '
        f'--container-mounts="{mount_path}" '
        'bash -c "'
        'export NCCL_DEBUG=INFO; '
        f'{optimize_gpu.SetContainerEnv(benchmark_spec.vms[0])} '
        f'/nccl/nccl-tests/build/{operation}_perf '
        f'--minbytes {FLAGS.nccl_minbytes} '
        f'--maxbytes {FLAGS.nccl_maxbytes} '
        f'--stepfactor {FLAGS.nccl_stepfactor} '
        f'--ngpus {FLAGS.nccl_ngpus} '
        f'--check {int(FLAGS.nccl_check)} '
        f'--nthreads {FLAGS.nccl_nthreads} '
        f'--iters {FLAGS.nccl_iters};"'
    )
    samples.extend(
        nccl_benchmark.MakeSamplesFromOutput(
            {
                'slots': _GPUS_PER_NODE,
                'operation': operation,
                'minbytes': FLAGS.nccl_minbytes,
                'maxbytes': FLAGS.nccl_maxbytes,
                'nccl_net_plugin': 'TCPFastrak',
            },
            stdout,
        )[0]
    )
    time.sleep(FLAGS.nccl_seconds_between_runs)
  return samples


def Cleanup(unused_benchmark_spec):
  """Cleanup NCCL on the cluster.

  Args:
    unused_benchmark_spec: The benchmark specification. Contains all data that
      is required to run the benchmark.
  """
  pass