# perfkitbenchmarker/linux_benchmarks/nccl_tcpxo_benchmark.py
"""Run NCCL benchmarks with GPUDirect-TCPXO framework."""
import time
from absl import flags
from perfkitbenchmarker import configs
from perfkitbenchmarker import errors
from perfkitbenchmarker.linux_benchmarks import nccl_benchmark
from perfkitbenchmarker.linux_packages import optimize_gpu
from perfkitbenchmarker.linux_packages import slurm
FLAGS = flags.FLAGS
BENCHMARK_NAME = 'nccl_tcpxo'
BENCHMARK_CONFIG = """
nccl_tcpxo:
  description: Runs NCCL Benchmark with TCPXO.
  vm_groups:
    default:
      vm_count: null
      disk_spec: *default_500_gb  # Store enroot images
      os_type: debian12_dl
      vm_spec:
        GCP:
          machine_type: a3-megagpu-8g
          zone: us-central1-a
          # User should build their own image following:
          # https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml
          image_family: slurm-a3mega
          boot_disk_size: 500
  flags:
    placement_group_style: closest_supported
    gcloud_scopes: cloud-platform
    scratch_dir: /mnt/localssd
"""
_IMAGE = 'docker://nvcr.io#nvidia/pytorch:24.04-py3'
# enroot import names the squashed image after the container reference
# ('/' and ':' replaced with '+'), which yields the file below.
_SQSH_IMAGE = './nvidia+pytorch+24.04-py3.sqsh'
_GPUS_PER_NODE = 8  # This benchmark works specifically for a3-megagpu-8g.


def CheckPrerequisites(_):
  """Perform flag checks."""
  if not FLAGS.image_project:
    raise errors.Benchmarks.UnsupportedConfigError(
        '--image_project is required. Please follow'
        ' https://cloud.google.com/cluster-toolkit/docs/deploy/deploy-a3-mega-cluster'
        ' to build your own image.'
    )


def GetConfig(user_config):
  """Load and return benchmark config.

  Args:
    user_config: user supplied configuration (flags and config file)

  Returns:
    loaded benchmark configuration
  """
  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)


def Prepare(benchmark_spec):
  """Install and set up NCCL on the cluster VMs.

  Args:
    benchmark_spec: The benchmark specification
  """
  benchmark_spec.always_call_cleanup = True
  nnodes = len(benchmark_spec.vms)
  for vm in benchmark_spec.vms:
    vm.AuthenticateVm()
    vm.Install('optimize_gpu')
  slurm.ConfigureSlurm(benchmark_spec.vms)
  controller = benchmark_spec.vms[0]
  # Import the NGC PyTorch container on every node; enroot writes the squashed
  # image (_SQSH_IMAGE) into the working directory.
  controller.RemoteCommand(f'srun -N {nnodes} enroot import {_IMAGE}')
  controller.RemoteCommand(
      f'srun -N {nnodes} git clone https://github.com/NVIDIA/nccl-tests.git'
  )
  # Build nccl-tests with MPI support inside the container so the binaries
  # link against the container's CUDA, NCCL and MPI toolchain.
  controller.RemoteCommand(
      f'srun -N {nnodes} '
      '--container-mounts="$PWD:/nccl" '
      f'--container-image="{_SQSH_IMAGE}" '
      'bash -c "cd /nccl/nccl-tests/ && MPI=1 CC=mpicc CXX=mpicxx make -j"'
  )
  optimize_gpu.BuildHostFile(controller, len(benchmark_spec.vms))


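# Run() depends on the artifacts left by Prepare(): the imported .sqsh image,
# the nccl-tests binaries under ./nccl-tests/build, and a Slurm hostfile
# (assumed to be written by optimize_gpu.BuildHostFile to /var/tmp/hostfile,
# the path exported as SLURM_HOSTFILE below).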
def Run(benchmark_spec):
  """Run NCCL on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
      required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  controller = benchmark_spec.vms[0]
  samples = []
  # Mount the working directory (for the nccl-tests binaries) along with any
  # additional mounts optimize_gpu requires inside the container.
  mount_path = ','.join(
      ['$PWD:/nccl'] + optimize_gpu.GetContainerMounts(benchmark_spec.vms[0])
  )
  for operation in FLAGS.nccl_operations:
    # Launch one task per GPU on every node and run the matching nccl-tests
    # binary (e.g. all_reduce_perf) inside the container.
    stdout, _ = controller.RemoteCommand(
        'export SLURM_HOSTFILE=/var/tmp/hostfile; '
        'srun '
        '--mpi=pmi2 '
        f'--ntasks-per-node={_GPUS_PER_NODE} '
        f'--container-image="{_SQSH_IMAGE}" '
        f'--container-mounts="{mount_path}" '
        'bash -c "'
        'export NCCL_DEBUG=INFO; '
        f'{optimize_gpu.SetContainerEnv(benchmark_spec.vms[0])} '
        f'/nccl/nccl-tests/build/{operation}_perf '
        f'--minbytes {FLAGS.nccl_minbytes} '
        f'--maxbytes {FLAGS.nccl_maxbytes} '
        f'--stepfactor {FLAGS.nccl_stepfactor} '
        f'--ngpus {FLAGS.nccl_ngpus} '
        f'--check {int(FLAGS.nccl_check)} '
        f'--nthreads {FLAGS.nccl_nthreads} '
        f'--iters {FLAGS.nccl_iters};"'
    )
    samples.extend(
        nccl_benchmark.MakeSamplesFromOutput(
            {
                'slots': _GPUS_PER_NODE,
                'operation': operation,
                'minbytes': FLAGS.nccl_minbytes,
                'maxbytes': FLAGS.nccl_maxbytes,
                'nccl_net_plugin': 'TCPFastrak',
            },
            stdout,
        )[0]
    )
    time.sleep(FLAGS.nccl_seconds_between_runs)
  return samples


def Cleanup(unused_benchmark_spec):
  """Cleanup NCCL on the cluster.

  Args:
    unused_benchmark_spec: The benchmark specification. Contains all data that
      is required to run the benchmark.
  """
  pass