# perfkitbenchmarker/linux_benchmarks/nvidia_hpl_benchmark.py

"""Runs Nvidia HPL benchmark. The test uses the sample data files within the Nvidia HPC image: HPL-dgx-{nnodes}N.dat, which requires each node having 8xH100 GPUs. Source: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks """ from absl import flags from perfkitbenchmarker import background_tasks from perfkitbenchmarker import configs from perfkitbenchmarker import data from perfkitbenchmarker import errors from perfkitbenchmarker import sample from perfkitbenchmarker.linux_packages import cuda_toolkit from perfkitbenchmarker.linux_packages import nvidia_driver from perfkitbenchmarker.linux_packages import optimize_gpu from perfkitbenchmarker.linux_packages import slurm BENCHMARK_NAME = 'nvidia_hpl' BENCHMARK_CONFIG = """ nvidia_hpl: description: Runs nvidia hpl benchmark. vm_groups: default: vm_spec: GCP: machine_type: a3-megagpu-8g gpu_count: 8 gpu_type: h100 zone: us-east1-d boot_disk_size: 1000 AWS: machine_type: p5.48xlarge zone: us-east-1 boot_disk_size: 1000 disk_spec: *default_500_gb vm_count: null flags: placement_group_style: closest_supported scratch_dir: /mnt/localssd data_disk_type: local preprovision_ignore_checksum: True gce_num_local_ssds: 16 gce_ssd_interface: NVME gcloud_scopes: https://www.googleapis.com/auth/devstorage.read_write,cloud-platform """ FLAGS = flags.FLAGS def CheckPrerequisites(_): """Perform flag checks.""" if FLAGS.cloud == 'GCP' and not FLAGS.image_project: raise errors.Benchmarks.UnsupportedConfigError( '--image_project is required. Please follow' ' https://cloud.google.com/cluster-toolkit/docs/deploy/deploy-a3-mega-cluster' ' to build your own image.' ) def GetConfig(user_config): """Load and return benchmark config. Args: user_config: user supplied configuration (flags and config file) Returns: loaded benchmark configuration """ return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) def _PrepareNvidiaHPL(vm): """Install packages and configure VM.""" vm.Install('nvidia_hpc') nvidia_driver.EnablePersistenceMode(vm) vm.RemoteCommand('sudo mount -o remount,size=75% /run') vm.RemoteCommand( 'echo "FROM nvcr.io/nvidia/hpc-benchmarks:24.09" >> Dockerfile') vm.RemoteCommand( 'echo "WORKDIR /workspace" >> Dockerfile') vm.UpdateDockerfile('Dockerfile') # TODO(yuyanting): Figure out proper math for other node counts # 128 GPUs dat from Sam Skillman vm.RemoteCopy( data.ResourcePath('HPL-H200-128GPUs.dat') ) vm.RemoteCommand( 'echo "COPY HPL-H200-128GPUs.dat ' '/workspace/hpl-linux-x86_64/sample-dat/HPL-H200-128GPUs.dat"' ' >> Dockerfile') vm.RemoteCommand('docker build --network=host -t pkb-hpc-image .') def Prepare(benchmark_spec): """Install and setup Nvidia HPL benchmark. Args: benchmark_spec: The benchmark specification. 
""" vms = benchmark_spec.vms background_tasks.RunThreaded(_PrepareNvidiaHPL, vms) slurm.ConfigureSlurm(vms) background_tasks.RunThreaded(optimize_gpu.Install, vms) optimize_gpu.BuildHostFile(vms[0], len(benchmark_spec.vms)) def _CreateMetadata(vms, result_line_parts): """Constructing benchmark metadata.""" metadata = dict() metadata.update(cuda_toolkit.GetMetadata(vms[0])) metadata['num_nodes'] = len(vms) gpus_per_node = FLAGS.hpcg_gpus_per_node or nvidia_driver.QueryNumberOfGpus( vms[0] ) metadata['cpus_per_rank'] = int(vms[0].NumCpusForBenchmark() / gpus_per_node) metadata['gpus_per_node'] = gpus_per_node metadata['total_gpus'] = gpus_per_node * len(vms) metadata['N'] = float(result_line_parts[1]) metadata['NB'] = float(result_line_parts[2]) metadata['P'] = float(result_line_parts[3]) metadata['Q'] = float(result_line_parts[4]) metadata['time'] = float(result_line_parts[5]) metadata['gflops_per_gpu'] = float(result_line_parts[8][:-1]) return metadata def Run(benchmark_spec): """Runs Nvidia HPL benchmark. Sample output: ============================================================================== T/V N NB P Q Time Gflops ( per GPU) ------------------------------------------------------------------------------ WC0 376832 1024 4 4 85.42 4.176e+05 ( 2.610e+04) Args: benchmark_spec: The benchmark specification. Returns: A list of sample.Sample objects. """ samples = [] controller = benchmark_spec.vms[0] gpus_per_node = FLAGS.hpcg_gpus_per_node or nvidia_driver.QueryNumberOfGpus( benchmark_spec.vms[0]) provider_env = optimize_gpu.SetContainerEnv(controller) hpl_command = '' if nvidia_driver.GetGpuType(controller) == nvidia_driver.NVIDIA_H100: hpl_dat = f'HPL-dgx-{len(benchmark_spec.vms)}N.dat' elif nvidia_driver.GetGpuType(controller) == nvidia_driver.NVIDIA_H200: hpl_dat = f'HPL-H200-{len(benchmark_spec.vms) * 8}GPUs.dat' if len(benchmark_spec.vms) > 4 and len(benchmark_spec.vms) != 16: raise ValueError( f'Unsupported number of nodes: {len(benchmark_spec.vms)}' ) else: raise ValueError( f'Unsupported GPU type: {nvidia_driver.GetGpuType(controller)}' ) hpl_command += ( f'./hpl.sh --dat /workspace/hpl-linux-x86_64/sample-dat/{hpl_dat}' ) # pylint: disable=protected-access hostfile = controller._RemoteFileExists('/var/tmp/hostfile') if hostfile: hostfile_arg = 'export SLURM_HOSTFILE=/var/tmp/hostfile; ' slurm_args = '' else: hostfile_arg = '' slurm_args = f'-N {len(benchmark_spec.vms)} ' mount_args = ','.join(optimize_gpu.GetContainerMounts(controller)) if mount_args: slurm_args += f'--container-mounts="{mount_args}" ' stdout, _ = controller.RemoteCommand( f'{hostfile_arg}' 'export TMPDIR=/tmp; ' 'export NCCL_DEBUG=INFO; ' 'export HPL_FCT_COMM_POLICY=1; ' 'export HPL_P2P_AS_BCAST=0; ' 'export HPL_USE_NVSHMEM=0; ' 'export NVSHMEM_DISABLE_CUDA_VMM=1; ' 'export OMPI_MCA_pml="ucx"; ' 'export UCX_MAX_RNDV_RAILS=8; ' # environment variables to use f'srun ' f'--ntasks-per-node {gpus_per_node} ' '--cpus-per-task ' f'{int(controller.NumCpusForBenchmark() / gpus_per_node)} ' '--cpu-bind=none --mpi=pmi2 ' '--container-image="dockerd://pkb-hpc-image" ' f'{slurm_args} bash -c "{provider_env} {hpl_command}"' ) lines = stdout.splitlines() result_line_idx = None for line_idx, line in enumerate(lines): if line.startswith( 'T/V N NB P Q Time Gflops ' '( per GPU)'): result_line_idx = line_idx + 2 break if not result_line_idx: raise ValueError('Failed to find result line.') else: result_line = lines[result_line_idx] result_line_parts = result_line.split() samples.append( sample.Sample( 'HPL Throughput', 


def Cleanup(_):
  """Cleanup Nvidia HPL."""
  pass
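
# Example invocation (a sketch; the image project value is an assumption and
# must point at an image built per the link in CheckPrerequisites):
#   ./pkb.py --benchmarks=nvidia_hpl --cloud=GCP \
#       --image_project=<your-image-project>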