# perfkitbenchmarker/linux_benchmarks/nvidia_hpc_benchmark.py
"""Runs Nvidia HPC benchmark.
Source:
https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks
"""
from absl import flags
from perfkitbenchmarker import background_tasks
from perfkitbenchmarker import configs
from perfkitbenchmarker import regex_util
from perfkitbenchmarker import sample
from perfkitbenchmarker.linux_packages import cuda_toolkit
from perfkitbenchmarker.linux_packages import nvidia_driver
from perfkitbenchmarker.linux_packages import slurm
NVIDIA_HPC = 'nvcr.io#nvidia/hpc-benchmarks:23.10'
THROUGHPUT_REGEX = (
r'Final Summary::HPCG result is VALID with a GFLOP/s rating of=(.*)')
BENCHMARK_NAME = 'nvidia_hpc'
BENCHMARK_CONFIG = """
nvidia_hpc:
description: Runs Nvidia HPC.
vm_groups:
default:
vm_spec:
GCP:
machine_type: g2-standard-4
gpu_count: 1
gpu_type: l4
zone: us-east1-d
boot_disk_size: 1000
AWS:
machine_type: g5.xlarge
zone: us-east-1
boot_disk_size: 1000
Azure:
machine_type: Standard_NC6
zone: eastus
boot_disk_size: 1000
disk_spec: *default_500_gb
vm_count: null
flags:
placement_group_style: closest_supported
scratch_dir: /mnt/localssd
data_disk_type: local
"""
FLAGS = flags.FLAGS
def GetConfig(user_config):
  """Merge user-supplied overrides into the default benchmark config.

  Args:
    user_config: user supplied configuration (flags and config file)

  Returns:
    loaded benchmark configuration
  """
  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
def _PrepareNvidiaHPL(vm):
  """Install packages and configure VM.

  Installs the nvidia_hpc package, enables GPU persistence mode, and
  enlarges the /run tmpfs.

  Args:
    vm: the virtual machine to prepare.
  """
  vm.Install('nvidia_hpc')
  nvidia_driver.EnablePersistenceMode(vm)
  # Remount /run at 75% of RAM -- presumably so the container runtime has
  # room to unpack the HPC image there; TODO confirm against enroot docs.
  vm.RemoteCommand('sudo mount -o remount,size=75% /run')
def Prepare(benchmark_spec):
  """Install and setup Nvidia HPL benchmark.

  Prepares every VM in parallel, then configures Slurm across all VMs so
  that Run() can launch the benchmark with srun.

  Args:
    benchmark_spec: The benchmark specification.
  """
  vms = benchmark_spec.vms
  background_tasks.RunThreaded(_PrepareNvidiaHPL, vms)
  slurm.ConfigureSlurm(vms)
def _CreateMetadata(vms):
  """Build the metadata dict describing the cluster and HPCG run settings."""
  head = vms[0]
  metadata = dict(cuda_toolkit.GetMetadata(head))
  metadata['num_nodes'] = len(vms)
  # Flag override wins; otherwise ask the driver how many GPUs are present.
  gpus_per_node = (
      FLAGS.hpcg_gpus_per_node or nvidia_driver.QueryNumberOfGpus(head))
  metadata['cpus_per_rank'] = int(head.NumCpusForBenchmark() / gpus_per_node)
  metadata['gpus_per_node'] = gpus_per_node
  metadata['total_gpus'] = gpus_per_node * len(vms)
  metadata['runtime'] = FLAGS.hpcg_runtime
  metadata['problem_size'] = ','.join(str(n) for n in FLAGS.hpcg_problem_size)
  return metadata
def _RunHpcg(controller, num_nodes, tasks_per_node, cpus_per_task, nx, ny, nz):
  """Launch one HPCG run via srun and return the reported GFLOP/s.

  Args:
    controller: VM to issue the srun command from.
    num_nodes: number of nodes (-N) to run on.
    tasks_per_node: MPI ranks per node (one per GPU used).
    cpus_per_task: CPUs bound to each rank.
    nx: problem size in x.
    ny: problem size in y.
    nz: problem size in z.

  Returns:
    The GFLOP/s rating parsed from the HPCG final summary.
  """
  stdout, _ = controller.RobustRemoteCommand(
      'srun '
      f'-N {num_nodes} '
      f'--ntasks-per-node {tasks_per_node} '
      f'--cpus-per-task {cpus_per_task} '
      # Public instruction uses pmix, but need extra work.
      '--cpu-bind=none --mpi=pmi2 '
      f'--container-image="{NVIDIA_HPC}" '
      f'./hpcg.sh --nx {nx} --ny {ny} --nz {nz} --rt {FLAGS.hpcg_runtime}')
  return regex_util.ExtractFloat(THROUGHPUT_REGEX, stdout)


def Run(benchmark_spec):
  """Runs Nvidia HPL benchmark.

  Runs HPCG across all nodes using every GPU; on a single multi-GPU node
  it additionally measures single-GPU throughput for comparison.

  Args:
    benchmark_spec: The benchmark specification.

  Returns:
    A list of sample.Sample objects.
  """
  samples = []
  vms = benchmark_spec.vms
  controller = vms[0]
  gpus_per_node = FLAGS.hpcg_gpus_per_node or nvidia_driver.QueryNumberOfGpus(
      controller)
  nx, ny, nz = FLAGS.hpcg_problem_size
  throughput = _RunHpcg(
      controller, len(vms), gpus_per_node,
      int(controller.NumCpusForBenchmark() / gpus_per_node), nx, ny, nz)
  samples.append(
      sample.Sample(
          'HPCG Throughput', throughput, 'Gflops', _CreateMetadata(vms)))
  if gpus_per_node > 1 and len(vms) == 1:
    # Measure per-GPU performance with a single rank owning all CPUs.
    metadata = _CreateMetadata(vms)
    metadata['cpus_per_task'] = controller.NumCpusForBenchmark()
    metadata['gpus_per_node'] = 1
    throughput = _RunHpcg(
        controller, len(vms), 1, controller.NumCpusForBenchmark(), nx, ny, nz)
    samples.append(
        sample.Sample('HPCG Throughput', throughput, 'Gflops', metadata))
  return samples
def Cleanup(_):
  """Cleanup Nvidia HPL.

  Nothing to tear down here; all state lives on the VMs, which are
  deleted after the run.
  """