perfkitbenchmarker/linux_packages/optimize_gpu.py (91 lines of code) (raw):
"""Module containing GPU optimization logic."""
from absl import flags
from absl import logging
from perfkitbenchmarker.linux_packages import nvidia_driver
FLAGS = flags.FLAGS
_GPUS_PER_NODE = 8
TUNER = flags.DEFINE_boolean('enable_ofi_tuner', False, 'Enable aws ofi tuner')
def _CheckSupported(vm):
"""Check if the GPU type is supported."""
# TODO(yuyanting): Add support for other GPU types.
if nvidia_driver.GetGpuType(vm) not in (
nvidia_driver.NVIDIA_H100, nvidia_driver.NVIDIA_H200):
logging.warn('Skipping GPU optimization for non-H100, H200 GPU.')
return False
return True
def Install(vm):
"""Optimize GPU settings on the VM."""
if not _CheckSupported(vm):
return
# Following:
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/optimize_gpu.html
nvidia_driver.EnablePersistenceMode(vm)
if nvidia_driver.GetGpuType(vm) == nvidia_driver.NVIDIA_H100:
vm.RemoteCommand('sudo nvidia-smi -ac 2619,1980')
elif nvidia_driver.GetGpuType(vm) == nvidia_driver.NVIDIA_H200:
vm.RemoteCommand('sudo nvidia-smi -ac 3201,1980')
# Consider moving driver installation to this module.
def GetContainerMounts(vm):
if FLAGS.cloud == 'GCP' and vm.machine_type == 'a3-megagpu-8g':
return [
'/var/tmp:/var/tmp',
'/var/lib/tcpxo/lib64:/var/lib/tcpxo/lib64',
'/dev/aperture_devices:/dev/aperture_devices',
]
return []
def MountFuse(vm, bucket, path):
"""Mount fuse for the container."""
if FLAGS.cloud == 'GCP':
vm.RemoteCommand(
'sudo mount -t gcsfuse -o'
' allow_other,uid=$USER,gid=$USER,dir_mode=777,file_mode=777,implicit_dirs'
f' {bucket} {path}'
)
elif FLAGS.cloud == 'AWS':
vm.RemoteCommand(
'sudo sed -i "s/#user_allow_other/user_allow_other/g" /etc/fuse.conf'
)
vm.RemoteCommand(
'mount-s3 --allow-delete --allow-other --allow-overwrite '
f'{bucket} {path}'
)
else:
raise NotImplementedError()
def SetContainerEnv(vm):
"""Set container environment to use optimized network stack.
For AWS p5, this method assumes EFA is installed inside the container
(rather than through mount).
For GCP a3-mega, tcpxo is installed following the guide and passed to the
container through mount.
Args:
vm: VirtualMachine object.
Returns:
String of commands that can be run inside the container to set environment.
"""
if not _CheckSupported(vm):
return ''
tuner = ''
if TUNER.value:
tuner = (
'export NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/lib/libnccl-ofi-tuner.so; '
'export FI_EFA_FORK_SAFE=1; '
'export NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent,eth; '
'export NCCL_BUFFSIZE=8388608; '
'export NCCL_P2P_NET_CHUNKSIZE=524288;')
if FLAGS.cloud == 'AWS':
return (
'export LD_LIBRARY_PATH='
# pylint: disable=anomalous-backslash-in-string
'/opt/aws-ofi-nccl/lib:/opt/amazon/efa:\$LD_LIBRARY_PATH; '
# pylint: disable=anomalous-backslash-in-string
'export PATH=/opt/amazon/openmpi/bin/:\$PATH; '
'export FI_PROVIDER=efa; export FI_EFA_USE_DEVICE_RDMA=1;'
) + tuner
if FLAGS.cloud == 'GCP' and vm.machine_type == 'a3-megagpu-8g':
return (
'NCCL_LIB_DIR=/var/lib/tcpxo/lib64; '
'source /var/lib/tcpxo/lib64/nccl-env-profile.sh; '
'export NCCL_NET=FasTrak; ' # enforce using FasTrak
'export NCCL_FASTRAK_CTRL_DEV=enp0s12; '
# pylint: disable=line-too-long
'export'
' NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0; '
'export NCCL_SOCKET_IFNAME=enp0s12; '
'export NCCL_FASTRAK_USE_SNAP=1; '
'export NCCL_FASTRAK_USE_LLCM=1; '
'export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices; '
# pylint: disable=anomalous-backslash-in-string
'export LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:\$LD_LIBRARY_PATH; '
)
def _BuildGCPTopoAwareHostFile(controller, nnodes):
"""Build topo aware hostfile.
https://cloud.google.com/cluster-toolkit/docs/machine-learning/a3-mega-enable-gpudirect-tcpxo#run_nccl_test
Args:
controller: The controller VM
nnodes: The number of nodes in the cluster
"""
controller.RemoteCommand(
'srun --mpi=pmi2 -n'
f' {nnodes * _GPUS_PER_NODE} --ntasks-per-node={_GPUS_PER_NODE} bash -c'
" 'curl -s"
' "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host"'
' -H "Metadata-Flavor: Google"; echo /$SLURMD_NODENAME\' | sort -t / -s'
' -k 1,4 | awk -F "/" \'{print $NF}\' >/var/tmp/hostfile'
)
def BuildHostFile(controller, nnodes):
# pylint: disable=protected-access
if controller._RemoteFileExists('/var/tmp/hostfile'):
return
elif FLAGS.cloud == 'GCP':
_BuildGCPTopoAwareHostFile(controller, nnodes)