perfkitbenchmarker/linux_packages/tcpxo.py (48 lines of code) (raw):

"""Module containing GPUDirect-TCPXO installation.""" import time def Install(vm): """Install TCPXO on the VM.""" # Install tcpxo, based on: # https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/receive-data-path-manager-mega vm.RemoteCommand('sudo modprobe import-helper') vm.RemoteCommand('gcloud auth configure-docker --quiet us-docker.pkg.dev') vm.RemoteCommand( 'docker run --rm --name nccl-installer --network=host ' '-v /var/lib:/var/lib ' 'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/' 'nccl-plugin-gpudirecttcpx-dev:v1.0.4 install' ) vm.RemoteCommand( 'sudo sed -i ' '"s|NCCL_FASTRAK_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8|' 'NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,' 'enp141s0,enp142s0|g" ' '/var/lib/tcpxo/lib64/nccl-env-profile.sh' ) vm.RemoteCommand( 'sudo sed -i "s|NCCL_SOCKET_IFNAME=eth0|NCCL_SOCKET_IFNAME=enp0s12|g" ' '/var/lib/tcpxo/lib64/nccl-env-profile.sh' ) vm.RemoteCommand( 'sudo sed -i "s|NCCL_FASTRAK_CTRL_DEV=eth0|' 'NCCL_FASTRAK_CTRL_DEV=enp0s12|g" ' '/var/lib/tcpxo/lib64/nccl-env-profile.sh' ) vm.RemoteCommand( 'echo "export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices"' ' | sudo tee -a /var/lib/tcpxo/lib64/nccl-env-profile.sh' ) vm.RemoteCommand( 'docker run --detach --pull=always --rm ' '--name receive-datapath-manager ' '--cap-add=NET_ADMIN ' '--network=host ' '--privileged ' '--gpus all ' '--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 ' '--volume /dev/dmabuf_import_helper:/dev/dmabuf_import_helper ' '--env LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu ' 'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/' 'tcpgpudmarxd-dev:v1.0.10 ' '--num_hops=2 --num_nics=8 --uid= --alsologtostderr' ) time.sleep(20) vm.RemoteCommand('docker logs receive-datapath-manager') def AptInstall(vm): Install(vm)