# gpudirect-tcpxo/nccl-tcpxo-installer.yaml (94 lines of code) (raw)
# DaemonSet that prepares nodes carrying the nvidia-h100-mega-80gb accelerator
# label: two init containers run in order (host preparation, then the NCCL
# plugin install), after which the pod idles in a pause container.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nccl-tcpxo-installer
  namespace: kube-system
  labels:
    k8s-app: nccl-tcpxo-installer
spec:
  selector:
    matchLabels:
      k8s-app: nccl-tcpxo-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nccl-tcpxo-installer
        k8s-app: nccl-tcpxo-installer
    spec:
      priorityClassName: system-node-critical
      # Schedule only onto nodes whose gke-accelerator label is
      # nvidia-h100-mega-80gb.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h100-mega-80gb
      # A key-less "Exists" toleration matches every taint.
      tolerations:
        - operator: "Exists"
      hostNetwork: true
      # hostPID is needed so that PID 1 below refers to the host's init process.
      hostPID: true
      volumes:
        - name: var-lib
          hostPath:
            path: /var/lib
        # NOTE(review): no container in this manifest mounts the tcpxo volume;
        # confirm whether it is still needed.
        - name: tcpxo
          hostPath:
            path: /var/lib/tcpxo
        - name: library-dir-host
          hostPath:
            path: /home/kubernetes/bin
      initContainers:
        # Step 1: run a script inside the host's namespaces (nsenter -a -t 1),
        # so every binary it calls, including `sh`, is the host's own.
        # NOTE(review): the image tag is unpinned; consider pinning a version.
        - image: "ubuntu"
          name: pre-installation
          securityContext:
            privileged: true
          command:
            - nsenter
            - -at
            - '1'
            - --
            - sh
            - -c
            - |
              /sbin/iptables -I INPUT -p tcp -m tcp -j ACCEPT && modprobe import-helper
              sudo mkdir -p /dev/aperture_devices
              # Bind-mount the sysfs directory of every PCI device whose
              # `lspci -nn` line contains 1ae0:0084 under /dev/aperture_devices/<BDF>.
              # A plain pipeline feeds the loop so only POSIX sh features are needed.
              lspci -nn -D | grep '1ae0:0084' | while IFS= read -r line; do
                BDF=$(echo "$line" | awk '{print $1}')
                target_aperture_path="/dev/aperture_devices/$BDF"
                host_aperture_device=$(readlink -f "/sys/bus/pci/devices/$BDF")
                sudo mkdir -p "$target_aperture_path"
                # Drop any mount left by an earlier run; an error here (nothing
                # mounted yet) is not fatal because the script does not use `set -e`.
                sudo umount -R "$target_aperture_path"
                sudo mount --bind "$host_aperture_device" "$target_aperture_path"
              done
              if [ -d /dev/aperture_devices ]; then
                chmod -R a+r /dev/aperture_devices/
                chmod a+rw /dev/aperture_devices/*/resource*
              fi
        # Step 2: run the plugin image's install entry point, then copy
        # /var/lib/tcpxo/lib64 into /usr/local/nvidia/lib64. /usr/local is the
        # host's /home/kubernetes/bin, so the copy lands on the host.
        - name: nccl-tcpxo-installer
          image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1
          resources:
            requests:
              cpu: 150m
          securityContext:
            privileged: true
          volumeMounts:
            - name: var-lib
              mountPath: /var/lib
            - name: library-dir-host
              mountPath: /usr/local
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -ex
              chmod 755 /scripts/container_entry.sh
              # NOTE(review): presumably this populates /var/lib/tcpxo/lib64,
              # which the cp below reads; confirm against the image's script.
              /scripts/container_entry.sh install --install-nccl
              mkdir -p /usr/local/nvidia/lib64
              cp -r /var/lib/tcpxo/lib64/. /usr/local/nvidia/lib64
              echo "installation finishes"
      # The only long-running container: a pause image pinned by digest that
      # keeps the pod alive after the init containers complete.
      containers:
        - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
          name: pause