# gpudirect-tcpxo/nccl-test.yaml (173 lines of code) (raw)
# Headless Service (clusterIP: None) for the first NCCL test host.
# Its name equals the `subdomain` of Pod nccl-test-host-1, and its selector
# matches that Pod's `name: nccl-host-1` label.
apiVersion: v1
kind: Service
metadata:
  name: nccl-host-1
spec:
  clusterIP: None
  selector:
    name: nccl-host-1
---
# Headless Service (clusterIP: None) for the second NCCL test host.
# Its name equals the `subdomain` of Pod nccl-test-host-2, and its selector
# matches that Pod's `name: nccl-host-2` label.
apiVersion: v1
kind: Service
metadata:
  name: nccl-host-2
spec:
  clusterIP: None
  selector:
    name: nccl-host-2
---
# Pod for NCCL test host 1: a `tcpxo-daemon` container plus an `nccl-test`
# container that requests 8 GPUs.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-1
  labels:
    # Selected by the headless Service `nccl-host-1`.
    name: nccl-host-1
    # Matched by the podAntiAffinity rule below.
    tcpxo: daemon
spec:
  hostname: host1
  # Same name as the headless Service `nccl-host-1`.
  subdomain: nccl-host-1
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  affinity:
    podAntiAffinity:
      # Hard scheduling requirement: do not place this Pod on a node
      # (topology key: hostname) that already runs a Pod labeled
      # `tcpxo: daemon`.
      requiredDuringSchedulingIgnoredDuringExecution:
        - topologyKey: kubernetes.io/hostname
          labelSelector:
            matchExpressions:
              - key: tcpxo
                operator: In
                values:
                  - daemon
  containers:
    - name: tcpxo-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14
      imagePullPolicy: Always
      command:
        - /bin/sh
        - -c
      # Makes the entrypoint script executable, then runs it; `set -ex`
      # aborts on the first failing command and echoes each command.
      args:
        - |
          set -ex
          chmod 755 /fts/entrypoint_rxdm_container.sh
          /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      securityContext:
        privileged: true
      volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1
      imagePullPolicy: Always
      # Writes /scripts/allgather.sh, restarts sshd, then idles forever so the
      # benchmark can be started on demand. The heredoc delimiter is unquoted:
      # ${LD_LIBRARY_PATH} is expanded when the file is written, while the
      # escaped \${@} stays literal in the generated script.
      command:
        - /bin/sh
        - -c
        - |
          cat >/scripts/allgather.sh <<EOF
          #!/bin/bash
          /scripts/init_ssh.sh \${@};
          pushd /scripts;
          /scripts/gen_hostfiles.sh \${@};
          popd;
          BENCHMARK=all_gather_perf NHOSTS=2 NCCL_LIB_DIR="${LD_LIBRARY_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          EOF
          chmod +x /scripts/allgather.sh
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      resources:
        limits:
          nvidia.com/gpu: 8
      securityContext:
        privileged: true
      volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
        - name: shared-memory
          mountPath: /dev/shm
  volumes:
    # Host directory mounted at /usr/local/nvidia in both containers.
    - name: nvidia-install-dir-host
      hostPath:
        path: /home/kubernetes/bin/nvidia
    # Memory-backed emptyDir mounted at /dev/shm in the nccl-test container.
    - name: shared-memory
      emptyDir:
        medium: Memory
        sizeLimit: 1Gi
---
# Pod for NCCL test host 2: a `tcpxo-daemon` container plus an `nccl-test`
# container that requests 8 GPUs.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-2
  labels:
    # Selected by the headless Service `nccl-host-2`.
    name: nccl-host-2
    # Matched by the podAntiAffinity rule below.
    tcpxo: daemon
spec:
  hostname: host2
  # Same name as the headless Service `nccl-host-2`.
  subdomain: nccl-host-2
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  affinity:
    podAntiAffinity:
      # Hard scheduling requirement: do not place this Pod on a node
      # (topology key: hostname) that already runs a Pod labeled
      # `tcpxo: daemon`.
      requiredDuringSchedulingIgnoredDuringExecution:
        - topologyKey: kubernetes.io/hostname
          labelSelector:
            matchExpressions:
              - key: tcpxo
                operator: In
                values:
                  - daemon
  containers:
    - name: tcpxo-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14
      imagePullPolicy: Always
      command:
        - /bin/sh
        - -c
      # Makes the entrypoint script executable, then runs it; `set -ex`
      # aborts on the first failing command and echoes each command.
      args:
        - |
          set -ex
          chmod 755 /fts/entrypoint_rxdm_container.sh
          /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      securityContext:
        privileged: true
      volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1
      imagePullPolicy: Always
      # Writes /scripts/allgather.sh, restarts sshd, then idles forever so the
      # benchmark can be started on demand. The heredoc delimiter is unquoted:
      # ${LD_LIBRARY_PATH} is expanded when the file is written, while the
      # escaped \${@} stays literal in the generated script.
      command:
        - /bin/sh
        - -c
        - |
          cat >/scripts/allgather.sh <<EOF
          #!/bin/bash
          /scripts/init_ssh.sh \${@};
          pushd /scripts;
          /scripts/gen_hostfiles.sh \${@};
          popd;
          BENCHMARK=all_gather_perf NHOSTS=2 NCCL_LIB_DIR="${LD_LIBRARY_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          EOF
          chmod +x /scripts/allgather.sh
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      resources:
        limits:
          nvidia.com/gpu: 8
      securityContext:
        privileged: true
      volumeMounts:
        # Required so that LD_LIBRARY_PATH / NCCL_LIB_DIR
        # (/usr/local/nvidia/lib64) exists inside this container; mirrors the
        # nccl-test container of Pod nccl-test-host-1.
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
        - name: shared-memory
          mountPath: /dev/shm
  volumes:
    # Host directory mounted at /usr/local/nvidia in both containers.
    - name: nvidia-install-dir-host
      hostPath:
        path: /home/kubernetes/bin/nvidia
    # Memory-backed emptyDir mounted at /dev/shm in the nccl-test container.
    - name: shared-memory
      emptyDir:
        medium: Memory
        sizeLimit: 1Gi