gpudirect-tcpx/nccl-test.yaml (41 lines of code) (raw):
# Headless Service for host 1: `clusterIP: None` means no virtual IP is
# allocated. It selects the pod(s) carrying the label `name: nccl-host-1`.
apiVersion: v1
kind: Service
metadata:
  name: nccl-host-1
spec:
  clusterIP: None
  selector:
    name: nccl-host-1
---
# Headless Service for host 2: `clusterIP: None` means no virtual IP is
# allocated. It selects the pod(s) carrying the label `name: nccl-host-2`.
apiVersion: v1
kind: Service
metadata:
  name: nccl-host-2
spec:
  clusterIP: None
  selector:
    name: nccl-host-2
---
# NCCL test pod for host 1.
# Runs two containers side by side:
#   * tcpx-daemon - privileged container that starts tcpgpudmarxd.
#   * nccl-test   - restarts sshd and then idles; it is given 8 GPUs.
# The two containers share the `tcpx-socket` emptyDir volume.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-1
  labels:
    # Matches the selector of the `nccl-host-1` Service.
    name: nccl-host-1
spec:
  # The pod uses the node's network namespace. ClusterFirstWithHostNet is the
  # DNS policy Kubernetes documents for hostNetwork pods that should still
  # resolve names through cluster DNS.
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  containers:
    - name: tcpx-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9
      imagePullPolicy: Always
      command:
        - /tcpgpudmarxd/build/app/tcpgpudmarxd
        - --gpu_nic_preset
        - a3vm
        - --gpu_shmem_type
        - fd
        - --uds_path
        - /run/tcpx
        - --setup_param
        # Single-quoted so the backslashes and double quotes are passed
        # through literally, exactly as the unquoted original was.
        # NOTE(review): confirm the daemon really expects the literal \" pair.
        - '\"--verbose 128 2 0 \"'
      securityContext:
        privileged: true
      volumeMounts:
        # Host NVIDIA libraries, exposed read-only at the LD_LIBRARY_PATH
        # location set below.
        - name: libraries
          mountPath: /usr/local/nvidia/lib64
          readOnly: true
        # Same directory that is passed to --uds_path above.
        - name: tcpx-socket
          mountPath: /run/tcpx
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9
      imagePullPolicy: Always
      # Restart the ssh service, then keep the container alive indefinitely.
      command:
        - /bin/sh
        - -c
        - |
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      securityContext:
        capabilities:
          add:
            - IPC_LOCK
      volumeMounts:
        # The volume the daemon mounts at /run/tcpx appears here as /tmp.
        # NOTE(review): presumably the NCCL plugin looks for the daemon's
        # socket under /tmp - confirm against the plugin version in use.
        - name: tcpx-socket
          mountPath: /tmp
        - name: config-volume
          mountPath: /configs
      resources:
        limits:
          nvidia.com/gpu: 8
  volumes:
    # ConfigMap `nccl-configmap` is not defined in this file.
    - name: config-volume
      configMap:
        name: nccl-configmap
        # Octal file mode (rwx for everyone). Must stay an unquoted integer:
        # the field is numeric, so quoting it would make it a string.
        defaultMode: 0777
    - name: libraries
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib64
    # Explicit empty mapping instead of a bare `emptyDir:` (which is null).
    - name: tcpx-socket
      emptyDir: {}
---
# NCCL test pod for host 2.
# Runs two containers side by side:
#   * tcpx-daemon - privileged container that starts tcpgpudmarxd.
#   * nccl-test   - restarts sshd and then idles; it is given 8 GPUs.
# The two containers share the `tcpx-socket` emptyDir volume.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-2
  labels:
    # Matches the selector of the `nccl-host-2` Service.
    name: nccl-host-2
spec:
  # The pod uses the node's network namespace. ClusterFirstWithHostNet is the
  # DNS policy Kubernetes documents for hostNetwork pods that should still
  # resolve names through cluster DNS.
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  containers:
    - name: tcpx-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9
      imagePullPolicy: Always
      command:
        - /tcpgpudmarxd/build/app/tcpgpudmarxd
        - --gpu_nic_preset
        - a3vm
        - --gpu_shmem_type
        - fd
        - --uds_path
        - /run/tcpx
        - --setup_param
        # Single-quoted so the backslashes and double quotes are passed
        # through literally, exactly as the unquoted original was.
        # NOTE(review): confirm the daemon really expects the literal \" pair.
        - '\"--verbose 128 2 0 \"'
      securityContext:
        privileged: true
      volumeMounts:
        # Host NVIDIA libraries, exposed read-only at the LD_LIBRARY_PATH
        # location set below.
        - name: libraries
          mountPath: /usr/local/nvidia/lib64
          readOnly: true
        # Same directory that is passed to --uds_path above.
        - name: tcpx-socket
          mountPath: /run/tcpx
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9
      imagePullPolicy: Always
      # Restart the ssh service, then keep the container alive indefinitely.
      command:
        - /bin/sh
        - -c
        - |
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      securityContext:
        capabilities:
          add:
            - IPC_LOCK
      volumeMounts:
        # The volume the daemon mounts at /run/tcpx appears here as /tmp.
        # NOTE(review): presumably the NCCL plugin looks for the daemon's
        # socket under /tmp - confirm against the plugin version in use.
        - name: tcpx-socket
          mountPath: /tmp
        - name: config-volume
          mountPath: /configs
      resources:
        limits:
          nvidia.com/gpu: 8
  volumes:
    # ConfigMap `nccl-configmap` is not defined in this file.
    - name: config-volume
      configMap:
        name: nccl-configmap
        # Octal file mode (rwx for everyone). Must stay an unquoted integer:
        # the field is numeric, so quoting it would make it a string.
        defaultMode: 0777
    - name: libraries
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib64
    # Explicit empty mapping instead of a bare `emptyDir:` (which is null).
    - name: tcpx-socket
      emptyDir: {}