# gpudirect-tcpx/nccl-test.yaml

---
# Headless Service (clusterIP: None) selecting the pod labelled
# `name: nccl-host-1`.
apiVersion: v1
kind: Service
metadata:
  name: nccl-host-1
spec:
  selector:
    name: nccl-host-1
  clusterIP: None
---
# Headless Service (clusterIP: None) selecting the pod labelled
# `name: nccl-host-2`.
apiVersion: v1
kind: Service
metadata:
  name: nccl-host-2
spec:
  selector:
    name: nccl-host-2
  clusterIP: None
---
# First NCCL test pod. It runs two containers that share the `tcpx-socket`
# volume: the tcpgpudmarxd daemon and the NCCL test container.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-1
  labels:
    name: nccl-host-1
spec:
  # The pod uses the node's network namespace; ClusterFirstWithHostNet keeps
  # cluster DNS resolution available in that mode.
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  containers:
    - name: tcpx-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9
      imagePullPolicy: Always
      command:
        - /tcpgpudmarxd/build/app/tcpgpudmarxd
        - --gpu_nic_preset
        - a3vm
        - --gpu_shmem_type
        - fd
        # Socket directory; backed by the `tcpx-socket` volume mounted below.
        - --uds_path
        - /run/tcpx
        - --setup_param
        # Single-quoted so the backslashes and double quotes are passed
        # through literally, exactly as the unquoted original did.
        - '\"--verbose 128 2 0 \"'
      securityContext:
        privileged: true
      volumeMounts:
        - name: libraries
          mountPath: /usr/local/nvidia/lib64
          readOnly: true
        - name: tcpx-socket
          mountPath: /run/tcpx
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9
      imagePullPolicy: Always
      # Restart the ssh service, then sleep forever so the container stays up.
      command:
        - /bin/sh
        - -c
        - |
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      securityContext:
        capabilities:
          add:
            - IPC_LOCK
      volumeMounts:
        # Same volume the daemon mounts at /run/tcpx, exposed here at /tmp.
        - name: tcpx-socket
          mountPath: /tmp
        - name: config-volume
          mountPath: /configs
      resources:
        limits:
          nvidia.com/gpu: 8
  volumes:
    - name: config-volume
      configMap:
        name: nccl-configmap
        # Integer field written as an octal literal; do not quote it.
        defaultMode: 0777
    - name: libraries
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib64
    - name: tcpx-socket
      # Explicit empty mapping instead of a bare (null) value.
      emptyDir: {}
---
# Second NCCL test pod. Same spec as nccl-test-host-1 apart from the pod name
# and the `name` label (nccl-host-2).
apiVersion: v1
kind: Pod
metadata:
  name: nccl-test-host-2
  labels:
    name: nccl-host-2
spec:
  # The pod uses the node's network namespace; ClusterFirstWithHostNet keeps
  # cluster DNS resolution available in that mode.
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  containers:
    - name: tcpx-daemon
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9
      imagePullPolicy: Always
      command:
        - /tcpgpudmarxd/build/app/tcpgpudmarxd
        - --gpu_nic_preset
        - a3vm
        - --gpu_shmem_type
        - fd
        # Socket directory; backed by the `tcpx-socket` volume mounted below.
        - --uds_path
        - /run/tcpx
        - --setup_param
        # Single-quoted so the backslashes and double quotes are passed
        # through literally, exactly as the unquoted original did.
        - '\"--verbose 128 2 0 \"'
      securityContext:
        privileged: true
      volumeMounts:
        - name: libraries
          mountPath: /usr/local/nvidia/lib64
          readOnly: true
        - name: tcpx-socket
          mountPath: /run/tcpx
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
    - name: nccl-test
      image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9
      imagePullPolicy: Always
      # Restart the ssh service, then sleep forever so the container stays up.
      command:
        - /bin/sh
        - -c
        - |
          service ssh restart;
          sleep infinity;
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
      securityContext:
        capabilities:
          add:
            - IPC_LOCK
      volumeMounts:
        # Same volume the daemon mounts at /run/tcpx, exposed here at /tmp.
        - name: tcpx-socket
          mountPath: /tmp
        - name: config-volume
          mountPath: /configs
      resources:
        limits:
          nvidia.com/gpu: 8
  volumes:
    - name: config-volume
      configMap:
        name: nccl-configmap
        # Integer field written as an octal literal; do not quote it.
        defaultMode: 0777
    - name: libraries
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib64
    - name: tcpx-socket
      # Explicit empty mapping instead of a bare (null) value.
      emptyDir: {}

# gpudirect-tcpx/nccl-test.yaml (41 lines of code) (raw):