# fast-socket-installer/fast-socket-installer.yaml

---
# DaemonSet whose init container copies libnccl-net.so from the installer
# image into a directory on the host, after which a pause container runs
# as the pod's only regular container.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nccl-fastsocket-manual-installer
  namespace: kube-system
  labels:
    k8s-app: nccl-fastsocket-manual-installer
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      k8s-app: nccl-fastsocket-manual-installer
  template:
    metadata:
      labels:
        k8s-app: nccl-fastsocket-manual-installer
        name: nccl-fastsocket-manual-installer
    spec:
      hostNetwork: true
      hostPID: true
      # An operator-only toleration (no key) matches every taint.
      tolerations:
        - operator: Exists
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions sit in one term, so a node must carry
              # both labels (any value) to be eligible.
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
                  - key: cloud.google.com/gke-nccl-fastsocket
                    operator: Exists
      volumes:
        # Host directory that receives the library; created if missing.
        - name: nvidia-install-lib64-dir-host
          hostPath:
            type: DirectoryOrCreate
            path: /home/kubernetes/bin/nvidia/lib64
      initContainers:
        - name: nccl-fastsocket-installer
          image: gcr.io/gke-release/fastsocket-installer@sha256:cb8dca70b5611769fd2e0e8eb9aebf81a89d4378537cff104775c873abf2d9c5
          securityContext:
            privileged: true
          env:
            # Must match the mountPath of the host volume below.
            - name: NCCL_INSTALL_DIR
              value: /usr/local/nvidia/lib64
          volumeMounts:
            - name: nvidia-install-lib64-dir-host
              mountPath: /usr/local/nvidia/lib64
          resources:
            limits:
              memory: 100Mi
          # Copy the library from the image into the mounted host directory.
          command:
            - bash
            - -c
            - |
              cp /usr/lib/libnccl-net.so $NCCL_INSTALL_DIR/
      containers:
        # NOTE(review): presumably here only to keep the pod running once the
        # init container has finished - confirm.
        - name: pause
          image: gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830