# fast-socket-installer/fast-socket-installer.yaml
---
# DaemonSet that runs a one-shot init container on each matching node to copy
# libnccl-net.so out of the installer image into a directory on the host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nccl-fastsocket-manual-installer
  namespace: kube-system
  labels:
    k8s-app: nccl-fastsocket-manual-installer
spec:
  # Must match the pod template's k8s-app label below.
  selector:
    matchLabels:
      k8s-app: nccl-fastsocket-manual-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nccl-fastsocket-manual-installer
        k8s-app: nccl-fastsocket-manual-installer
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions live in one term, so a node must carry
              # both labels (any value) to be selected.
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
                  - key: cloud.google.com/gke-nccl-fastsocket
                    operator: Exists
      # A key-less "Exists" toleration matches every taint.
      tolerations:
        - operator: Exists
      # NOTE(review): the copy step below does not visibly use the host
      # network or PID namespace -- confirm whether these are still required.
      hostNetwork: true
      hostPID: true
      initContainers:
        - name: nccl-fastsocket-installer
          image: gcr.io/gke-release/fastsocket-installer@sha256:cb8dca70b5611769fd2e0e8eb9aebf81a89d4378537cff104775c873abf2d9c5
          # Copies the library from the image into NCCL_INSTALL_DIR, which is
          # the mount point of the host directory declared under volumes.
          command:
            - bash
            - '-c'
            - |
              cp /usr/lib/libnccl-net.so $NCCL_INSTALL_DIR/
          env:
            - name: NCCL_INSTALL_DIR
              value: /usr/local/nvidia/lib64
          # NOTE(review): privileged is presumably needed to write to the host
          # path -- verify before tightening.
          securityContext:
            privileged: true
          resources:
            limits:
              memory: 100Mi
          volumeMounts:
            - name: nvidia-install-lib64-dir-host
              mountPath: /usr/local/nvidia/lib64
      # Placeholder main container: a pod needs at least one regular
      # container, while the actual work happens in the init container above.
      containers:
        - name: pause
          image: gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830
      volumes:
        # Host directory that receives the library; DirectoryOrCreate creates
        # it when it does not exist yet.
        - name: nvidia-install-lib64-dir-host
          hostPath:
            path: /home/kubernetes/bin/nvidia/lib64
            type: DirectoryOrCreate