configs/nvidia-peermem-reloader/ds.yaml

---
# DaemonSet that keeps the nvidia-peermem kernel module loaded on nodes that
# carry the NVIDIA accelerator label. Each pod installs kmod, runs
# `modprobe nvidia-peermem` once at start-up and then idles; the probes below
# re-run the same modprobe periodically.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-peermem-reloader
  # More info about the namespace:
  # https://learn.microsoft.com/en-us/azure/aks/gpu-cluster?tabs=add-ubuntu-gpu-node-pool#manually-install-the-nvidia-device-plugin
  namespace: gpu-resources
spec:
  selector:
    matchLabels:
      name: nvidia-peermem-reloader
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-peermem-reloader
    spec:
      # Allow scheduling onto nodes tainted with sku=gpu:NoSchedule.
      tolerations:
        - key: "sku"
          operator: "Equal"
          value: "gpu"
          effect: "NoSchedule"
      # Only schedule onto nodes labelled
      # kubernetes.azure.com/accelerator=nvidia.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.azure.com/accelerator
                    operator: In
                    values:
                      - nvidia
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - name: nvidia-peermem-reloader-ctr
          image: mcr.microsoft.com/mirror/docker/library/ubuntu:24.04
          command:
            - /bin/bash
            - -xc
            # Install kmod (provides modprobe), load the module, then keep the
            # container alive so the probes can keep running.
            # apt-get is used rather than apt: apt's command line is meant for
            # interactive use and is not guaranteed to be stable in scripts.
            # The folded scalar below is a single line at runtime.
            - >-
              apt-get update &&
              apt-get install -y kmod &&
              modprobe nvidia-peermem &&
              echo "sleeping" &&
              sleep inf
          # This is using the same logic as: https://github.com/NVIDIA/gpu-operator/blob/24e4a0e35272921248ba6b3ed0ce60872bbc29e1/assets/state-driver/0500_daemonset.yaml#L150-L189
          # Start-up: retry modprobe every 10s, up to 120 failures, which
          # leaves time for the kmod installation above to finish.
          startupProbe:
            exec:
              command:
                - /bin/bash
                - -c
                - modprobe nvidia-peermem
            initialDelaySeconds: 10
            failureThreshold: 120
            successThreshold: 1
            periodSeconds: 10
            timeoutSeconds: 10
          # Liveness: re-run modprobe every 30s. Because the probe itself
          # invokes modprobe, it attempts to load the module again if it has
          # been unloaded; a single failed attempt (failureThreshold: 1)
          # marks the container as failed.
          livenessProbe:
            exec:
              command:
                - /bin/bash
                - -c
                - modprobe nvidia-peermem
            periodSeconds: 30
            initialDelaySeconds: 30
            failureThreshold: 1
            successThreshold: 1
            timeoutSeconds: 10
          # Privileged mode so that modprobe inside the container can load
          # kernel modules on the host.
          securityContext:
            privileged: true
          volumeMounts:
            # The host's kernel module tree, mounted read-only for modprobe.
            - name: lib-modules
              mountPath: /lib/modules
              readOnly: true
      volumes:
        - name: lib-modules
          hostPath:
            path: /lib/modules
            type: Directory

configs/nvidia-peermem-reloader/ds.yaml (72 lines of code) (raw):