# modules/management/kubectl-apply/manifests/gpu-operator-v25.3.0.yaml

# Copyright 2025 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Source: gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-feature-discovery
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
---
# Source: gpu-operator/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-operator
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
---
# Source: gpu-operator/charts/node-feature-discovery/templates/nfd-master-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: release-name-node-feature-discovery-master-conf
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
data:
  nfd-master.conf: |-
    extraLabelNs:
    - nvidia.com
---
# Source: gpu-operator/charts/node-feature-discovery/templates/nfd-worker-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: release-name-node-feature-discovery-worker-conf
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
data:
  nfd-worker.conf: |-
    sources:
      pci:
        deviceClassWhitelist:
        - "02"
        - "0200"
        - "0207"
        - "0300"
        - "0302"
        deviceLabelFields:
        - vendor
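# The deviceClassWhitelist above limits PCI discovery to the classes that
# matter on GPU nodes: 02/0200 are network controllers, 0207 is InfiniBand,
# 0300 is a VGA-compatible display controller, and 0302 is a 3D controller
# (how most NVIDIA datacenter GPUs enumerate). With deviceLabelFields reduced
# to "vendor", a node carrying an NVIDIA device (PCI vendor ID 10de) gains a
# feature label of roughly this shape (illustrative, not rendered by this
# manifest):
#
#   feature.node.kubernetes.io/pci-10de.present: "true"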
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: release-name-node-feature-discovery
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
  - ""
  resources:
  - namespaces
  verbs:
  - watch
  - list
- apiGroups:
  - ""
  resources:
  - nodes
  - nodes/status
  verbs:
  - get
  - patch
  - update
  - list
- apiGroups:
  - nfd.k8s-sigs.io
  resources:
  - nodefeatures
  - nodefeaturerules
  - nodefeaturegroups
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - nfd.k8s-sigs.io
  resources:
  - nodefeaturegroups/status
  verbs:
  - patch
  - update
- apiGroups:
  - coordination.k8s.io
  resources:
  - leases
  verbs:
  - create
- apiGroups:
  - coordination.k8s.io
  resources:
  - leases
  resourceNames:
  - "nfd-master.nfd.kubernetes.io"
  verbs:
  - get
  - update
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: release-name-node-feature-discovery-gc
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/proxy
  verbs:
  - get
- apiGroups:
  - topology.node.k8s.io
  resources:
  - noderesourcetopologies
  verbs:
  - delete
  - list
- apiGroups:
  - nfd.k8s-sigs.io
  resources:
  - nodefeatures
  verbs:
  - delete
  - list
---
# Source: gpu-operator/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-operator
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
rules:
- apiGroups:
  - config.openshift.io
  resources:
  - clusterversions
  - proxies
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - image.openshift.io
  resources:
  - imagestreams
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - security.openshift.io
  resources:
  - securitycontextconstraints
  verbs:
  - create
  - get
  - list
  - watch
  - update
  - patch
  - delete
  - use
- apiGroups:
  - rbac.authorization.k8s.io
  resources:
  - clusterroles
  - clusterrolebindings
  verbs:
  - create
  - get
  - list
  - watch
  - update
  - patch
  - delete
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
  - update
  - patch
- apiGroups:
  - ""
  resources:
  - namespaces
  verbs:
  - get
  - list
  - watch
  - update
  - patch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - get
  - list
  - watch
  - delete
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - pods/eviction
  verbs:
  - create
- apiGroups:
  - apps
  resources:
  - daemonsets
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - nvidia.com
  resources:
  - clusterpolicies
  - clusterpolicies/finalizers
  - clusterpolicies/status
  - nvidiadrivers
  - nvidiadrivers/finalizers
  - nvidiadrivers/status
  verbs:
  - create
  - get
  - list
  - watch
  - update
  - patch
  - delete
  - deletecollection
- apiGroups:
  - scheduling.k8s.io
  resources:
  - priorityclasses
  verbs:
  - get
  - list
  - watch
  - create
- apiGroups:
  - node.k8s.io
  resources:
  - runtimeclasses
  verbs:
  - get
  - list
  - create
  - update
  - watch
  - delete
- apiGroups:
  - apiextensions.k8s.io
  resources:
  - customresourcedefinitions
  verbs:
  - get
  - list
  - watch
  - update
  - patch
  - create
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: release-name-node-feature-discovery
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: release-name-node-feature-discovery
subjects:
- kind: ServiceAccount
  name: node-feature-discovery
  namespace: gpu-operator
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: release-name-node-feature-discovery-gc
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: release-name-node-feature-discovery-gc
subjects:
- kind: ServiceAccount
  name: node-feature-discovery
  namespace: gpu-operator
---
# Source: gpu-operator/templates/clusterrolebinding.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpu-operator
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
subjects:
- kind: ServiceAccount
  name: gpu-operator
  namespace: gpu-operator
roleRef:
  kind: ClusterRole
  name: gpu-operator
  apiGroup: rbac.authorization.k8s.io
---
# Source: gpu-operator/charts/node-feature-discovery/templates/role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: release-name-node-feature-discovery-worker
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
  - nfd.k8s-sigs.io
  resources:
  - nodefeatures
  verbs:
  - create
  - get
  - update
  - delete
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - get
---
# Source: gpu-operator/templates/role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: gpu-operator
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
rules:
- apiGroups:
  - rbac.authorization.k8s.io
  resources:
  - roles
  - rolebindings
  verbs:
  - create
  - get
  - list
  - watch
  - update
  - patch
  - delete
- apiGroups:
  - apps
  resources:
  - controllerrevisions
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - apps
  resources:
  - daemonsets
  verbs:
  - create
  - get
  - list
  - watch
  - update
  - patch
  - delete
- apiGroups:
  - ""
  resources:
  - configmaps
  - endpoints
  - pods
  - pods/eviction
  - secrets
  - services
  - services/finalizers
  - serviceaccounts
  verbs:
  - create
  - get
  - list
  - watch
  - update
  - patch
  - delete
- apiGroups:
  - coordination.k8s.io
  resources:
  - leases
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - monitoring.coreos.com
  resources:
  - servicemonitors
  - prometheusrules
  verbs:
  - get
  - list
  - create
  - watch
  - update
  - delete
---
# Source: gpu-operator/charts/node-feature-discovery/templates/rolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: release-name-node-feature-discovery-worker
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: release-name-node-feature-discovery-worker
subjects:
- kind: ServiceAccount
  name: node-feature-discovery
  namespace: gpu-operator
---
# Source: gpu-operator/templates/rolebinding.yaml
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpu-operator
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
subjects:
- kind: ServiceAccount
  name: gpu-operator
  namespace: gpu-operator
roleRef:
  kind: Role
  name: gpu-operator
  apiGroup: rbac.authorization.k8s.io
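# With the roles and bindings above in place, the operator's effective
# permissions can be spot-checked by impersonating its ServiceAccount;
# a sketch (any verb/resource pair from the ClusterRole can be substituted):
#
#   kubectl auth can-i list nodes \
#     --as=system:serviceaccount:gpu-operator:gpu-operator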
---
# Source: gpu-operator/charts/node-feature-discovery/templates/worker.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: release-name-node-feature-discovery-worker
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
    role: worker
spec:
  revisionHistoryLimit:
  selector:
    matchLabels:
      app.kubernetes.io/name: node-feature-discovery
      app.kubernetes.io/instance: release-name
      role: worker
  template:
    metadata:
      labels:
        app.kubernetes.io/name: node-feature-discovery
        app.kubernetes.io/instance: release-name
        role: worker
      annotations:
        checksum/config: 0345e7bb2f41fd6df72ff495017fc89df2ab81e3929f7002d961ce46e6864365
    spec:
      dnsPolicy: ClusterFirstWithHostNet
      priorityClassName: system-node-critical
      serviceAccountName: node-feature-discovery
      securityContext: {}
      hostNetwork: false
      containers:
      - name: worker
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsNonRoot: true
        image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
        imagePullPolicy: IfNotPresent
        livenessProbe:
          grpc:
            port: 8082
          initialDelaySeconds: 10
        readinessProbe:
          grpc:
            port: 8082
          initialDelaySeconds: 5
          failureThreshold: 10
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_UID
          valueFrom:
            fieldRef:
              fieldPath: metadata.uid
        resources:
          limits:
            memory: 512Mi
          requests:
            cpu: 5m
            memory: 64Mi
        command:
        - "nfd-worker"
        args:
        # Go over featureGate and add the feature-gate flag
        - "-feature-gates=NodeFeatureGroupAPI=false"
        - "-metrics=8081"
        - "-grpc-health=8082"
        ports:
        - containerPort: 8081
          name: metrics
        - containerPort: 8082
          name: health
        volumeMounts:
        - name: host-boot
          mountPath: "/host-boot"
          readOnly: true
        - name: host-os-release
          mountPath: "/host-etc/os-release"
          readOnly: true
        - name: host-sys
          mountPath: "/host-sys"
          readOnly: true
        - name: host-usr-lib
          mountPath: "/host-usr/lib"
          readOnly: true
        - name: host-lib
          mountPath: "/host-lib"
          readOnly: true
        - name: host-proc-swaps
          mountPath: "/host-proc/swaps"
          readOnly: true
        - name: features-d
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
          readOnly: true
        - name: nfd-worker-conf
          mountPath: "/etc/kubernetes/node-feature-discovery"
          readOnly: true
      volumes:
      - name: host-boot
        hostPath:
          path: "/boot"
      - name: host-os-release
        hostPath:
          path: "/etc/os-release"
      - name: host-sys
        hostPath:
          path: "/sys"
      - name: host-usr-lib
        hostPath:
          path: "/usr/lib"
      - name: host-lib
        hostPath:
          path: "/lib"
      - name: host-proc-swaps
        hostPath:
          path: "/proc/swaps"
      - name: features-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d/"
      - name: nfd-worker-conf
        configMap:
          name: release-name-node-feature-discovery-worker-conf
          items:
          - key: nfd-worker.conf
            path: nfd-worker.conf
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Equal
        value: ""
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
        operator: Equal
        value: ""
      - effect: NoSchedule
        key: nvidia.com/gpu
        operator: Exists
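# Once the worker DaemonSet is running, the feature labels it publishes
# (via NodeFeature objects consumed by nfd-master) can be inspected per node;
# a sketch with a placeholder node name:
#
#   kubectl get node <node-name> --show-labels | tr ',' '\n' | grep feature.node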
---
# Source: gpu-operator/charts/node-feature-discovery/templates/master.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: release-name-node-feature-discovery-master
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
    role: master
spec:
  replicas: 1
  revisionHistoryLimit:
  selector:
    matchLabels:
      app.kubernetes.io/name: node-feature-discovery
      app.kubernetes.io/instance: release-name
      role: master
  template:
    metadata:
      labels:
        app.kubernetes.io/name: node-feature-discovery
        app.kubernetes.io/instance: release-name
        role: master
      annotations:
        checksum/config: f57e7832742bcbf3c257da4765c1da5ba2acdcab7411a32b3ed3429f4e86d8e2
    spec:
      priorityClassName: system-node-critical
      serviceAccountName: node-feature-discovery
      enableServiceLinks: false
      securityContext: {}
      hostNetwork: false
      containers:
      - name: master
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsNonRoot: true
        image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
        imagePullPolicy: IfNotPresent
        startupProbe:
          grpc:
            port: 8082
          failureThreshold: 30
        livenessProbe:
          grpc:
            port: 8082
        readinessProbe:
          grpc:
            port: 8082
          failureThreshold: 10
        ports:
        - containerPort: 8081
          name: metrics
        - containerPort: 8082
          name: health
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        command:
        - "nfd-master"
        resources:
          limits:
            memory: 4Gi
          requests:
            cpu: 100m
            memory: 128Mi
        args:
        - "-enable-leader-election"
        # Go over featureGates and add the feature-gate flag
        - "-feature-gates=NodeFeatureGroupAPI=false"
        - "-metrics=8081"
        - "-grpc-health=8082"
        volumeMounts:
        - name: nfd-master-conf
          mountPath: "/etc/kubernetes/node-feature-discovery"
          readOnly: true
      volumes:
      - name: nfd-master-conf
        configMap:
          name: release-name-node-feature-discovery-master-conf
          items:
          - key: nfd-master.conf
            path: nfd-master.conf
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: In
                values:
                - ""
            weight: 1
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: In
                values:
                - ""
            weight: 1
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Equal
        value: ""
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
        operator: Equal
        value: ""
---
# Source: gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: release-name-node-feature-discovery-gc
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
    role: gc
spec:
  replicas: 1
  revisionHistoryLimit:
  selector:
    matchLabels:
      app.kubernetes.io/name: node-feature-discovery
      app.kubernetes.io/instance: release-name
      role: gc
  template:
    metadata:
      labels:
        app.kubernetes.io/name: node-feature-discovery
        app.kubernetes.io/instance: release-name
        role: gc
    spec:
      serviceAccountName: node-feature-discovery
      dnsPolicy: ClusterFirstWithHostNet
      priorityClassName: system-node-critical
      securityContext: {}
      hostNetwork: false
      containers:
      - name: gc
        image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
        imagePullPolicy: "IfNotPresent"
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        command:
        - "nfd-gc"
        args:
        - "-gc-interval=1h"
        resources:
          limits:
            memory: 1Gi
          requests:
            cpu: 10m
            memory: 128Mi
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: [ "ALL" ]
          readOnlyRootFilesystem: true
          runAsNonRoot: true
        ports:
        - name: metrics
          containerPort: 8081
---
# Source: gpu-operator/templates/operator.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-operator
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
    nvidia.com/gpu-driver-upgrade-drain.skip: "true"
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/component: "gpu-operator"
      app: "gpu-operator"
  template:
    metadata:
      labels:
        app.kubernetes.io/name: gpu-operator
        helm.sh/chart: gpu-operator-v25.3.0
        app.kubernetes.io/instance: release-name
        app.kubernetes.io/version: "v25.3.0"
        app.kubernetes.io/managed-by: Helm
        app.kubernetes.io/component: "gpu-operator"
        app: "gpu-operator"
        nvidia.com/gpu-driver-upgrade-drain.skip: "true"
      annotations:
        openshift.io/scc: restricted-readonly
    spec:
      serviceAccountName: gpu-operator
      priorityClassName: system-node-critical
      containers:
      - name: gpu-operator
        image: nvcr.io/nvidia/gpu-operator:v25.3.0
        imagePullPolicy: IfNotPresent
        command: ["gpu-operator"]
        args:
        - --leader-elect
        - --zap-time-encoding=epoch
        - --zap-log-level=info
        env:
        - name: WATCH_NAMESPACE
          value: ""
        - name: OPERATOR_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: "DRIVER_MANAGER_IMAGE"
          value: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0"
        volumeMounts:
        - name: host-os-release
          mountPath: "/host-etc/os-release"
          readOnly: true
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8081
          initialDelaySeconds: 15
          periodSeconds: 20
        readinessProbe:
          httpGet:
            path: /readyz
            port: 8081
          initialDelaySeconds: 5
          periodSeconds: 10
        resources:
          limits:
            cpu: 500m
            memory: 350Mi
          requests:
            cpu: 200m
            memory: 100Mi
        ports:
        - name: metrics
          containerPort: 8080
      volumes:
      - name: host-os-release
        hostPath:
          path: "/etc/os-release"
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: In
                values:
                - ""
            weight: 1
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: In
                values:
                - ""
            weight: 1
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Equal
        value: ""
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
        operator: Equal
        value: ""
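# A quick health check after the controller Deployment rolls out; a sketch,
# assuming the manifest was applied unchanged:
#
#   kubectl -n gpu-operator get pods
#   kubectl -n gpu-operator logs deploy/gpu-operator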
---
# Source: gpu-operator/templates/clusterpolicy.yaml
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: cluster-policy
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
spec:
  hostPaths:
    rootFS: /
    driverInstallDir: /home/kubernetes/bin/nvidia
  operator:
    runtimeClass: nvidia
    initContainer:
      repository: nvcr.io/nvidia
      image: cuda
      version: "12.8.1-base-ubi9"
      imagePullPolicy: IfNotPresent
  daemonsets:
    labels:
      helm.sh/chart: gpu-operator-v25.3.0
      app.kubernetes.io/managed-by: gpu-operator
    tolerations:
    - effect: NoSchedule
      key: nvidia.com/gpu
      operator: Exists
    priorityClassName: system-node-critical
    updateStrategy: RollingUpdate
    rollingUpdate:
      maxUnavailable: "1"
  validator:
    repository: nvcr.io/nvidia/cloud-native
    image: gpu-operator-validator
    version: "v25.3.0"
    imagePullPolicy: IfNotPresent
    plugin:
      env:
      - name: WITH_WORKLOAD
        value: "false"
  mig:
    strategy: single
  psa:
    enabled: false
  cdi:
    enabled: true
    default: true
  driver:
    enabled: false
    useNvidiaDriverCRD: false
    kernelModuleType: auto
    usePrecompiled: false
    repository: nvcr.io/nvidia
    image: driver
    version: "550.127.05"
    imagePullPolicy: IfNotPresent
    startupProbe:
      failureThreshold: 120
      initialDelaySeconds: 60
      periodSeconds: 10
      timeoutSeconds: 60
    rdma:
      enabled: false
      useHostMofed: false
    manager:
      repository: nvcr.io/nvidia/cloud-native
      image: k8s-driver-manager
      version: "v0.8.0"
      imagePullPolicy: IfNotPresent
      env:
      - name: ENABLE_GPU_POD_EVICTION
        value: "true"
      - name: ENABLE_AUTO_DRAIN
        value: "false"
      - name: DRAIN_USE_FORCE
        value: "false"
      - name: DRAIN_POD_SELECTOR_LABEL
        value: ""
      - name: DRAIN_TIMEOUT_SECONDS
        value: 0s
      - name: DRAIN_DELETE_EMPTYDIR_DATA
        value: "false"
    repoConfig:
      configMapName: ""
    certConfig:
      name: ""
    licensingConfig:
      configMapName: ""
      nlsEnabled: true
    virtualTopology:
      config: ""
    kernelModuleConfig:
      name: ""
    upgradePolicy:
      autoUpgrade: true
      maxParallelUpgrades: 1
      maxUnavailable: 25%
      waitForCompletion:
        timeoutSeconds: 0
      podDeletion:
        force: false
        timeoutSeconds: 300
        deleteEmptyDir: false
      drain:
        enable: false
        force: false
        timeoutSeconds: 300
        deleteEmptyDir: false
  vgpuManager:
    enabled: false
    image: vgpu-manager
    imagePullPolicy: IfNotPresent
    driverManager:
      repository: nvcr.io/nvidia/cloud-native
      image: k8s-driver-manager
      version: "v0.8.0"
      imagePullPolicy: IfNotPresent
      env:
      - name: ENABLE_GPU_POD_EVICTION
        value: "false"
      - name: ENABLE_AUTO_DRAIN
        value: "false"
  kataManager:
    enabled: false
    config:
      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
      runtimeClasses:
      - artifacts:
          pullSecret: ""
          url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
        name: kata-nvidia-gpu
        nodeSelector: {}
      - artifacts:
          pullSecret: ""
          url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
        name: kata-nvidia-gpu-snp
        nodeSelector:
          nvidia.com/cc.capable: "true"
    repository: nvcr.io/nvidia/cloud-native
    image: k8s-kata-manager
    version: "v0.2.3"
    imagePullPolicy: IfNotPresent
  vfioManager:
    enabled: true
    repository: nvcr.io/nvidia
    image: cuda
    version: "12.8.1-base-ubi9"
    imagePullPolicy: IfNotPresent
    driverManager:
      repository: nvcr.io/nvidia/cloud-native
      image: k8s-driver-manager
      version: "v0.8.0"
      imagePullPolicy: IfNotPresent
      env:
      - name: ENABLE_GPU_POD_EVICTION
        value: "false"
      - name: ENABLE_AUTO_DRAIN
        value: "false"
  vgpuDeviceManager:
    enabled: true
    repository: nvcr.io/nvidia/cloud-native
    image: vgpu-device-manager
    version: "v0.3.0"
    imagePullPolicy: IfNotPresent
    config:
      default: default
      name: ""
  ccManager:
    enabled: false
    defaultMode: "off"
    repository: nvcr.io/nvidia/cloud-native
    image: k8s-cc-manager
    version: "v0.1.1"
    imagePullPolicy: IfNotPresent
    env: []
  toolkit:
    enabled: true
    repository: nvcr.io/nvidia/k8s
    image: container-toolkit
    version: "v1.17.5-ubuntu20.04"
    imagePullPolicy: IfNotPresent
    installDir: /home/kubernetes/bin/nvidia
  devicePlugin:
    enabled: true
    repository: nvcr.io/nvidia
    image: k8s-device-plugin
    version: "v0.17.1"
    imagePullPolicy: IfNotPresent
    env:
    - name: PASS_DEVICE_SPECS
      value: "true"
    - name: FAIL_ON_INIT_ERROR
      value: "true"
    - name: DEVICE_LIST_STRATEGY
      value: envvar
    - name: DEVICE_ID_STRATEGY
      value: uuid
    - name: NVIDIA_VISIBLE_DEVICES
      value: all
    - name: NVIDIA_DRIVER_CAPABILITIES
      value: all
  dcgm:
    enabled: false
    repository: nvcr.io/nvidia/cloud-native
    image: dcgm
    version: "4.1.1-2-ubuntu22.04"
    imagePullPolicy: IfNotPresent
  dcgmExporter:
    enabled: true
    repository: nvcr.io/nvidia/k8s
    image: dcgm-exporter
    version: "4.1.1-4.0.4-ubuntu22.04"
    imagePullPolicy: IfNotPresent
    env:
    - name: DCGM_EXPORTER_LISTEN
      value: :9400
    - name: DCGM_EXPORTER_KUBERNETES
      value: "true"
    - name: DCGM_EXPORTER_COLLECTORS
      value: /etc/dcgm-exporter/dcp-metrics-included.csv
    serviceMonitor:
      additionalLabels: {}
      enabled: false
      honorLabels: false
      interval: 15s
      relabelings: []
  gfd:
    enabled: true
    repository: nvcr.io/nvidia
    image: k8s-device-plugin
    version: "v0.17.1"
    imagePullPolicy: IfNotPresent
    env:
    - name: GFD_SLEEP_INTERVAL
      value: 60s
    - name: GFD_FAIL_ON_INIT_ERROR
      value: "true"
  migManager:
    enabled: true
    repository: nvcr.io/nvidia/cloud-native
    image: k8s-mig-manager
    version: "v0.12.1-ubuntu20.04"
    imagePullPolicy: IfNotPresent
    env:
    - name: WITH_REBOOT
      value: "false"
    config:
      name:
      default: all-disabled
    gpuClientsConfig:
      name: ""
  nodeStatusExporter:
    enabled: false
    repository: nvcr.io/nvidia/cloud-native
    image: gpu-operator-validator
    version: "v25.3.0"
    imagePullPolicy: IfNotPresent
  gdrcopy:
    enabled: false
    repository: nvcr.io/nvidia/cloud-native
    image: gdrdrv
    version: "v2.4.4"
    imagePullPolicy: IfNotPresent
  sandboxWorkloads:
    enabled: false
    defaultWorkload: container
  sandboxDevicePlugin:
    enabled: true
    repository: nvcr.io/nvidia
    image: kubevirt-gpu-device-plugin
    version: "v1.3.1"
    imagePullPolicy: IfNotPresent
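# The operator reconciles this ClusterPolicy into the per-node operand
# DaemonSets (container toolkit, device plugin, GFD, DCGM exporter, ...).
# Overall readiness is surfaced in the resource's status; a sketch:
#
#   kubectl get clusterpolicy cluster-policy -o jsonpath='{.status.state}'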
---
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: release-name-node-feature-discovery-prune
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
---
# Source: gpu-operator/templates/upgrade_crd.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-operator-upgrade-crd-hook-sa
  annotations:
    helm.sh/hook: pre-upgrade
    helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
    helm.sh/hook-weight: "0"
---
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: release-name-node-feature-discovery-prune
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - nodes/status
  verbs:
  - get
  - patch
  - update
  - list
---
# Source: gpu-operator/templates/upgrade_crd.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-operator-upgrade-crd-hook-role
  annotations:
    helm.sh/hook: pre-upgrade
    helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
    helm.sh/hook-weight: "0"
rules:
- apiGroups:
  - apiextensions.k8s.io
  resources:
  - customresourcedefinitions
  verbs:
  - create
  - get
  - list
  - watch
  - patch
  - update
---
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: release-name-node-feature-discovery-prune
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: release-name-node-feature-discovery-prune
subjects:
- kind: ServiceAccount
  name: release-name-node-feature-discovery-prune
  namespace: gpu-operator
---
# Source: gpu-operator/templates/upgrade_crd.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: gpu-operator-upgrade-crd-hook-binding
  annotations:
    helm.sh/hook: pre-upgrade
    helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
    helm.sh/hook-weight: "0"
subjects:
- kind: ServiceAccount
  name: gpu-operator-upgrade-crd-hook-sa
  namespace: gpu-operator
roleRef:
  kind: ClusterRole
  name: gpu-operator-upgrade-crd-hook-role
  apiGroup: rbac.authorization.k8s.io
---
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: release-name-node-feature-discovery-prune
  namespace: gpu-operator
  labels:
    helm.sh/chart: node-feature-discovery-0.17.2
    app.kubernetes.io/name: node-feature-discovery
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v0.17.2"
    app.kubernetes.io/managed-by: Helm
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  template:
    metadata:
      labels:
        helm.sh/chart: node-feature-discovery-0.17.2
        app.kubernetes.io/name: node-feature-discovery
        app.kubernetes.io/instance: release-name
        app.kubernetes.io/version: "v0.17.2"
        app.kubernetes.io/managed-by: Helm
        role: prune
    spec:
      serviceAccountName: release-name-node-feature-discovery-prune
      containers:
      - name: nfd-master
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsNonRoot: true
        image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
        imagePullPolicy: IfNotPresent
        command:
        - "nfd-master"
        args:
        - "-prune"
      restartPolicy: Never
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: In
                values:
                - ""
            weight: 1
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: In
                values:
                - ""
            weight: 1
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Equal
        value: ""
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
        operator: Equal
        value: ""
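# The post-delete hook above runs "nfd-master -prune", which strips the
# NFD-managed feature labels and annotations from every node when the
# release is deleted, so GPU nodes are left clean.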
---
# Source: gpu-operator/templates/upgrade_crd.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: gpu-operator-upgrade-crd
  namespace: gpu-operator
  annotations:
    "helm.sh/hook": pre-upgrade
    "helm.sh/hook-weight": "1"
    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
  labels:
    app.kubernetes.io/name: gpu-operator
    helm.sh/chart: gpu-operator-v25.3.0
    app.kubernetes.io/instance: release-name
    app.kubernetes.io/version: "v25.3.0"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: "gpu-operator"
spec:
  template:
    metadata:
      name: gpu-operator-upgrade-crd
      labels:
        app.kubernetes.io/name: gpu-operator
        helm.sh/chart: gpu-operator-v25.3.0
        app.kubernetes.io/instance: release-name
        app.kubernetes.io/version: "v25.3.0"
        app.kubernetes.io/managed-by: Helm
        app.kubernetes.io/component: "gpu-operator"
    spec:
      serviceAccountName: gpu-operator-upgrade-crd-hook-sa
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Equal
        value: ""
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
        operator: Equal
        value: ""
      containers:
      - name: upgrade-crd
        image: nvcr.io/nvidia/gpu-operator:v25.3.0
        imagePullPolicy: IfNotPresent
        command:
        - /bin/sh
        - -c
        - >
          kubectl apply -f /opt/gpu-operator/nvidia.com_clusterpolicies.yaml;
          kubectl apply -f /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml;
          kubectl apply -f /opt/gpu-operator/nfd-api-crds.yaml;
      restartPolicy: OnFailure
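# The rendered manifest is applied as a whole; a minimal sketch, assuming the
# repository-relative path above and that the gpu-operator namespace and the
# operator's CRDs (ClusterPolicy, NvidiaDriver, NFD APIs) already exist on
# the cluster:
#
#   kubectl apply -f modules/management/kubectl-apply/manifests/gpu-operator-v25.3.0.yaml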