# modules/management/kubectl-apply/manifests/gpu-operator-v25.3.0.yaml
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
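#
# Rendered output of the NVIDIA GPU Operator Helm chart (v25.3.0) together
# with its node-feature-discovery subchart (0.17.2), templated with the
# default release name "release-name" into the gpu-operator namespace.
# A minimal usage sketch, assuming the namespace already exists (several
# resources below omit an explicit namespace):
#
#   kubectl apply -n gpu-operator -f gpu-operator-v25.3.0.yaml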
# Source: gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-feature-discovery
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
---
# Source: gpu-operator/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: gpu-operator
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
---
# Source: gpu-operator/charts/node-feature-discovery/templates/nfd-master-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: release-name-node-feature-discovery-master-conf
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
data:
nfd-master.conf: |-
extraLabelNs:
- nvidia.com
---
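# The worker config below limits PCI discovery to network controllers
# (02/0200/0207, e.g. Ethernet and InfiniBand NICs) and display/3D
# controllers (0300/0302, the classes NVIDIA GPUs enumerate as). With
# deviceLabelFields reduced to "vendor", a matching node gets a label keyed
# by PCI vendor ID; for an NVIDIA device (vendor 10de) that should look like:
#
#   feature.node.kubernetes.io/pci-10de.present=true
#
# which is the label the GPU operator uses to recognize GPU nodes.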
# Source: gpu-operator/charts/node-feature-discovery/templates/nfd-worker-conf.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: release-name-node-feature-discovery-worker-conf
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
data:
nfd-worker.conf: |-
sources:
pci:
deviceClassWhitelist:
- "02"
- "0200"
- "0207"
- "0300"
- "0302"
deviceLabelFields:
- vendor
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: release-name-node-feature-discovery
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
- ""
resources:
- namespaces
verbs:
- watch
- list
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- get
- patch
- update
- list
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeatures
- nodefeaturerules
- nodefeaturegroups
verbs:
- get
- list
- watch
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeaturegroups/status
verbs:
- patch
- update
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- apiGroups:
- coordination.k8s.io
resources:
- leases
resourceNames:
- "nfd-master.nfd.kubernetes.io"
verbs:
- get
- update
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: release-name-node-feature-discovery-gc
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
- ""
resources:
- nodes
verbs:
- list
- watch
- apiGroups:
- ""
resources:
- nodes/proxy
verbs:
- get
- apiGroups:
- topology.node.k8s.io
resources:
- noderesourcetopologies
verbs:
- delete
- list
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeatures
verbs:
- delete
- list
---
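# Cluster-wide permissions for the operator controller. The OpenShift API
# groups (config/image/security.openshift.io) are only exercised on
# OpenShift; node update/patch backs the state labels the operator writes,
# and pods/eviction supports evicting GPU pods during driver upgrades.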
# Source: gpu-operator/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: gpu-operator
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
rules:
- apiGroups:
- config.openshift.io
resources:
- clusterversions
- proxies
verbs:
- get
- list
- watch
- apiGroups:
- image.openshift.io
resources:
- imagestreams
verbs:
- get
- list
- watch
- apiGroups:
- security.openshift.io
resources:
- securitycontextconstraints
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- use
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterroles
- clusterrolebindings
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- update
- patch
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- list
- watch
- update
- patch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- get
- list
- watch
- delete
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- pods/eviction
verbs:
- create
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- get
- list
- watch
- apiGroups:
- nvidia.com
resources:
- clusterpolicies
- clusterpolicies/finalizers
- clusterpolicies/status
- nvidiadrivers
- nvidiadrivers/finalizers
- nvidiadrivers/status
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- deletecollection
- apiGroups:
- scheduling.k8s.io
resources:
- priorityclasses
verbs:
- get
- list
- watch
- create
- apiGroups:
- node.k8s.io
resources:
- runtimeclasses
verbs:
- get
- list
- create
- update
- watch
- delete
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- get
- list
- watch
- update
- patch
- create
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: release-name-node-feature-discovery
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: release-name-node-feature-discovery
subjects:
- kind: ServiceAccount
name: node-feature-discovery
namespace: gpu-operator
---
# Source: gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: release-name-node-feature-discovery-gc
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: release-name-node-feature-discovery-gc
subjects:
- kind: ServiceAccount
name: node-feature-discovery
namespace: gpu-operator
---
# Source: gpu-operator/templates/clusterrolebinding.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: gpu-operator
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
subjects:
- kind: ServiceAccount
name: gpu-operator
namespace: gpu-operator
roleRef:
kind: ClusterRole
name: gpu-operator
apiGroup: rbac.authorization.k8s.io
---
# Source: gpu-operator/charts/node-feature-discovery/templates/role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: release-name-node-feature-discovery-worker
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeatures
verbs:
- create
- get
- update
- delete
- apiGroups:
- ""
resources:
- pods
verbs:
- get
---
# Source: gpu-operator/templates/role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: gpu-operator
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
rules:
- apiGroups:
- rbac.authorization.k8s.io
resources:
- roles
- rolebindings
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- apiGroups:
- apps
resources:
- controllerrevisions
verbs:
- get
- list
- watch
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- apiGroups:
- ""
resources:
- configmaps
- endpoints
- pods
- pods/eviction
- secrets
- services
- services/finalizers
- serviceaccounts
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- monitoring.coreos.com
resources:
- servicemonitors
- prometheusrules
verbs:
- get
- list
- create
- watch
- update
- delete
---
# Source: gpu-operator/charts/node-feature-discovery/templates/rolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: release-name-node-feature-discovery-worker
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: release-name-node-feature-discovery-worker
subjects:
- kind: ServiceAccount
name: node-feature-discovery
namespace: gpu-operator
---
# Source: gpu-operator/templates/rolebinding.yaml
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: gpu-operator
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
subjects:
- kind: ServiceAccount
name: gpu-operator
namespace: gpu-operator
roleRef:
kind: Role
name: gpu-operator
apiGroup: rbac.authorization.k8s.io
---
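# The NFD worker runs on every node (including tainted GPU nodes, per the
# tolerations) and reads host state through the read-only hostPath mounts
# below. Since NFD v0.14 it publishes findings as a namespaced NodeFeature
# object instead of labeling nodes itself; to inspect them:
#
#   kubectl -n gpu-operator get nodefeatures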
# Source: gpu-operator/charts/node-feature-discovery/templates/worker.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: release-name-node-feature-discovery-worker
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
role: worker
spec:
selector:
matchLabels:
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
role: worker
template:
metadata:
labels:
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
role: worker
annotations:
checksum/config: 0345e7bb2f41fd6df72ff495017fc89df2ab81e3929f7002d961ce46e6864365
spec:
dnsPolicy: ClusterFirstWithHostNet
priorityClassName: system-node-critical
serviceAccountName: node-feature-discovery
securityContext:
{}
hostNetwork: false
containers:
- name: worker
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
imagePullPolicy: IfNotPresent
livenessProbe:
grpc:
port: 8082
initialDelaySeconds: 10
readinessProbe:
grpc:
port: 8082
initialDelaySeconds: 5
failureThreshold: 10
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
resources:
limits:
memory: 512Mi
requests:
cpu: 5m
memory: 64Mi
command:
- "nfd-worker"
args:
# Feature gates from the chart values, rendered as a single -feature-gates flag
- "-feature-gates=NodeFeatureGroupAPI=false"
- "-metrics=8081"
- "-grpc-health=8082"
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
volumeMounts:
- name: host-boot
mountPath: "/host-boot"
readOnly: true
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
- name: host-sys
mountPath: "/host-sys"
readOnly: true
- name: host-usr-lib
mountPath: "/host-usr/lib"
readOnly: true
- name: host-lib
mountPath: "/host-lib"
readOnly: true
- name: host-proc-swaps
mountPath: "/host-proc/swaps"
readOnly: true
- name: features-d
mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
readOnly: true
- name: nfd-worker-conf
mountPath: "/etc/kubernetes/node-feature-discovery"
readOnly: true
volumes:
- name: host-boot
hostPath:
path: "/boot"
- name: host-os-release
hostPath:
path: "/etc/os-release"
- name: host-sys
hostPath:
path: "/sys"
- name: host-usr-lib
hostPath:
path: "/usr/lib"
- name: host-lib
hostPath:
path: "/lib"
- name: host-proc-swaps
hostPath:
path: "/proc/swaps"
- name: features-d
hostPath:
path: "/etc/kubernetes/node-feature-discovery/features.d/"
- name: nfd-worker-conf
configMap:
name: release-name-node-feature-discovery-worker-conf
items:
- key: nfd-worker.conf
path: nfd-worker.conf
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Equal
value: ""
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Equal
value: ""
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
---
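# The NFD master is the single writer of node labels: it watches the
# NodeFeature objects produced by the workers and patches node metadata
# accordingly (hence the nodes/status patch permission in its ClusterRole).
# Leader election keeps one active replica, and the affinity below prefers
# control-plane nodes.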
# Source: gpu-operator/charts/node-feature-discovery/templates/master.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: release-name-node-feature-discovery-master
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
role: master
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
role: master
template:
metadata:
labels:
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
role: master
annotations:
checksum/config: f57e7832742bcbf3c257da4765c1da5ba2acdcab7411a32b3ed3429f4e86d8e2
spec:
priorityClassName: system-node-critical
serviceAccountName: node-feature-discovery
enableServiceLinks: false
securityContext:
{}
hostNetwork: false
containers:
- name: master
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
imagePullPolicy: IfNotPresent
startupProbe:
grpc:
port: 8082
failureThreshold: 30
livenessProbe:
grpc:
port: 8082
readinessProbe:
grpc:
port: 8082
failureThreshold: 10
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "nfd-master"
resources:
limits:
memory: 4Gi
requests:
cpu: 100m
memory: 128Mi
args:
- "-enable-leader-election"
# Feature gates from the chart values, rendered as a single -feature-gates flag
- "-feature-gates=NodeFeatureGroupAPI=false"
- "-metrics=8081"
- "-grpc-health=8082"
volumeMounts:
- name: nfd-master-conf
mountPath: "/etc/kubernetes/node-feature-discovery"
readOnly: true
volumes:
- name: nfd-master-conf
configMap:
name: release-name-node-feature-discovery-master-conf
items:
- key: nfd-master.conf
path: nfd-master.conf
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- preference:
matchExpressions:
- key: node-role.kubernetes.io/master
operator: In
values:
- ""
weight: 1
- preference:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: In
values:
- ""
weight: 1
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Equal
value: ""
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Equal
value: ""
---
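# The garbage collector deletes NodeFeature and NodeResourceTopology
# objects that belong to nodes which no longer exist, on the interval set
# by -gc-interval (1h here).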
# Source: gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: release-name-node-feature-discovery-gc
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
role: gc
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
role: gc
template:
metadata:
labels:
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
role: gc
spec:
serviceAccountName: node-feature-discovery
dnsPolicy: ClusterFirstWithHostNet
priorityClassName: system-node-critical
securityContext:
{}
hostNetwork: false
containers:
- name: gc
image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
imagePullPolicy: "IfNotPresent"
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "nfd-gc"
args:
- "-gc-interval=1h"
resources:
limits:
memory: 1Gi
requests:
cpu: 10m
memory: 128Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: [ "ALL" ]
readOnlyRootFilesystem: true
runAsNonRoot: true
ports:
- name: metrics
containerPort: 8081
---
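# The operator controller itself. WATCH_NAMESPACE="" makes it watch all
# namespaces, and the nvidia.com/gpu-driver-upgrade-drain.skip label keeps
# the operator's own pod from being drained by the driver-upgrade flow it
# manages.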
# Source: gpu-operator/templates/operator.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-operator
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
nvidia.com/gpu-driver-upgrade-drain.skip: "true"
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: "gpu-operator"
app: "gpu-operator"
template:
metadata:
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
app: "gpu-operator"
nvidia.com/gpu-driver-upgrade-drain.skip: "true"
annotations:
openshift.io/scc: restricted-readonly
spec:
serviceAccountName: gpu-operator
priorityClassName: system-node-critical
containers:
- name: gpu-operator
image: nvcr.io/nvidia/gpu-operator:v25.3.0
imagePullPolicy: IfNotPresent
command: ["gpu-operator"]
args:
- --leader-elect
- --zap-time-encoding=epoch
- --zap-log-level=info
env:
- name: WATCH_NAMESPACE
value: ""
- name: OPERATOR_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: "DRIVER_MANAGER_IMAGE"
value: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0"
volumeMounts:
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
livenessProbe:
httpGet:
path: /healthz
port: 8081
initialDelaySeconds: 15
periodSeconds: 20
readinessProbe:
httpGet:
path: /readyz
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
resources:
limits:
cpu: 500m
memory: 350Mi
requests:
cpu: 200m
memory: 100Mi
ports:
- name: metrics
containerPort: 8080
volumes:
- name: host-os-release
hostPath:
path: "/etc/os-release"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- preference:
matchExpressions:
- key: node-role.kubernetes.io/master
operator: In
values:
- ""
weight: 1
- preference:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: In
values:
- ""
weight: 1
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Equal
value: ""
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Equal
value: ""
---
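# ClusterPolicy is the operator's main custom resource: the controller
# above reconciles this single object into the operand daemonsets (toolkit,
# device plugin, GFD, DCGM exporter, and so on). The values here appear
# tailored to GKE: the driver daemonset is disabled and both
# hostPaths.driverInstallDir and toolkit.installDir point at
# /home/kubernetes/bin/nvidia, where GKE stages its preinstalled driver.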
# Source: gpu-operator/templates/clusterpolicy.yaml
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
name: cluster-policy
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
spec:
hostPaths:
rootFS: /
driverInstallDir: /home/kubernetes/bin/nvidia
operator:
runtimeClass: nvidia
initContainer:
repository: nvcr.io/nvidia
image: cuda
version: "12.8.1-base-ubi9"
imagePullPolicy: IfNotPresent
daemonsets:
labels:
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/managed-by: gpu-operator
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
priorityClassName: system-node-critical
updateStrategy: RollingUpdate
rollingUpdate:
maxUnavailable: "1"
validator:
repository: nvcr.io/nvidia/cloud-native
image: gpu-operator-validator
version: "v25.3.0"
imagePullPolicy: IfNotPresent
plugin:
env:
- name: WITH_WORKLOAD
value: "false"
mig:
strategy: single
psa:
enabled: false
cdi:
enabled: true
default: true
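# With enabled: false below, the operator deploys no driver daemonset and
# instead validates against a driver already present on the node; the
# repository/version/manager settings only take effect if the driver (or
# the NVIDIADriver CRD flow) is switched on later.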
driver:
enabled: false
useNvidiaDriverCRD: false
kernelModuleType: auto
usePrecompiled: false
repository: nvcr.io/nvidia
image: driver
version: "550.127.05"
imagePullPolicy: IfNotPresent
startupProbe:
failureThreshold: 120
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 60
rdma:
enabled: false
useHostMofed: false
manager:
repository: nvcr.io/nvidia/cloud-native
image: k8s-driver-manager
version: "v0.8.0"
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
value: "true"
- name: ENABLE_AUTO_DRAIN
value: "false"
- name: DRAIN_USE_FORCE
value: "false"
- name: DRAIN_POD_SELECTOR_LABEL
value: ""
- name: DRAIN_TIMEOUT_SECONDS
value: "0s"
- name: DRAIN_DELETE_EMPTYDIR_DATA
value: "false"
repoConfig:
configMapName: ""
certConfig:
name: ""
licensingConfig:
configMapName: ""
nlsEnabled: true
virtualTopology:
config: ""
kernelModuleConfig:
name: ""
upgradePolicy:
autoUpgrade: true
maxParallelUpgrades: 1
maxUnavailable: 25%
waitForCompletion:
timeoutSeconds: 0
podDeletion:
force: false
timeoutSeconds: 300
deleteEmptyDir: false
drain:
enable: false
force: false
timeoutSeconds: 300
deleteEmptyDir: false
vgpuManager:
enabled: false
image: vgpu-manager
imagePullPolicy: IfNotPresent
driverManager:
repository: nvcr.io/nvidia/cloud-native
image: k8s-driver-manager
version: "v0.8.0"
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
value: "false"
- name: ENABLE_AUTO_DRAIN
value: "false"
kataManager:
enabled: false
config:
artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
runtimeClasses:
- artifacts:
pullSecret: ""
url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
name: kata-nvidia-gpu
nodeSelector: {}
- artifacts:
pullSecret: ""
url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
name: kata-nvidia-gpu-snp
nodeSelector:
nvidia.com/cc.capable: "true"
repository: nvcr.io/nvidia/cloud-native
image: k8s-kata-manager
version: "v0.2.3"
imagePullPolicy: IfNotPresent
vfioManager:
enabled: true
repository: nvcr.io/nvidia
image: cuda
version: "12.8.1-base-ubi9"
imagePullPolicy: IfNotPresent
driverManager:
repository: nvcr.io/nvidia/cloud-native
image: k8s-driver-manager
version: "v0.8.0"
imagePullPolicy: IfNotPresent
env:
- name: ENABLE_GPU_POD_EVICTION
value: "false"
- name: ENABLE_AUTO_DRAIN
value: "false"
vgpuDeviceManager:
enabled: true
repository: nvcr.io/nvidia/cloud-native
image: vgpu-device-manager
version: "v0.3.0"
imagePullPolicy: IfNotPresent
config:
default: default
name: ""
ccManager:
enabled: false
defaultMode: "off"
repository: nvcr.io/nvidia/cloud-native
image: k8s-cc-manager
version: "v0.1.1"
imagePullPolicy: IfNotPresent
env:
[]
toolkit:
enabled: true
repository: nvcr.io/nvidia/k8s
image: container-toolkit
version: "v1.17.5-ubuntu20.04"
imagePullPolicy: IfNotPresent
installDir: /home/kubernetes/bin/nvidia
devicePlugin:
enabled: true
repository: nvcr.io/nvidia
image: k8s-device-plugin
version: "v0.17.1"
imagePullPolicy: IfNotPresent
env:
- name: PASS_DEVICE_SPECS
value: "true"
- name: FAIL_ON_INIT_ERROR
value: "true"
- name: DEVICE_LIST_STRATEGY
value: envvar
- name: DEVICE_ID_STRATEGY
value: uuid
- name: NVIDIA_VISIBLE_DEVICES
value: all
- name: NVIDIA_DRIVER_CAPABILITIES
value: all
dcgm:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: dcgm
version: "4.1.1-2-ubuntu22.04"
imagePullPolicy: IfNotPresent
dcgmExporter:
enabled: true
repository: nvcr.io/nvidia/k8s
image: dcgm-exporter
version: "4.1.1-4.0.4-ubuntu22.04"
imagePullPolicy: IfNotPresent
env:
- name: DCGM_EXPORTER_LISTEN
value: ":9400"
- name: DCGM_EXPORTER_KUBERNETES
value: "true"
- name: DCGM_EXPORTER_COLLECTORS
value: /etc/dcgm-exporter/dcp-metrics-included.csv
serviceMonitor:
additionalLabels: {}
enabled: false
honorLabels: false
interval: 15s
relabelings: []
gfd:
enabled: true
repository: nvcr.io/nvidia
image: k8s-device-plugin
version: "v0.17.1"
imagePullPolicy: IfNotPresent
env:
- name: GFD_SLEEP_INTERVAL
value: "60s"
- name: GFD_FAIL_ON_INIT_ERROR
value: "true"
migManager:
enabled: true
repository: nvcr.io/nvidia/cloud-native
image: k8s-mig-manager
version: "v0.12.1-ubuntu20.04"
imagePullPolicy: IfNotPresent
env:
- name: WITH_REBOOT
value: "false"
config:
name: ""
default: all-disabled
gpuClientsConfig:
name: ""
nodeStatusExporter:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: gpu-operator-validator
version: "v25.3.0"
imagePullPolicy: IfNotPresent
gdrcopy:
enabled: false
repository: nvcr.io/nvidia/cloud-native
image: gdrdrv
version: "v2.4.4"
imagePullPolicy: IfNotPresent
sandboxWorkloads:
enabled: false
defaultWorkload: container
sandboxDevicePlugin:
enabled: true
repository: nvcr.io/nvidia
image: kubevirt-gpu-device-plugin
version: "v1.3.1"
imagePullPolicy: IfNotPresent
---
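# Everything from here on is Helm hook machinery: a post-delete job that
# prunes NFD-managed node labels, and a pre-upgrade job that refreshes the
# CRDs. The helm.sh/hook annotations only mean something to Helm; applied
# with plain kubectl, as in this module, these resources are created
# immediately like any others and the hook-delete-policy is never acted on.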
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: release-name-node-feature-discovery-prune
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
---
# Source: gpu-operator/templates/upgrade_crd.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: gpu-operator-upgrade-crd-hook-sa
annotations:
helm.sh/hook: pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
---
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: release-name-node-feature-discovery-prune
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- get
- patch
- update
- list
---
# Source: gpu-operator/templates/upgrade_crd.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: gpu-operator-upgrade-crd-hook-role
annotations:
helm.sh/hook: pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
rules:
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- create
- get
- list
- watch
- patch
- update
---
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: release-name-node-feature-discovery-prune
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: release-name-node-feature-discovery-prune
subjects:
- kind: ServiceAccount
name: release-name-node-feature-discovery-prune
namespace: gpu-operator
---
# Source: gpu-operator/templates/upgrade_crd.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: gpu-operator-upgrade-crd-hook-binding
annotations:
helm.sh/hook: pre-upgrade
helm.sh/hook-delete-policy: hook-succeeded,before-hook-creation
helm.sh/hook-weight: "0"
subjects:
- kind: ServiceAccount
name: gpu-operator-upgrade-crd-hook-sa
namespace: gpu-operator
roleRef:
kind: ClusterRole
name: gpu-operator-upgrade-crd-hook-role
apiGroup: rbac.authorization.k8s.io
---
# Source: gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: release-name-node-feature-discovery-prune
namespace: gpu-operator
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
annotations:
"helm.sh/hook": post-delete
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
template:
metadata:
labels:
helm.sh/chart: node-feature-discovery-0.17.2
app.kubernetes.io/name: node-feature-discovery
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v0.17.2"
app.kubernetes.io/managed-by: Helm
role: prune
spec:
serviceAccountName: release-name-node-feature-discovery-prune
containers:
- name: nfd-master
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
image: "registry.k8s.io/nfd/node-feature-discovery:v0.17.2"
imagePullPolicy: IfNotPresent
command:
- "nfd-master"
args:
- "-prune"
restartPolicy: Never
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- preference:
matchExpressions:
- key: node-role.kubernetes.io/master
operator: In
values:
- ""
weight: 1
- preference:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: In
values:
- ""
weight: 1
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Equal
value: ""
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Equal
value: ""
---
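# The upgrade-crd job reapplies the CRD manifests bundled in the operator
# image. After it (or the initial apply) completes, the installed CRDs can
# be checked with:
#
#   kubectl get crd clusterpolicies.nvidia.com nvidiadrivers.nvidia.com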
# Source: gpu-operator/templates/upgrade_crd.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: gpu-operator-upgrade-crd
namespace: gpu-operator
annotations:
"helm.sh/hook": pre-upgrade
"helm.sh/hook-weight": "1"
"helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
spec:
template:
metadata:
name: gpu-operator-upgrade-crd
labels:
app.kubernetes.io/name: gpu-operator
helm.sh/chart: gpu-operator-v25.3.0
app.kubernetes.io/instance: release-name
app.kubernetes.io/version: "v25.3.0"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: "gpu-operator"
spec:
serviceAccountName: gpu-operator-upgrade-crd-hook-sa
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Equal
value: ""
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Equal
value: ""
containers:
- name: upgrade-crd
image: nvcr.io/nvidia/gpu-operator:v25.3.0
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
- >
kubectl apply -f /opt/gpu-operator/nvidia.com_clusterpolicies.yaml;
kubectl apply -f /opt/gpu-operator/nvidia.com_nvidiadrivers.yaml;
kubectl apply -f /opt/gpu-operator/nfd-api-crds.yaml;
restartPolicy: OnFailure