# modules/management/kubectl-apply/manifests/nvidia-dra-driver.yaml
# Copyright 2025 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
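#
# This manifest deploys the NVIDIA DRA driver for GPUs (ComputeDomains): a
# namespace with a ResourceQuota, RBAC, a per-node kubelet-plugin DaemonSet,
# a controller Deployment, two DeviceClasses, and a ValidatingAdmissionPolicy
# restricting which ResourceSlices the driver's ServiceAccount may write.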
apiVersion: v1
kind: Namespace
metadata:
name: nvidia-dra-driver-gpu
labels:
helm.sh/chart: nvidia-dra-driver-gpu-25.3.0-rc.2
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
app.kubernetes.io/version: "25.3.0-rc.2"
app.kubernetes.io/managed-by: Helm
---
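# The quota below only constrains pods that use the two system priority
# classes listed. On clusters that treat system-node-critical and
# system-cluster-critical as limited resources outside kube-system (GKE does
# by default), a ResourceQuota like this one is required before the driver
# pods in this namespace can be admitted.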
apiVersion: v1
kind: ResourceQuota
metadata:
name: nvidia-dra-driver-gpu-quota
namespace: nvidia-dra-driver-gpu
labels:
helm.sh/chart: nvidia-dra-driver-gpu-25.3.0-rc.2
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
app.kubernetes.io/version: "25.3.0-rc.2"
app.kubernetes.io/managed-by: Helm
spec:
hard:
pods: 100
scopeSelector:
matchExpressions:
- operator: In
scopeName: PriorityClass
values:
- system-node-critical
- system-cluster-critical
---
# Source: nvidia-dra-driver-gpu/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: nvidia-dra-driver-gpu-service-account
namespace: nvidia-dra-driver-gpu
labels:
helm.sh/chart: nvidia-dra-driver-gpu-25.3.0-rc.2
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
app.kubernetes.io/version: "25.3.0-rc.2"
app.kubernetes.io/managed-by: Helm
---
# Source: nvidia-dra-driver-gpu/templates/clusterrole.yaml
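# Grants access to the driver's own resource.nvidia.com API (ComputeDomains),
# the core DRA APIs under resource.k8s.io, and the apps resources
# (DaemonSets/Deployments) that the controller manages.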
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nvidia-dra-driver-gpu-role
rules:
- apiGroups: ["resource.nvidia.com"]
resources: ["computedomains"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["resource.nvidia.com"]
resources: ["computedomains/status"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["resource.k8s.io"]
resources: ["resourceclaims"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["resource.k8s.io"]
resources: ["resourceclaimtemplates"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["resource.k8s.io"]
resources: ["deviceclasses"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["resource.k8s.io"]
resources: ["resourceslices"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["resource.k8s.io"]
resources: ["resourceclaims/status"]
verbs: ["update"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch"]
---
# Source: nvidia-dra-driver-gpu/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: nvidia-dra-driver-gpu-role-binding
subjects:
- kind: ServiceAccount
name: nvidia-dra-driver-gpu-service-account
namespace: nvidia-dra-driver-gpu
roleRef:
kind: ClusterRole
name: nvidia-dra-driver-gpu-role
apiGroup: rbac.authorization.k8s.io
---
# Source: nvidia-dra-driver-gpu/templates/kubeletplugin.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-dra-driver-gpu-kubelet-plugin
namespace: nvidia-dra-driver-gpu
labels:
helm.sh/chart: nvidia-dra-driver-gpu-25.3.0-rc.2
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
app.kubernetes.io/version: "25.3.0-rc.2"
app.kubernetes.io/managed-by: Helm
spec:
selector:
matchLabels:
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
spec:
priorityClassName: system-node-critical
serviceAccountName: nvidia-dra-driver-gpu-service-account
securityContext:
{}
containers:
- name: compute-domains
securityContext:
privileged: true
image: nvcr.io/nvidia/k8s-dra-driver-gpu:v25.3.0-rc.2
imagePullPolicy: IfNotPresent
command: ["bash", "-c"]
args:
- |-
# Conditionally mask the params file to prevent this container from
# recreating any missing GPU device nodes. This is necessary, for
          # example, when running under nvkind to limit the set of GPUs governed
# by the plugin even though it has cgroup access to all of them.
if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
cp /proc/driver/nvidia/params root/gpu-params
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
mount --bind root/gpu-params /proc/driver/nvidia/params
fi
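          # Start the ComputeDomain kubelet plugin as the container's main process.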
compute-domain-kubelet-plugin
resources:
{}
env:
- name: MASK_NVIDIA_DRIVER_PARAMS
value: ""
- name: NVIDIA_CTK_PATH
value: "/home/kubernetes/bin/nvidia/toolkit/nvidia-ctk"
- name: NVIDIA_DRIVER_ROOT
value: "/home/kubernetes/bin/nvidia"
- name: NVIDIA_VISIBLE_DEVICES
value: void
- name: CDI_ROOT
value: /var/run/cdi
- name: NVIDIA_MIG_CONFIG_DEVICES
value: all
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
volumeMounts:
- name: plugins-registry
mountPath: /var/lib/kubelet/plugins_registry
- name: plugins
mountPath: /var/lib/kubelet/plugins
mountPropagation: Bidirectional
- name: cdi
mountPath: /var/run/cdi
# We always mount the driver root at /driver-root in the container.
- name: driver-root
mountPath: /driver-root
readOnly: true
        # Pragmatic solution for host-managed drivers not located at /.
- name: host-dev
mountPath: /dev
volumes:
- name: plugins-registry
hostPath:
path: /var/lib/kubelet/plugins_registry
- name: plugins
hostPath:
path: /var/lib/kubelet/plugins
- name: cdi
hostPath:
path: /var/run/cdi
- name: driver-root
hostPath:
path: /home/kubernetes/bin/nvidia
- name: host-dev
hostPath:
path: /dev
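      # The nodeSelectorTerms below are ORed (matchExpressions within a term
      # are ANDed): the DaemonSet targets nodes matching any one of the NFD
      # PCI vendor label for NVIDIA (10de), the NFD CPU vendor label, or an
      # explicit nvidia.com/gpu.present=true label.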
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: feature.node.kubernetes.io/pci-10de.present
operator: In
values:
- "true"
- matchExpressions:
- key: feature.node.kubernetes.io/cpu-model.vendor_id
operator: In
values:
- ARM
- matchExpressions:
- key: nvidia.com/gpu.present
operator: In
values:
- "true"
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Equal
value: present
---
# Source: nvidia-dra-driver-gpu/templates/controller.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: nvidia-dra-driver-gpu-controller
namespace: nvidia-dra-driver-gpu
labels:
helm.sh/chart: nvidia-dra-driver-gpu-25.3.0-rc.2
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
app.kubernetes.io/version: "25.3.0-rc.2"
app.kubernetes.io/managed-by: Helm
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
template:
metadata:
labels:
app.kubernetes.io/name: nvidia-dra-driver-gpu
app.kubernetes.io/instance: nvidia-dra-driver-gpu
spec:
priorityClassName: system-node-critical
serviceAccountName: nvidia-dra-driver-gpu-service-account
securityContext:
{}
containers:
- name: compute-domain
securityContext:
{}
image: nvcr.io/nvidia/k8s-dra-driver-gpu:v25.3.0-rc.2
imagePullPolicy: IfNotPresent
command: ["compute-domain-controller", "-v", "6"]
resources:
{}
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
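      # Keep the controller off GPU nodes: it only needs API-server access,
      # leaving GPU capacity for the kubelet plugin and workloads.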
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu
operator: DoesNotExist
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Exists
---
# Source: nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-daemon.yaml
apiVersion: resource.k8s.io/v1beta1
kind: DeviceClass
metadata:
name: compute-domain-daemon.nvidia.com
spec:
selectors:
- cel:
expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'daemon'"
---
# Source: nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-default-channel.yaml
apiVersion: resource.k8s.io/v1beta1
kind: DeviceClass
metadata:
name: compute-domain-default-channel.nvidia.com
spec:
selectors:
- cel:
expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0"
---
# Source: nvidia-dra-driver-gpu/templates/validatingadmissionpolicy.yaml
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
name: resourceslices-policy-nvidia-dra-driver-gpu
spec:
failurePolicy: Fail
matchConstraints:
resourceRules:
- apiGroups: ["resource.k8s.io"]
apiVersions: ["v1beta1"]
operations: ["CREATE", "UPDATE", "DELETE"]
resources: ["resourceslices"]
matchConditions:
- name: isRestrictedUser
expression: >-
request.userInfo.username == "system:serviceaccount:nvidia-dra-driver-gpu:nvidia-dra-driver-gpu-service-account"
variables:
- name: userNodeName
expression: >-
request.userInfo.extra[?'authentication.kubernetes.io/node-name'][0].orValue('')
- name: objectNodeName
expression: >-
(request.operation == "DELETE" ? oldObject : object).spec.?nodeName.orValue("")
- name: nodeSelectorValue
expression: >-
(request.operation == "DELETE" ? oldObject : object).spec.?nodeSelector.orValue(null)
- name: allNodesValue
expression: >-
(request.operation == "DELETE" ? oldObject : object).spec.?allNodes.orValue(false)
validations:
- expression: variables.userNodeName != ""
message: >-
        no node association found for user; the user must run in a pod on a node, and the ServiceAccountTokenPodNodeInfo feature gate must be enabled
- expression: variables.userNodeName == variables.objectNodeName || variables.allNodesValue == true || variables.nodeSelectorValue != null
messageExpression: >-
"this user running on node '"+variables.userNodeName+"' may not modify cluster or node resourceslices"
---
# Source: nvidia-dra-driver-gpu/templates/validatingadmissionpolicybinding.yaml
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
name: resourceslices-policy-nvidia-dra-driver-gpu
spec:
policyName: resourceslices-policy-nvidia-dra-driver-gpu
validationActions: [Deny]
# All ResourceSlices are matched.
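#
# After applying this manifest, `kubectl get deviceclasses` should list the
# two compute-domain classes above, and `kubectl get resourceslices` shows
# what each kubelet plugin publishes (assuming the cluster has the DRA APIs
# enabled).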