nvidia-driver-installer/cos/daemonset-confidential.yaml (186 lines of code) (raw):

# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The Dockerfile and other source for this daemonset are in
# https://cos.googlesource.com/cos/tools/+/refs/heads/master/src/cmd/cos_gpu_installer/
#
# This is the same as ../../daemonset.yaml except that it assumes that the
# docker image is present on the node instead of downloading from GCR. This
# allows easier upgrades because GKE can preload the correct image on the
# node and the daemonset can just use that image.
# DaemonSet that installs the NVIDIA GPU driver on nodes carrying the
# cloud.google.com/gke-confidential-nodes-instance-type=TDX label. All work is
# done by init containers; the main container is a pause placeholder.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: kube-system
  labels:
    k8s-app: nvidia-driver-installer
spec:
  selector:
    matchLabels:
      k8s-app: nvidia-driver-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
        k8s-app: nvidia-driver-installer
    spec:
      priorityClassName: system-node-critical
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # All three expressions belong to one term, so they are ANDed:
              # an accelerator label is present, no gke-gpu-driver-version
              # label is present, and the confidential instance type is TDX.
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
                  - key: cloud.google.com/gke-gpu-driver-version
                    operator: DoesNotExist
                  - key: cloud.google.com/gke-confidential-nodes-instance-type
                    operator: In
                    values:
                      - TDX
      # Tolerate every taint so the installer can be scheduled on tainted nodes.
      tolerations:
        - operator: "Exists"
      hostNetwork: true
      hostPID: true
      volumes:
        - name: dev
          hostPath:
            path: /dev
        - name: vulkan-icd-mount
          hostPath:
            path: /home/kubernetes/bin/nvidia/vulkan/icd.d
        - name: nvidia-install-dir-host
          hostPath:
            path: /home/kubernetes/bin/nvidia
        - name: root-mount
          hostPath:
            path: /
        - name: cos-tools
          hostPath:
            path: /var/lib/cos-tools
        - name: nvidia-config
          hostPath:
            path: /etc/nvidia
      initContainers:
        # Step 1: install and load the driver kernel modules.
        # imagePullPolicy: Never means the image must already exist on the node.
        - image: "cos-nvidia-installer:fixed"
          imagePullPolicy: Never
          name: nvidia-driver-installer
          resources:
            requests:
              cpu: 150m
          securityContext:
            privileged: true
          env:
            - name: NVIDIA_INSTALL_DIR_HOST
              value: /home/kubernetes/bin/nvidia
            - name: NVIDIA_INSTALL_DIR_CONTAINER
              value: /usr/local/nvidia
            - name: VULKAN_ICD_DIR_HOST
              value: /home/kubernetes/bin/nvidia/vulkan/icd.d
            - name: VULKAN_ICD_DIR_CONTAINER
              value: /etc/vulkan/icd.d
            - name: ROOT_MOUNT_DIR
              value: /root
            - name: COS_TOOLS_DIR_HOST
              value: /var/lib/cos-tools
            - name: COS_TOOLS_DIR_CONTAINER
              value: /build/cos-tools
          volumeMounts:
            - name: nvidia-install-dir-host
              mountPath: /usr/local/nvidia
            - name: vulkan-icd-mount
              mountPath: /etc/vulkan/icd.d
            - name: dev
              mountPath: /dev
            - name: root-mount
              mountPath: /root
            - name: cos-tools
              mountPath: /build/cos-tools
            - name: nvidia-config
              mountPath: /etc/nvidia
          command:
            - bash
            - -c
            - |
              echo "Checking for existing GPU driver modules"

              # Read the node labels from the metadata server. The failure
              # check sits outside $( ) on purpose: an `exit` inside a command
              # substitution only leaves the subshell, not this script.
              LABELS=$( curl --retry 5 -H "Metadata-Flavor:Google" \
                http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels ) || exit 1

              # kube-labels is a comma-separated list of key=value pairs.
              # Record the confidential instance type in /etc/nvidia, which is
              # a host path also mounted into the later init containers.
              SAVED_IFS="${IFS}"
              IFS=,
              for label in $LABELS; do
                IFS='=' read -r LABEL VALUE <<< "$label"
                if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then
                  CONFIDENTIAL_INSTANCE_TYPE=$VALUE
                  echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt
                fi
              done
              IFS="${SAVED_IFS}"

              if lsmod | grep nvidia; then
                echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation"
                exit 0
              elif [[ "${CONFIDENTIAL_INSTANCE_TYPE}" == "TDX" ]]; then
                echo "No GPU driver module detected, installing now"
                /cos-gpu-installer install --no-verify --version=default || exit 1
                # Module loading is best-effort: a failure of one command does
                # not stop the following ones.
                /sbin/modprobe -d /root drm_kms_helper
                /sbin/insmod /usr/local/nvidia/drivers/nvidia.ko
                /sbin/insmod /usr/local/nvidia/drivers/nvidia-uvm.ko
                /sbin/insmod /usr/local/nvidia/drivers/nvidia-modeset.ko
                /sbin/insmod /usr/local/nvidia/drivers/nvidia-drm.ko
                /usr/local/nvidia/bin/nvidia-modprobe -c0 -u -m
                # The host root is mounted at /root, so this path is the host's
                # /home/kubernetes/bin/nvidia install directory.
                chmod 755 /root/home/kubernetes/bin/nvidia
              else
                echo "Confidential GPU is not supported on this VM, skipping driver installation"
              fi
        # Step 2: nvidia-persistenced installer. restartPolicy: Always on an
        # init container marks it as a sidecar: it is started in order and then
        # kept running while the remaining containers start.
        - image: "gcr.io/gke-release/nvidia-persistenced-installer@sha256:e875101ea7bddcef6e628359e3a8f02fdfbcd05f6efe75bc7ad9457ef4020a04"
          name: "nvidia-persistenced-installer"
          restartPolicy: Always
          securityContext:
            privileged: true
          env:
            - name: LD_LIBRARY_PATH
              value: /usr/local/nvidia/lib64
            - name: NVIDIA_INSTALL_DIR_HOST
              value: /home/kubernetes/bin/nvidia
            - name: NVIDIA_INSTALL_DIR_CONTAINER
              value: /usr/local/nvidia
            - name: VULKAN_ICD_DIR_HOST
              value: /home/kubernetes/bin/nvidia/vulkan/icd.d
            - name: VULKAN_ICD_DIR_CONTAINER
              value: /etc/vulkan/icd.d
            - name: ROOT_MOUNT_DIR
              value: /root
            - name: COS_TOOLS_DIR_HOST
              value: /var/lib/cos-tools
            - name: COS_TOOLS_DIR_CONTAINER
              value: /build/cos-tools
          volumeMounts:
            # HostToContainer: mounts created on the host under these paths
            # after the container starts become visible inside the container.
            - name: nvidia-install-dir-host
              mountPath: /usr/local/nvidia
              mountPropagation: HostToContainer
            - name: root-mount
              mountPath: /root
              mountPropagation: HostToContainer
            - name: nvidia-config
              mountPath: /etc/nvidia
              mountPropagation: HostToContainer
            - name: vulkan-icd-mount
              mountPath: /etc/vulkan/icd.d
            - name: dev
              mountPath: /dev
            - name: cos-tools
              mountPath: /build/cos-tools
        # Step 3: GPU partitioning, using the driver libraries installed above.
        - image: "gcr.io/gke-release/nvidia-partition-gpu@sha256:116be6b7335c1d34366223b9a3780fe80d862fcf06cd2c580426fdc1697af693"
          name: partition-gpus
          env:
            - name: LD_LIBRARY_PATH
              value: /usr/local/nvidia/lib64
          resources:
            requests:
              cpu: 150m
          securityContext:
            privileged: true
          volumeMounts:
            - name: nvidia-install-dir-host
              mountPath: /usr/local/nvidia
            - name: dev
              mountPath: /dev
            - name: nvidia-config
              mountPath: /etc/nvidia
      containers:
        # Placeholder main container; the actual work happens in the init
        # containers above.
        - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
          name: pause