# nvidia-driver-installer/cos/daemonset-confidential.yaml
# Copyright 2017 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The Dockerfile and other source for this daemonset are in
# https://cos.googlesource.com/cos/tools/+/refs/heads/master/src/cmd/cos_gpu_installer/
#
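# This is the same as ../../daemonset.yaml except that it assumes that the
# driver installer image is already present on the node (it is referenced with
# imagePullPolicy: Never) instead of downloading it from GCR. This allows
# easier upgrades because GKE can preload the correct image on the node and
# the daemonset can just use that image.
#
# This variant is only scheduled onto nodes labeled
# cloud.google.com/gke-confidential-nodes-instance-type=TDX.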
# DaemonSet that installs and loads the NVIDIA GPU driver on GPU nodes that
# are labeled as TDX confidential nodes. All real work happens in the init
# containers; the only regular container is a pause image.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: kube-system
  labels:
    k8s-app: nvidia-driver-installer
spec:
  selector:
    matchLabels:
      k8s-app: nvidia-driver-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
        k8s-app: nvidia-driver-installer
    spec:
      priorityClassName: system-node-critical
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            # All three expressions must hold: the node has an accelerator
            # label, has no gke-gpu-driver-version label, and is labeled as a
            # TDX confidential node.
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              - key: cloud.google.com/gke-gpu-driver-version
                operator: DoesNotExist
              - key: cloud.google.com/gke-confidential-nodes-instance-type
                operator: In
                values:
                - TDX
      # A key-less "Exists" toleration matches every taint.
      tolerations:
      - operator: "Exists"
      hostNetwork: true
      hostPID: true
      # Host directories shared with the init containers below.
      volumes:
      - name: dev
        hostPath:
          path: /dev
      - name: vulkan-icd-mount
        hostPath:
          path: /home/kubernetes/bin/nvidia/vulkan/icd.d
      - name: nvidia-install-dir-host
        hostPath:
          path: /home/kubernetes/bin/nvidia
      - name: root-mount
        hostPath:
          path: /
      - name: cos-tools
        hostPath:
          path: /var/lib/cos-tools
      - name: nvidia-config
        hostPath:
          path: /etc/nvidia
      initContainers:
      # Step 1: install the driver and load the kernel modules. The image is
      # never pulled (imagePullPolicy: Never), so it must already be on the
      # node.
      - image: "cos-nvidia-installer:fixed"
        imagePullPolicy: Never
        name: nvidia-driver-installer
        resources:
          requests:
            cpu: 150m
        securityContext:
          privileged: true
        env:
        - name: NVIDIA_INSTALL_DIR_HOST
          value: /home/kubernetes/bin/nvidia
        - name: NVIDIA_INSTALL_DIR_CONTAINER
          value: /usr/local/nvidia
        - name: VULKAN_ICD_DIR_HOST
          value: /home/kubernetes/bin/nvidia/vulkan/icd.d
        - name: VULKAN_ICD_DIR_CONTAINER
          value: /etc/vulkan/icd.d
        - name: ROOT_MOUNT_DIR
          value: /root
        - name: COS_TOOLS_DIR_HOST
          value: /var/lib/cos-tools
        - name: COS_TOOLS_DIR_CONTAINER
          value: /build/cos-tools
        volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
        - name: vulkan-icd-mount
          mountPath: /etc/vulkan/icd.d
        - name: dev
          mountPath: /dev
        - name: root-mount
          mountPath: /root
        - name: cos-tools
          mountPath: /build/cos-tools
        - name: nvidia-config
          mountPath: /etc/nvidia
        command:
        - bash
        - -c
        - |
          echo "Checking for existing GPU driver modules"
          # Read the node's kube-labels attribute from the metadata server.
          # `|| exit 1` has to sit outside $( ... ): inside it would only
          # leave the command-substitution subshell and the script would
          # carry on with an empty LABELS.
          LABELS=$(curl --retry 5 -H "Metadata-Flavor:Google" \
            http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels) || exit 1
          # The value is parsed as comma-separated key=value pairs. IFS is set
          # only for the individual read calls so the shell's default word
          # splitting is left untouched for the rest of the script.
          IFS=',' read -r -a NODE_LABELS <<< "${LABELS}"
          for label in "${NODE_LABELS[@]}"; do
            IFS='=' read -r LABEL VALUE <<< "${label}"
            if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then
              CONFIDENTIAL_INSTANCE_TYPE=${VALUE}
              # Persist the node type under /etc/nvidia (a host mount) so it
              # is visible to the other containers that mount that directory.
              echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt
            fi
          done
          if lsmod | grep nvidia; then
            echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation"
            exit 0
          else
            if [[ "${CONFIDENTIAL_INSTANCE_TYPE}" == "TDX" ]]; then
              echo "No GPU driver module detected, installing now"
              /cos-gpu-installer install --no-verify --version=default || exit 1
              # Load the modules explicitly, always via absolute paths so the
              # result does not depend on the container's working directory.
              # A failure of one load does not abort the remaining ones.
              /sbin/modprobe -d /root drm_kms_helper
              /sbin/insmod /usr/local/nvidia/drivers/nvidia.ko
              /sbin/insmod /usr/local/nvidia/drivers/nvidia-uvm.ko
              /sbin/insmod /usr/local/nvidia/drivers/nvidia-modeset.ko
              /sbin/insmod /usr/local/nvidia/drivers/nvidia-drm.ko
              /usr/local/nvidia/bin/nvidia-modprobe -c0 -u -m
              chmod 755 /root/home/kubernetes/bin/nvidia
            else
              echo "Confidential GPU is not supported on this VM, skipping driver installation"
            fi
          fi
      # Step 2: nvidia-persistenced installer. restartPolicy: Always on an
      # init container marks it as a restartable (sidecar) init container.
      - image: "gcr.io/gke-release/nvidia-persistenced-installer@sha256:e875101ea7bddcef6e628359e3a8f02fdfbcd05f6efe75bc7ad9457ef4020a04"
        name: "nvidia-persistenced-installer"
        restartPolicy: Always
        securityContext:
          privileged: true
        env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
        - name: NVIDIA_INSTALL_DIR_HOST
          value: /home/kubernetes/bin/nvidia
        - name: NVIDIA_INSTALL_DIR_CONTAINER
          value: /usr/local/nvidia
        - name: VULKAN_ICD_DIR_HOST
          value: /home/kubernetes/bin/nvidia/vulkan/icd.d
        - name: VULKAN_ICD_DIR_CONTAINER
          value: /etc/vulkan/icd.d
        - name: ROOT_MOUNT_DIR
          value: /root
        - name: COS_TOOLS_DIR_HOST
          value: /var/lib/cos-tools
        - name: COS_TOOLS_DIR_CONTAINER
          value: /build/cos-tools
        volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
          mountPropagation: HostToContainer
        - name: root-mount
          mountPath: /root
          mountPropagation: HostToContainer
        - name: nvidia-config
          mountPath: /etc/nvidia
          mountPropagation: HostToContainer
        - name: vulkan-icd-mount
          mountPath: /etc/vulkan/icd.d
        - name: dev
          mountPath: /dev
        - name: cos-tools
          mountPath: /build/cos-tools
      # Step 3: GPU partitioning, using the libraries under
      # /usr/local/nvidia/lib64 and the configuration mounted at /etc/nvidia.
      - image: "gcr.io/gke-release/nvidia-partition-gpu@sha256:116be6b7335c1d34366223b9a3780fe80d862fcf06cd2c580426fdc1697af693"
        name: partition-gpus
        env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
        resources:
          requests:
            cpu: 150m
        securityContext:
          privileged: true
        volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
        - name: dev
          mountPath: /dev
        - name: nvidia-config
          mountPath: /etc/nvidia
      # The only regular container is a pause image.
      containers:
      - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
        name: pause