# gpudirect-tcpx/nri-device-injector.yaml (80 lines of code) (raw):
# Copyright 2024 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This daemonset enables the containerd NRI (Node Resource Interface) plugin
# on H100 GPU nodes, restarting containerd when the config had to be changed,
# and then runs the NRI device injector on those nodes.
# DaemonSet for GKE nodes with H100 accelerators. An init container turns on
# containerd's NRI plugin in the host's containerd config (restarting
# containerd only when it had to edit the file); the long-running container
# then runs the nri-device-injector image with the host's /dev and NRI socket
# directory mounted.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: device-injector
  namespace: kube-system
  labels:
    k8s-app: device-injector
spec:
  selector:
    matchLabels:
      k8s-app: device-injector
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: device-injector
        k8s-app: device-injector
    spec:
      priorityClassName: system-node-critical
      # Schedule only onto nodes labelled with one of the H100 accelerator
      # types listed below.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h100-80gb
                      - nvidia-h100-mega-80gb
      # A toleration with no key and operator Exists matches every taint.
      tolerations:
        - operator: "Exists"
      hostNetwork: true
      # NOTE(review): presumably required so that `systemctl` in the init
      # container can reach the host's systemd - confirm before removing.
      hostPID: true
      initContainers:
        # NOTE(review): this privileged container is pulled by the mutable
        # `latest` tag; consider pinning it by digest like the main container.
        - image: "gke.gcr.io/gke-distroless/bash:latest"
          name: enable-nri
          securityContext:
            privileged: true
          # The host root filesystem is mounted at the container's `/`, so the
          # paths used by the script below refer to files on the host.
          volumeMounts:
            - name: root
              mountPath: /
          # Appends an NRI plugin section to the host's containerd config and
          # restarts containerd, but only when the config does not already
          # mention the NRI plugin ID. The guard matches the full plugin ID as
          # a fixed string so that an unrelated "nri" substring elsewhere in
          # the file cannot suppress the change. An existing NRI section is
          # left untouched, whatever its `disable` value is.
          command:
            - '/bin/bash'
            - '-c'
            - |
              if ! grep -qF 'io.containerd.nri.v1.nri' /etc/containerd/config.toml; then
              echo "[plugins.\"io.containerd.nri.v1.nri\"]
              disable = false
              disable_connections = false
              plugin_config_path = \"/etc/nri/conf.d\"
              plugin_path = \"/home/kubernetes/nri/plugins\"
              plugin_registration_timeout = \"5s\"
              plugin_request_timeout = \"5s\"
              socket_path = \"/var/run/nri/nri.sock\"">> /etc/containerd/config.toml
              systemctl restart containerd.service
              fi
      containers:
        # Image is pinned by digest.
        - image: "gcr.io/gke-release/nri-device-injector@sha256:7704e2bd74b8edbb76b6913c7904cc2362f1fa887c4d4aba7b19778ea353537c"
          name: device-injector
          resources:
            requests:
              cpu: 150m
          securityContext:
            privileged: true
          # /var/run/nri is the directory holding the socket_path configured
          # by the init container; /dev exposes the host's device nodes.
          volumeMounts:
            - name: dev
              mountPath: /dev
            - name: nri
              mountPath: /var/run/nri
      # Host directories backing the mounts above.
      volumes:
        - name: root
          hostPath:
            path: /
        - name: nri
          hostPath:
            path: /var/run/nri
        - name: dev
          hostPath:
            path: /dev