# deploy/helm/health_checks/gpu_healthcheck/templates/gpu_healthcheck.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# yamllint disable
{{- /*
  Template-local values shared by every manifest in this file.

  $guid           .Values.job.guid, or 8 random alphanumerics lower-cased.
  $check_time     .Values.job.check_time, or the result of `now | unixEpoch`
                  formatted as a string.
  $base_name      .Values.job.base_name, or "chs-hc".
  $unique_suffix  "<guid>-<check_time>".
  $extra_info     .Values.health_check.name formatted as a string.
  $unique_name    "<base_name>-<extra_info>-<unique_suffix>" with every "_"
                  replaced by "-"; used as the name of the Job, ServiceAccount,
                  ClusterRole and ClusterRoleBinding below.
  $expiry_time    $check_time minus HEALTH_VALIDITY_HOURS * 60 * 60, as an int;
                  the node-affinity rule compares the
                  aiinfra/gpu-healthcheck-runtime-sec label against it.

  Every action here trims only the whitespace on its left. None of them trims
  to the right, so the newline after the last assignment survives and the
  document separator that follows is rendered on a line of its own instead of
  being glued to the end of the preceding comment line.
*/}}
{{- $guid := default (lower (randAlphaNum 8)) .Values.job.guid }}
{{- $check_time := default (printf "%s" (now | unixEpoch)) .Values.job.check_time }}
{{- $base_name := default "chs-hc" .Values.job.base_name }}
{{- $unique_suffix := printf "%s-%s" $guid $check_time }}
{{- $extra_info := printf "%s" .Values.health_check.name }}
{{- $unique_name := printf "%s-%s-%s" $base_name $extra_info $unique_suffix | replace "_" "-" }}
{{- $expiry_time := int (sub $check_time (mul .Values.health_check.env.HEALTH_VALIDITY_HOURS 60 60)) }}
---
{{- /* Evaluated once and reused by both node selector terms below. */}}
{{- $require_accelerator_label := ne .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian" }}
# Job running the GPU health-check container next to a logging container.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name }}
spec:
  completionMode: Indexed
  completions: 1
  parallelism: 1
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: {{ $unique_name }}
      # Both containers of the pod share one process namespace.
      shareProcessNamespace: true
      # A key-less "Exists" toleration: no taint keeps the pod off a node.
      tolerations:
      - operator: "Exists"
      volumes: {{- toYaml .Values.volumes | nindent 8 }}
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            # Selector terms are ORed with each other; the expressions
            # inside a single term are ANDed.
            #
            # Term 1: the node carries a runtime label that has expired.
            - matchExpressions:
              {{- if $require_accelerator_label }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              # Matches when the recorded runtime is older than $expiry_time,
              # i.e. outside the HEALTH_VALIDITY_HOURS window
              # (24h by default, per the chart's original note).
              - key: aiinfra/gpu-healthcheck-runtime-sec
                operator: Lt
                values:
                - {{ $expiry_time | quote }}
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # Only nodes whose test label has the configured value qualify.
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
            # Term 2: the node has no runtime label at all.
            - matchExpressions:
              {{- if $require_accelerator_label }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              - key: aiinfra/gpu-healthcheck-runtime-sec
                operator: DoesNotExist
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # Only nodes whose test label has the configured value qualify.
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
      containers:
      - name: "gpu-healthcheck"
        image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag }}
        imagePullPolicy: Always
        securityContext:
          privileged: true
        # Output of the check goes to stdout and, through tee, to a file
        # under /var/log.
        command:
        - "/bin/sh"
        - "-c"
        args:
        - "python3 /app/gpu_healthcheck.py 2>&1 | tee /var/log/gpu_healthcheck.log"
        # The container port is also bound on the node through hostPort.
        ports:
        - containerPort: 5555
          hostPort: 5555
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        {{- /* Each entry of .Values.health_check.env becomes a quoted env var. */}}
        {{- range $env_name, $env_value := .Values.health_check.env }}
        - name: {{ $env_name | quote }}
          value: {{ $env_value | quote }}
        {{- end }}
        volumeMounts: {{- toYaml .Values.health_check.volumeMounts | nindent 8 }}
        resources:
          limits:
            nvidia.com/gpu: 8
      - image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
        name: google-logging
        volumeMounts:
        - mountPath: /var/log
          name: varlog
        - mountPath: /var/secrets/google
          name: fluentbit-key
          readOnly: true
---
# ServiceAccount referenced by the Job's serviceAccountName.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name }}
  # The Job declares no namespace of its own, so it is created in the release
  # namespace. The ServiceAccount must live in that same namespace, otherwise
  # the Job's pod cannot use it when the chart is installed outside "default".
  namespace: {{ .Release.Namespace | quote }}
---
# Cluster-wide permission to list, get and patch Node objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $unique_name }}
rules:
- apiGroups: [""]
  resources: ["nodes"]
  verbs: ["list", "get", "patch"]
---
# Grants the ClusterRole above to the ServiceAccount above.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  # ClusterRoleBinding is cluster-scoped, so it takes no metadata.namespace.
  name: {{ $unique_name }}
subjects:
- kind: ServiceAccount
  name: {{ $unique_name }}
  # Must match the namespace the ServiceAccount is created in.
  namespace: {{ .Release.Namespace | quote }}
roleRef:
  kind: ClusterRole
  name: {{ $unique_name }}
  apiGroup: rbac.authorization.k8s.io