# deploy/helm/health_checks/tinymax_healthcheck/templates/tinymax_healthcheck.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# yamllint disable
{{- /*
Render-time identifiers shared by every resource in this template.

  $guid             .Values.job.guid, or 4 random lower-case alphanumerics.
  $check_time       .Values.job.check_time, or the render time in epoch seconds.
  $base_name        .Values.job.base_name, or "chs-hc".
  $unique_suffix    "<guid>-<check_time>".
  $extra_info       .Values.health_check.name.
  $unique_name      "<base_name>-<extra_info>-<unique_suffix>" with "_" replaced by "-".
  $unique_svc_name  same as $unique_name, with "headless-svc" before the suffix.
  $expiry_time      $check_time minus HEALTH_VALIDITY_HOURS converted to seconds.

Every action trims its surrounding whitespace so this block renders nothing.
Only the final action keeps its trailing newline, so that the "---" marker
that follows starts on its own line instead of being glued to the preceding
comment line.
*/ -}}
{{- $guid := default (lower (randAlphaNum 4)) .Values.job.guid -}}
{{- $check_time := default (printf "%s" (now | unixEpoch)) .Values.job.check_time -}}
{{- $base_name := default "chs-hc" .Values.job.base_name -}}
{{- $unique_suffix := printf "%s-%s" $guid $check_time -}}
{{- $extra_info := printf "%s" .Values.health_check.name -}}
{{- $unique_name := printf "%s-%s-%s" $base_name $extra_info $unique_suffix | replace "_" "-" -}}
{{- $unique_svc_name := printf "%s-%s-headless-svc-%s" $base_name $extra_info $unique_suffix | replace "_" "-" -}}
{{- $expiry_time := int (sub $check_time (mul .Values.health_check.env.HEALTH_VALIDITY_HOURS 60 60)) }}
---
# Headless Service that selects the pods of the health-check Job by their
# job-name label.
apiVersion: v1
kind: Service
metadata:
  name: {{ $unique_svc_name | quote }}
spec:
  # clusterIP must be None to create a headless service.
  clusterIP: None
  selector:
    # Must match the Job name.
    job-name: {{ $unique_name | quote }}
---
{{- /* Machine type of the nodes under test; it selects the node selector, the install script and the tcpd-daemon sidecar below. */}}
{{- $instance_type := .Values.health_check.env.INSTANCE_TYPE }}
# Indexed Job that runs the tinymax health check: NHOSTS pods, all started in
# parallel, each with its own completion index.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name | quote }}
  labels:
    app-name: {{ .Values.health_check.name | quote }}
spec:
  completions: {{ .Values.health_check.env.NHOSTS }}
  parallelism: {{ .Values.health_check.env.NHOSTS }}
  completionMode: Indexed
  template:
    metadata:
      labels:
        app-name: {{ .Values.health_check.name | quote }}
    spec:
      # A key-less "Exists" toleration tolerates every taint.
      tolerations:
      - operator: "Exists"
      serviceAccountName: {{ $unique_name | quote }}
      restartPolicy: Never
      hostNetwork: true
      volumes: {{- toYaml .Values.volumes | nindent 8 }}
      {{- /* The instance-type selector is skipped for a3-megagpu-8g-debian. */}}
      {{- if ne $instance_type "a3-megagpu-8g-debian" }}
      nodeSelector:
        node.kubernetes.io/instance-type: {{ $instance_type | quote }}
      {{- end }}
      affinity:
        nodeAffinity:
          # A node is eligible when it satisfies ALL expressions of at least
          # ONE of the terms below.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            # Term 1: the node is NODE0 or NODE1 and its runtime label value is
            # lower than $expiry_time, i.e. older than HEALTH_VALIDITY_HOURS
            # (presumably the label holds the epoch seconds of the last run;
            # confirm against the runner).
            - matchExpressions:
              {{- if ne $instance_type "a3-megagpu-8g-debian" }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              - key: kubernetes.io/hostname
                operator: In
                values:
                - {{ .Values.health_check.env.NODE0 | quote }}
                - {{ .Values.health_check.env.NODE1 | quote }}
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: Lt
                values:
                - "{{ $expiry_time }}"
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # Only nodes that carry the configured test label are eligible.
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
            # Term 2: the node has no runtime label at all (no hostname
            # restriction in this term).
            - matchExpressions:
              {{- if ne $instance_type "a3-megagpu-8g-debian" }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: DoesNotExist
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # Only nodes that carry the configured test label are eligible.
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
          # Among eligible nodes, those without the runtime label (weight 50)
          # are preferred over those whose label value is below $expiry_time
          # (weight 1).
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 50
            preference:
              matchExpressions:
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: DoesNotExist
          - weight: 1
            preference:
              matchExpressions:
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: Lt
                values:
                - "{{ $expiry_time }}"
      {{- /* Image and mount come from .Values.initContainers.nccl_plugin_installer; the install script depends on the instance type. */}}
      initContainers:
      - name: nccl-plugin-installer
        image: {{ .Values.initContainers.nccl_plugin_installer.image | quote }}
        imagePullPolicy: {{ .Values.initContainers.nccl_plugin_installer.imagePullPolicy | quote }}
        volumeMounts:
        - name: {{ .Values.initContainers.nccl_plugin_installer.volumeMounts.name | quote }}
          mountPath: {{ .Values.initContainers.nccl_plugin_installer.volumeMounts.mountPath | quote }}
        resources:
          requests:
            cpu: 150m
        # The script is empty when the instance type matches none of the
        # branches below.
        command:
        - /bin/sh
        - -c
        - |
          {{- if eq $instance_type "a3-ultragpu-8g" }}
          /scripts/container_entry.sh install --install-nccl
          cp -R /var/lib/gib/. /usr/local/gib
          {{- else if eq $instance_type "a3-megagpu-8g-debian" }}
          mkdir -p /usr/local/nvidia/lib64
          cp -r /var/lib/fastrak/lib64/. /usr/local/nvidia/lib64
          echo "installation finishes"
          {{- else if eq $instance_type "a3-highgpu-8g" }}
          /scripts/container_entry.sh install --install-nccl
          cp -R /var/lib/tcpx/. /usr/local/tcpx
          echo "installation finishes"
          {{- else if eq $instance_type "a3-megagpu-8g" }}
          /scripts/container_entry.sh install --install-nccl
          cp -R /var/lib/tcpxo/. /usr/local/tcpxo
          {{- end }}
      {{- /* The tcpd-daemon sidecar is only rendered for the a3-megagpu-8g, a3-megagpu-8g-debian and a3-highgpu-8g instance types. */}}
      shareProcessNamespace: true
      containers:
      {{- if or (eq $instance_type "a3-megagpu-8g") (eq $instance_type "a3-megagpu-8g-debian") (eq $instance_type "a3-highgpu-8g") }}
      - name: tcpd-daemon
        image: {{ .Values.tcpd_daemon.image | quote }}
        imagePullPolicy: {{ .Values.tcpd_daemon.imagePullPolicy | quote }}
        command: {{ .Values.tcpd_daemon.command | toJson }}
        args: {{ .Values.tcpd_daemon.args | toJson }}
        securityContext:
          privileged: true
        volumeMounts: {{- toYaml .Values.tcpd_daemon.volumeMounts | nindent 8 }}
      {{- end }}
      - name: tinymax-healthcheck
        image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag | quote }}
        imagePullPolicy: {{ .Values.health_check.image.pull_policy | quote }}
        command: ["/bin/sh", "-c"]
        # NOTE(review): the exit status of this pipeline is that of `tee`, so a
        # failing runner does not fail the container - confirm this is intended.
        args: ["python3 /scripts/tinymax_runner.py 2>&1 | tee /var/log/tinymax_healthcheck.log"]
        securityContext:
          privileged: true
          capabilities:
            add:
            - SYS_ADMIN
            - SYS_PTRACE
            - IPC_LOCK
        {{- /* Downward-API fields and render-time identifiers first, then every key of .Values.health_check.env. */}}
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        # Completion index assigned to this pod by the Indexed Job.
        - name: NODE_RANK
          valueFrom:
            fieldRef:
              fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
        - name: JOB_NAME
          value: "{{ $unique_name }}"
        - name: SERVICE_NAME
          value: "{{ $unique_svc_name }}"
        - name: SHORT_GUID
          value: "{{ $guid }}"
        - name: CHECK_TIME_EPOCH_SEC
          value: "{{ $check_time }}"
        {{- range $key, $value := .Values.health_check.env }}
        - name: {{ $key | quote }}
          value: {{ $value | quote }}
        {{- end }} {{- /* end iteration over .env */}}
        volumeMounts: {{- toYaml .Values.health_check.volumeMounts | nindent 8 }}
        resources:
          limits:
            nvidia.com/gpu: 8
      # Sidecar that mounts /var/log, the directory the health check writes its
      # log file to.
      - name: google-logging
        image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
        volumeMounts:
        - name: varlog
          mountPath: /var/log
        - name: fluentbit-key
          mountPath: /var/secrets/google
          readOnly: true
---
# ServiceAccount the health-check pods run as. It is created in the release
# namespace, which is where the Job (rendered without an explicit namespace)
# is installed, so the Job's serviceAccountName always resolves.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name | quote }}
  namespace: {{ .Release.Namespace | quote }}
---
# Cluster-wide permissions granted to the health-check ServiceAccount.
# NOTE(review): this allows create/delete/patch on clusterroles and
# clusterrolebindings - confirm the runner really needs that much.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $unique_name | quote }}
rules:
- apiGroups: ["", "apps", "rbac.authorization.k8s.io", "batch"]
  resources: ["daemonsets", "serviceaccounts", "clusterrolebindings", "clusterroles", "nodes", "jobs", "services"]
  verbs: ["list", "get", "create", "delete", "watch", "patch"]
---
# Binds the ClusterRole to the ServiceAccount. A ClusterRoleBinding is
# cluster-scoped, so it carries no metadata.namespace; only the subject is
# namespaced and it must name the namespace the ServiceAccount lives in.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ $unique_name | quote }}
subjects:
- kind: ServiceAccount
  name: {{ $unique_name | quote }}
  namespace: {{ .Release.Namespace | quote }}
roleRef:
  kind: ClusterRole
  name: {{ $unique_name | quote }}
  apiGroup: rbac.authorization.k8s.io