deploy/helm/health_checks/tinymax_healthcheck/templates/tinymax_healthcheck.yaml (215 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Renders one tinymax health-check run: a headless Service, an Indexed Job,
# and the ServiceAccount / ClusterRole / ClusterRoleBinding the Job runs under.
# Every object name carries a "<guid>-<check_time>" suffix.

# yamllint disable
{{- /* Short id and timestamp for this run; both can be pinned through .Values.job. */ -}}
{{- $guid := default (lower (randAlphaNum 4)) .Values.job.guid -}}
{{- $check_time := default (printf "%s" (now | unixEpoch)) .Values.job.check_time -}}
{{- $base_name := default "chs-hc" .Values.job.base_name -}}
{{- $unique_suffix := printf "%s-%s" $guid $check_time -}}
{{- $extra_info := printf "%s" .Values.health_check.name -}}
{{- /* Underscores are replaced with dashes in the generated object names. */ -}}
{{- $unique_name := printf "%s-%s-%s" $base_name $extra_info $unique_suffix | replace "_" "-" -}}
{{- $unique_svc_name := printf "%s-%s-headless-svc-%s" $base_name $extra_info $unique_suffix | replace "_" "-" -}}
{{- /* check_time minus HEALTH_VALIDITY_HOURS * 3600; compared against the runtime-sec node label below. */ -}}
{{- $expiry_time := int (sub $check_time (mul .Values.health_check.env.HEALTH_VALIDITY_HOURS 60 60)) }}
---
apiVersion: v1
kind: Service
metadata:
  name: {{ $unique_svc_name }}
spec:
  clusterIP: None  # clusterIP must be None to create a headless service
  selector:
    job-name: {{ $unique_name }}  # must match Job name
---
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name }}
  labels:
    app-name: {{ .Values.health_check.name | quote }}
spec:
  # One indexed completion per host; all of them run in parallel.
  completions: {{ .Values.health_check.env.NHOSTS }}
  parallelism: {{ .Values.health_check.env.NHOSTS }}
  completionMode: Indexed
  template:
    metadata:
      labels:
        app-name: {{ .Values.health_check.name | quote }}
    spec:
      tolerations:
      - operator: "Exists"
      serviceAccountName: {{ $unique_name }}
      restartPolicy: Never
      hostNetwork: true
      volumes:
      {{- toYaml .Values.volumes | nindent 8 }}
      {{- /* The instance-type selector is skipped for a3-megagpu-8g-debian. */}}
      {{- if ne .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian" }}
      nodeSelector:
        node.kubernetes.io/instance-type: {{ .Values.health_check.env.INSTANCE_TYPE | quote }}
      {{- end }}
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            # Term 1: the named nodes whose recorded runtime label has expired.
            - matchExpressions:
              {{- if ne .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian" }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              # Only the nodes named by NODE0 / NODE1 match this term.
              - key: kubernetes.io/hostname
                operator: In
                values:
                - {{ .Values.health_check.env.NODE0 | quote }}
                - {{ .Values.health_check.env.NODE1 | quote }}
              # Triggers when the label value is expired (default=24h).
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: Lt
                values:
                - "{{ $expiry_time }}"
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # If label is true then it will trigger
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
            # Term 2: nodes on which the runtime label does not exist.
            - matchExpressions:
              {{- if ne .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian" }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: DoesNotExist
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # If label is true then it will trigger
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
          # prefer rule with higher weight
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 50
            preference:
              matchExpressions:
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: DoesNotExist
          - weight: 1
            preference:
              matchExpressions:
              - key: aiinfra/tinymax-healthcheck-runtime-sec
                operator: Lt
                values:
                - "{{ $expiry_time }}"
      {{- /* Different image & volume depending on accelerator type */}}
      initContainers:
      - name: nccl-plugin-installer
        image: {{ .Values.initContainers.nccl_plugin_installer.image | quote }}
        imagePullPolicy: {{ .Values.initContainers.nccl_plugin_installer.imagePullPolicy | quote }}
        volumeMounts:
        - name: {{ .Values.initContainers.nccl_plugin_installer.volumeMounts.name | quote }}
          mountPath: {{ .Values.initContainers.nccl_plugin_installer.volumeMounts.mountPath | quote }}
        resources:
          requests:
            cpu: 150m
        command:
        - /bin/sh
        - -c
        - |
          {{- if eq .Values.health_check.env.INSTANCE_TYPE "a3-ultragpu-8g" }}
          /scripts/container_entry.sh install --install-nccl
          cp -R /var/lib/gib/. /usr/local/gib
          {{- else if eq .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian" }}
          mkdir -p /usr/local/nvidia/lib64
          cp -r /var/lib/fastrak/lib64/. /usr/local/nvidia/lib64
          echo "installation finishes"
          {{- else if eq .Values.health_check.env.INSTANCE_TYPE "a3-highgpu-8g" }}
          /scripts/container_entry.sh install --install-nccl
          cp -R /var/lib/tcpx/. /usr/local/tcpx
          echo "installation finishes"
          {{- else if eq .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g" }}
          /scripts/container_entry.sh install --install-nccl
          cp -R /var/lib/tcpxo/. /usr/local/tcpxo
          {{- end }}
      {{- /* Different tcpd-daemon container depending on accelerator type */}}
      shareProcessNamespace: true
      containers:
      {{- if or (eq .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g") (eq .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian") (eq .Values.health_check.env.INSTANCE_TYPE "a3-highgpu-8g") }}
      - name: tcpd-daemon
        image: {{ .Values.tcpd_daemon.image | quote }}
        imagePullPolicy: {{ .Values.tcpd_daemon.imagePullPolicy | quote }}
        command: {{ .Values.tcpd_daemon.command | toJson }}
        args: {{ .Values.tcpd_daemon.args | toJson }}
        securityContext:
          privileged: true
        volumeMounts:
        {{- toYaml .Values.tcpd_daemon.volumeMounts | nindent 8 }}
      {{- end }}
      - name: tinymax-healthcheck
        image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag | quote }}
        imagePullPolicy: {{ .Values.health_check.image.pull_policy | quote }}
        command: ["/bin/sh", "-c"]
        # Runner output (stdout and stderr) is also written to /var/log/tinymax_healthcheck.log.
        args: ["python3 /scripts/tinymax_runner.py 2>&1 | tee /var/log/tinymax_healthcheck.log"]
        securityContext:
          privileged: true
          capabilities:
            add:
            - SYS_ADMIN
            - SYS_PTRACE
            - IPC_LOCK
        {{- /* Different environment variables depending on accelerator type */}}
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        # Taken from the Job's completion-index annotation.
        - name: NODE_RANK
          valueFrom:
            fieldRef:
              fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
        - name: JOB_NAME
          value: "{{ $unique_name }}"
        - name: SERVICE_NAME
          value: "{{ $unique_svc_name }}"
        - name: SHORT_GUID
          value: "{{ $guid }}"
        - name: CHECK_TIME_EPOCH_SEC
          value: "{{ $check_time }}"
        # Every key under .Values.health_check.env is passed through as a quoted string.
        {{- range $key, $value := .Values.health_check.env }}
        - name: {{ $key | quote }}
          value: {{ $value | quote }}
        {{- end }}
        {{- /* end iteration over .env */}}
        volumeMounts:
        {{- toYaml .Values.health_check.volumeMounts | nindent 8 }}
        resources:
          limits:
            nvidia.com/gpu: !!int 8
      - name: google-logging
        image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
        volumeMounts:
        - name: varlog
          mountPath: /var/log
        - name: fluentbit-key
          mountPath: /var/secrets/google
          readOnly: true
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name }}
  # Must be the namespace the Job is created in, otherwise the Job's
  # serviceAccountName cannot be resolved.
  namespace: {{ .Release.Namespace | quote }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $unique_name }}
rules:
- apiGroups: ["", "apps", "rbac.authorization.k8s.io", "batch"]
  resources: ["daemonsets", "serviceaccounts", "clusterrolebindings", "clusterroles", "nodes", "jobs", "services"]
  verbs: ["list", "get", "create", "delete", "watch", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  # ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace.
  name: {{ $unique_name }}
subjects:
- kind: ServiceAccount
  name: {{ $unique_name }}
  namespace: {{ .Release.Namespace | quote }}
roleRef:
  kind: ClusterRole
  name: {{ $unique_name }}
  apiGroup: rbac.authorization.k8s.io