deploy/helm/health_checks/gpu_healthcheck/templates/gpu_healthcheck.yaml (121 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# yamllint disable
{{- /*
Template-local variables.

  $guid          .Values.job.guid, or a random 8-character lowercase
                 alphanumeric string when unset.
  $check_time    .Values.job.check_time, or the current Unix epoch (seconds).
  $base_name     .Values.job.base_name, or "chs-hc" when unset.
  $unique_suffix "<guid>-<check_time>".
  $extra_info    .Values.health_check.name.
  $unique_name   "<base_name>-<extra_info>-<unique_suffix>" with "_" replaced
                 by "-"; shared by the Job, ServiceAccount, ClusterRole and
                 ClusterRoleBinding below.
  $expiry_time   $check_time minus HEALTH_VALIDITY_HOURS * 60 * 60; nodes whose
                 aiinfra/gpu-healthcheck-runtime-sec label is lower than this
                 are eligible for a new run.

NOTE(review): the printf "%s" calls assume job.guid, job.check_time and
health_check.name are strings when overridden -- confirm callers pass them
as strings (e.g. via --set-string).
*/ -}}
{{- $guid := default (lower (randAlphaNum 8)) .Values.job.guid -}}
{{- $check_time := default (printf "%s" (now | unixEpoch)) .Values.job.check_time -}}
{{- $base_name := default "chs-hc" .Values.job.base_name -}}
{{- $unique_suffix := printf "%s-%s" $guid $check_time -}}
{{- $extra_info := printf "%s" .Values.health_check.name }}
{{- $unique_name := printf "%s-%s-%s" $base_name $extra_info $unique_suffix | replace "_" "-" }}
{{- $expiry_time := int (sub $check_time (mul .Values.health_check.env.HEALTH_VALIDITY_HOURS 60 60)) }}
---
# Job that runs the GPU health check once (a single indexed completion).
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name | quote }}
  # Pinned to the release namespace so the Job and the ServiceAccount it
  # references are always created side by side.
  namespace: {{ .Release.Namespace | quote }}
spec:
  completions: 1
  parallelism: 1
  completionMode: Indexed
  template:
    spec:
      serviceAccountName: {{ $unique_name | quote }}
      restartPolicy: Never
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            # Multiple matchExpressions entries are ORed with each other.
            # Statements within a single matchExpressions are ANDed.
            #
            # Term 1: the node carries a runtime label that has expired.
            - matchExpressions:
              {{- if ne .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian" }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              # Matches when the label value is older than the expiry time
              # (default validity = 24h).
              - key: aiinfra/gpu-healthcheck-runtime-sec
                operator: Lt
                values:
                - {{ $expiry_time | quote }}
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # Only nodes whose test label has the configured value match.
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
            # Term 2: the node has never been checked (the runtime label
            # does not exist).
            - matchExpressions:
              {{- if ne .Values.health_check.env.INSTANCE_TYPE "a3-megagpu-8g-debian" }}
              - key: cloud.google.com/gke-accelerator
                operator: Exists
              {{- end }}
              - key: aiinfra/gpu-healthcheck-runtime-sec
                operator: DoesNotExist
              - key: aiinfra/node-not-ready
                operator: DoesNotExist
              # Only nodes whose test label has the configured value match.
              - key: {{ .Values.health_check.test_label.name | quote }}
                operator: In
                values:
                - {{ .Values.health_check.test_label.value | quote }}
      # Tolerate every taint.
      tolerations:
      - operator: "Exists"
      volumes:
        {{- toYaml .Values.volumes | nindent 8 }}
      shareProcessNamespace: true
      containers:
      - image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag | quote }}
        name: "gpu-healthcheck"
        imagePullPolicy: Always
        command: ["/bin/sh", "-c"]
        # NOTE(review): under /bin/sh the pipeline's exit status is that of
        # `tee`, not of the Python script -- confirm that the Job is meant to
        # complete successfully regardless of the health-check result.
        args: ["python3 /app/gpu_healthcheck.py 2>&1 | tee /var/log/gpu_healthcheck.log"]
        ports:
        - containerPort: 5555
          hostPort: 5555
        securityContext:
          privileged: true
        volumeMounts:
          {{- toYaml .Values.health_check.volumeMounts | nindent 8 }}
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: NODE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        # Every entry of .Values.health_check.env is passed through as a
        # string-valued environment variable.
        {{- range $key, $value := .Values.health_check.env }}
        - name: {{ $key | quote }}
          value: {{ $value | quote }}
        {{- end }} {{- /* end iteration over .env */}}
        resources:
          limits:
            nvidia.com/gpu: !!int 8
      # Sidecar that mounts the "varlog" and "fluentbit-key" volumes; both
      # must be provided through .Values.volumes.
      - name: google-logging
        image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
        volumeMounts:
        - name: varlog
          mountPath: /var/log
        - name: fluentbit-key
          mountPath: /var/secrets/google
          readOnly: true
---
# Identity the Job's pod runs as. It lives in the release namespace because a
# pod can only reference a ServiceAccount from its own namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name | quote }}
  namespace: {{ .Release.Namespace | quote }}
---
# Grants list/get/patch on nodes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $unique_name | quote }}
rules:
- apiGroups: [""]
  resources: ["nodes"]
  verbs: ["list", "get", "patch"]
---
# Binds the ClusterRole above to the Job's ServiceAccount. A
# ClusterRoleBinding is cluster-scoped, so it carries no metadata.namespace;
# only the subject is namespaced.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ $unique_name | quote }}
subjects:
- kind: ServiceAccount
  name: {{ $unique_name | quote }}
  namespace: {{ .Release.Namespace | quote }}
roleRef:
  kind: ClusterRole
  name: {{ $unique_name | quote }}
  apiGroup: rbac.authorization.k8s.io