deploy/helm/health_checks/neper_healthcheck/templates/neper_healthcheck.yaml (189 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# yamllint disable
{{- /*
  Template variables shared by every resource in this file.

  guid             8 random lowercase alphanumerics unless .Values.job.guid is set.
  check_time       current Unix epoch (as a string) unless .Values.job.check_time is set.
  base_name        "chs-hc" unless .Values.job.base_name is set.
  unique_suffix    "<guid>-<check_time>".
  extra_info       .Values.health_check.name.
  unique_name      "<base_name>-<extra_info>-<unique_suffix>" with "_" replaced by "-";
                   used for the Job, its pod label, the ServiceAccount and the RBAC objects.
  unique_svc_name  same pattern with "headless-svc" inserted; used for the Service
                   and the pod subdomain.
  expiry_time      check_time minus HEALTH_VALIDITY_HOURS * 60 * 60; nodes whose
                   runtime label is below this value are eligible for a new run.
*/ -}}
{{- $guid := default (lower (randAlphaNum 8)) .Values.job.guid -}}
{{- $check_time := default (printf "%s" (now | unixEpoch)) .Values.job.check_time -}}
{{- $base_name := default "chs-hc" .Values.job.base_name -}}
{{- $unique_suffix := printf "%s-%s" $guid $check_time -}}
{{- $extra_info := printf "%s" .Values.health_check.name }}
{{- $unique_name := printf "%s-%s-%s" $base_name $extra_info $unique_suffix | replace "_" "-" }}
{{- $unique_svc_name := printf "%s-%s-headless-svc-%s" $base_name $extra_info $unique_suffix | replace "_" "-" }}
{{- $expiry_time := int (sub $check_time (mul .Values.health_check.env.HEALTH_VALIDITY_HOURS 60 60)) -}}
---
# Headless Service selecting the Job's pods; the pods reference it as their subdomain.
apiVersion: v1
kind: Service
metadata:
  name: {{ $unique_svc_name | quote }}
spec:
  clusterIP: None  # clusterIP must be None to create a headless service
  selector:
    name: {{ $unique_name | quote }}  # must match Job name
---
# Indexed Job that runs the neper health check as two parallel pods.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name | quote }}
spec:
  completions: 2
  parallelism: 2
  completionMode: Indexed
  template:
    metadata:
      labels:
        name: {{ $unique_name | quote }}
    spec:
      tolerations:
        - operator: "Exists"
      serviceAccountName: {{ $unique_name | quote }}
      subdomain: {{ $unique_svc_name | quote }}  # has to match Service name
      restartPolicy: Never
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      volumes:
        - name: tmpfs
          emptyDir: {}
        - name: varlog
          emptyDir: {}
        - name: fluentbit-key
          secret:
            secretName: fluentbit-key
            optional: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Multiple matchExpressions are ORed with each other.
              # Statements within one matchExpressions are ANDed.
              #
              # Term 1: the runtime label exists but its value has expired
              # (default=24h).
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: Lt
                    values:
                      - "{{$expiry_time}}"
                  - key: aiinfra/node-not-ready
                    operator: DoesNotExist
                  # If label is true then it will trigger
                  - key: {{ .Values.health_check.test_label.name | quote }}
                    operator: In
                    values:
                      - {{ .Values.health_check.test_label.value | quote }}
              # Term 2: the runtime label does not exist at all.
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: DoesNotExist
                  - key: aiinfra/node-not-ready
                    operator: DoesNotExist
                  # If label is true then it will trigger
                  - key: {{ .Values.health_check.test_label.name | quote }}
                    operator: In
                    values:
                      - {{ .Values.health_check.test_label.value | quote }}
          # Prefer nodes without the runtime label (weight 50) over nodes
          # whose label has merely expired (weight 1).
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 50
              preference:
                matchExpressions:
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: DoesNotExist
            - weight: 1
              preference:
                matchExpressions:
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: Lt
                    values:
                      - "{{$expiry_time}}"
      initContainers:
        # Reads this node's "networking.gke.io/nic-info" annotation from the
        # API server and writes the birthIP of every NIC except eth0 to
        # /tmp/ip_addrs on the shared tmpfs volume.
        - name: get-ip-addrs
          image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag | quote }}
          command: ["sh", "-c"]
          args:
            # Folded scalar: the lines below are joined with spaces into one
            # shell command line. The API server certificate is verified
            # against the mounted ServiceAccount CA because the request
            # carries the ServiceAccount bearer token.
            - >
              ANNOTATION=$(curl
              --cacert /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
              -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
              https://$KUBERNETES_SERVICE_HOST:443/api/v1/nodes/$NODE_NAME);
              printf '%s' "$ANNOTATION"
              | jq -r '.metadata.annotations."networking.gke.io/nic-info"'
              | jq -r '.[] | select(.birthName != "eth0") | .birthIP'
              > /tmp/ip_addrs;
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: KUBERNETES_SERVICE_HOST
              value: "kubernetes.default.svc"
            - name: KUBERNETES_PORT_443
              value: "443"
          volumeMounts:
            - name: tmpfs
              mountPath: /tmp
      shareProcessNamespace: true
      containers:
        # Runs the neper runner script and tees its output to the shared
        # /var/log volume.
        - name: neper-healthcheck
          image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag | quote }}
          imagePullPolicy: Always
          command: ["/bin/sh", "-c"]
          args: ["python3 /scripts/neper_runner.py 2>&1 | tee /var/log/neper_healthcheck.log"]
          securityContext:
            privileged: true
            capabilities:
              add:
                - SYS_ADMIN
                - SYS_PTRACE
                - IPC_LOCK
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: JOB_NAME
              value: {{ $unique_name | quote }}
            - name: NODE_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: SERVICE_NAME
              value: {{ $unique_svc_name | quote }}  # has to match Service name
            # Every entry of .Values.health_check.env is passed through as a
            # quoted string.
            {{- range $key, $value := .Values.health_check.env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
          volumeMounts:
            - name: tmpfs
              mountPath: /tmp
            - name: varlog
              mountPath: /var/log
          resources:
            limits:
              nvidia.com/gpu: !!int 8
        # Sidecar that mounts the same /var/log volume plus the optional
        # fluentbit-key secret.
        - name: google-logging
          image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
          volumeMounts:
            - name: varlog
              mountPath: /var/log
            - name: fluentbit-key
              mountPath: /var/secrets/google
              readOnly: true
---
# Identity the Job's pods run as.
# NOTE(review): pinned to the `default` namespace while the Job above sets no
# namespace; this presumably assumes the release is installed into `default`.
# Confirm.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name | quote }}
  namespace: default
---
# Grants list/get/patch on nodes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $unique_name | quote }}
rules:
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["list", "get", "patch"]
---
# Binds the ClusterRole above to the ServiceAccount above.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ $unique_name | quote }}
  namespace: default
subjects:
  - kind: ServiceAccount
    name: {{ $unique_name | quote }}
    namespace: default
roleRef:
  kind: ClusterRole
  name: {{ $unique_name | quote }}
  apiGroup: rbac.authorization.k8s.io