deploy/helm/health_checks/neper_healthcheck/templates/neper_healthcheck.yaml (189 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# yamllint disable
{{- /* Per-run identifiers; guid, check_time and base_name may be pinned via .Values.job.* */ -}}
{{- $guid := .Values.job.guid | default (randAlphaNum 8 | lower) -}}
{{- $check_time := .Values.job.check_time | default (now | unixEpoch | printf "%s") -}}
{{- $base_name := .Values.job.base_name | default "chs-hc" -}}
{{- $extra_info := .Values.health_check.name | printf "%s" -}}
{{- $unique_suffix := printf "%s-%s" $guid $check_time -}}
{{- /* Object names: underscores are replaced with dashes after the pieces are joined. */ -}}
{{- $unique_name := printf "%s-%s-%s" $base_name $extra_info $unique_suffix | replace "_" "-" -}}
{{- $unique_svc_name := printf "%s-%s-headless-svc-%s" $base_name $extra_info $unique_suffix | replace "_" "-" -}}
{{- /* expiry_time = check_time minus the validity window (hours converted to seconds). */ -}}
{{- $validity_sec := mul .Values.health_check.env.HEALTH_VALIDITY_HOURS 60 60 -}}
{{- $expiry_time := sub $check_time $validity_sec | int -}}
---
# Headless Service whose name the Job's pods reference through `subdomain`.
apiVersion: v1
kind: Service
metadata:
  name: {{ $unique_svc_name | quote }}
spec:
  # A headless service is created by setting clusterIP to None.
  clusterIP: None
  selector:
    # Same value as the `name` label on the Job's pod template.
    name: {{ $unique_name | quote }}
---
# Indexed Job with two parallel pods that runs /scripts/neper_runner.py.
# An init container first records the node's non-eth0 NIC IPs in /tmp/ip_addrs,
# and a logging sidecar shares /var/log with the health-check container.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name | quote }}
spec:
  completions: 2
  parallelism: 2
  completionMode: Indexed
  template:
    metadata:
      labels:
        # Selected by the headless Service.
        name: {{ $unique_name | quote }}
    spec:
      tolerations:
        # A key-less "Exists" toleration matches every taint.
        - operator: "Exists"
      serviceAccountName: {{ $unique_name | quote }}
      # Has to match the Service name.
      subdomain: {{ $unique_svc_name | quote }}
      restartPolicy: Never
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      shareProcessNamespace: true
      volumes:
        - name: tmpfs
          emptyDir: {}
        - name: varlog
          emptyDir: {}
        - name: fluentbit-key
          secret:
            secretName: fluentbit-key
            optional: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Multiple matchExpressions entries are ORed with each other;
              # the statements within one matchExpressions entry are ANDed.
              #
              # Term 1: the runtime label exists and its value is older than
              # the expiry time (check time minus HEALTH_VALIDITY_HOURS).
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
                  # `Lt` only matches nodes that carry the label; nodes without
                  # the label are covered by term 2 below.
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: Lt
                    values:
                      - "{{ $expiry_time }}"
                  - key: aiinfra/node-not-ready
                    operator: DoesNotExist
                  # Only nodes whose test label has the configured value.
                  - key: {{ .Values.health_check.test_label.name | quote }}
                    operator: In
                    values:
                      - {{ .Values.health_check.test_label.value | quote }}
              # Term 2: the runtime label does not exist on the node.
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: Exists
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: DoesNotExist
                  - key: aiinfra/node-not-ready
                    operator: DoesNotExist
                  # Only nodes whose test label has the configured value.
                  - key: {{ .Values.health_check.test_label.name | quote }}
                    operator: In
                    values:
                      - {{ .Values.health_check.test_label.value | quote }}
          preferredDuringSchedulingIgnoredDuringExecution:
            # Nodes without the runtime label are weighted far above nodes
            # whose label value is merely older than the expiry time.
            - weight: 50
              preference:
                matchExpressions:
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: DoesNotExist
            - weight: 1
              preference:
                matchExpressions:
                  - key: aiinfra/neper-healthcheck-runtime-sec
                    operator: Lt
                    values:
                      - "{{ $expiry_time }}"
      initContainers:
        # Fetches this node's object from the API server and writes the
        # birthIP of every NIC other than eth0 (taken from the
        # networking.gke.io/nic-info annotation) to /tmp/ip_addrs.
        - name: get-ip-addrs
          image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag | quote }}
          command: ["sh", "-c"]
          args:
            # The API server certificate is verified against the service
            # account CA bundle instead of disabling TLS checks, because the
            # request carries the service account bearer token. The response is
            # expanded inside double quotes so the shell neither word-splits
            # nor glob-expands the JSON before jq parses it.
            - >
              NODE_JSON=$(curl --cacert /var/run/secrets/kubernetes.io/serviceaccount/ca.crt -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$KUBERNETES_SERVICE_HOST:443/api/v1/nodes/$NODE_NAME);
              printf '%s' "$NODE_JSON" | jq -r '.metadata.annotations."networking.gke.io/nic-info"' | jq -r '.[] | select(.birthName != "eth0") | .birthIP' > /tmp/ip_addrs;
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: KUBERNETES_SERVICE_HOST
              value: "kubernetes.default.svc"
            # NOTE(review): not read by the script above, which hard-codes
            # port 443 - confirm whether anything in the image uses it.
            - name: KUBERNETES_PORT_443
              value: "443"
          volumeMounts:
            - name: tmpfs
              mountPath: /tmp
      containers:
        - name: neper-healthcheck
          image: {{ printf "%s:%s" .Values.health_check.image.repo .Values.health_check.image.tag | quote }}
          imagePullPolicy: Always
          command: ["/bin/sh", "-c"]
          # Output is mirrored to /var/log, which the google-logging sidecar
          # also mounts.
          # NOTE(review): the exit status of this pipeline is tee's, not the
          # script's - confirm that masking runner failures is intended.
          args: ["python3 /scripts/neper_runner.py 2>&1 | tee /var/log/neper_healthcheck.log"]
          securityContext:
            privileged: true
            capabilities:
              add:
                - SYS_ADMIN
                - SYS_PTRACE
                - IPC_LOCK
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: JOB_NAME
              value: {{ $unique_name | quote }}
            - name: NODE_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            # Has to match the Service name.
            - name: SERVICE_NAME
              value: {{ $unique_svc_name | quote }}
            # Every entry of .Values.health_check.env becomes an env var.
            {{- range $key, $value := .Values.health_check.env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
          volumeMounts:
            - name: tmpfs
              mountPath: /tmp
            - name: varlog
              mountPath: /var/log
          resources:
            limits:
              nvidia.com/gpu: !!int 8
        - name: google-logging
          image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
          volumeMounts:
            - name: varlog
              mountPath: /var/log
            - name: fluentbit-key
              mountPath: /var/secrets/google
              readOnly: true
---
# Identity used by the Job's pods. It is created in the release namespace so
# that it lives next to the Job, whose manifest sets no namespace; for a
# release installed into "default" this renders exactly as before.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name | quote }}
  namespace: {{ .Release.Namespace | quote }}
---
# Cluster-wide access to Node objects. The Job's init container issues a GET
# on its own node.
# NOTE(review): "list" and "patch" are presumably used by the health-check
# runner itself - confirm against the runner script.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $unique_name | quote }}
rules:
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["list", "get", "patch"]
---
# Grants the ClusterRole to the ServiceAccount. ClusterRoleBinding is a
# cluster-scoped kind, so it carries no metadata.namespace; the subject's
# namespace must be the one the ServiceAccount is created in.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ $unique_name | quote }}
subjects:
  - kind: ServiceAccount
    name: {{ $unique_name | quote }}
    namespace: {{ .Release.Namespace | quote }}
roleRef:
  kind: ClusterRole
  name: {{ $unique_name | quote }}
  apiGroup: rbac.authorization.k8s.io