deploy/helm/health_runner/templates/health_runner.yaml (105 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# yamllint disable
# Rendering overview: for every entry of .Values.health_checks whose run_check
# is set, one Job plus a ServiceAccount, ClusterRole and ClusterRoleBinding is
# emitted. All four objects of one check share a single generated name.
{{ $guid := default (lower (randAlphaNum 8)) .Values.health_runner.guid }}
{{- /*
  guid and check_time can be overridden with values that are not strings
  (for example an epoch number). Formatting such a value with the %s verb
  yields a "%!s(...)" error string instead of the number, so both are
  normalised to strings before use. A float64 is converted to an integer
  first so that it is not printed in exponent notation.
*/ -}}
{{- if kindIs "float64" $guid }}{{ $guid = int64 $guid }}{{ end -}}
{{- $guid = toString $guid -}}
{{- /* check_time defaults to the render-time Unix epoch in seconds. */ -}}
{{- $check_time := default (now | unixEpoch) .Values.health_runner.check_time -}}
{{- if kindIs "float64" $check_time }}{{ $check_time = int64 $check_time }}{{ end -}}
{{- $check_time = toString $check_time -}}
{{- $base_name := default "chs-hr" .Values.health_runner.base_name -}}
{{- /* The suffix is computed once, so every check rendered in this run shares it. */ -}}
{{- $unique_suffix := printf "%s-%s" $guid $check_time -}}
{{- /* version.txt does not depend on the check, so it is read once, outside the loop. */ -}}
{{- $version := .Files.Get "version.txt" | trim -}}
{{- range $hc_name, $config := .Values.health_checks }}
{{- if .run_check }}
{{- /* Name layout: base_name, check key, guid, check_time; underscores become dashes. */ -}}
{{- $unique_name := printf "%s-%s-%s" $base_name $hc_name $unique_suffix | replace "_" "-" }}
---
# Job that runs the health runner container for this check, next to a logging
# sidecar. Both containers mount the same emptyDir at /var/log; the runner
# tees its output into /var/log/healthrunner.log.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name }}
  labels:
    app: {{ $unique_name }}
spec:
  completions: 1
  parallelism: 1
  completionMode: Indexed
  template:
    spec:
      restartPolicy: OnFailure
      securityContext:
        # Already implied by runAsUser below; stated explicitly because
        # admission policies check this field rather than the UID.
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      # Must match the ServiceAccount rendered further down in this template.
      serviceAccountName: {{ $unique_name }}
      shareProcessNamespace: true
      containers:
        - name: {{ $unique_name | quote }}
          # Falls back to the chart's version.txt when no image tag is set.
          image: "{{ .image.repo }}:{{ .image.tag | default $version }}"
          imagePullPolicy: {{ .image.pull_policy }}
          command: ["/bin/sh", "-c"]
          # NOTE(review): the exit status of this pipeline is tee's, not
          # python3's, so a failing health_runner.py is not reported as a
          # container failure. Confirm this is intended.
          args: ["python3 /app/health_runner.py 2>&1 | tee /var/log/healthrunner.log"]
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: varlog
              mountPath: /var/log
          env:
            {{- /* Per-check environment, taken from the check's .env map. */}}
            {{- range $key, $value := .env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
            {{- with .blast_mode }}
            {{- if .blast_mode_enabled }}
            {{- /* Inside "with", .env below refers to .blast_mode.env. */}}
            - name: BLAST_MODE_ENABLED
              value: "true"  # "true" or "1"
            {{- range $key, $value := .env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}
            {{- end }}
            {{- end }}
        - name: google-logging
          # NOTE(review): ":latest" is not a pinned version; consider pinning.
          image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
          # Same container hardening as the runner container above, so that
          # every container in the pod carries it.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: varlog
              mountPath: /var/log
            - name: fluentbit-key
              mountPath: /var/secrets/google
              readOnly: true
      volumes:
        - name: varlog
          emptyDir: {}
        - name: fluentbit-key
          secret:
            secretName: fluentbit-key
            # The pod still starts when this secret does not exist.
            optional: true
---
## Below should be the same for all health checks
# ServiceAccount used by the Job's pod. The Job sets no metadata.namespace and
# is therefore created in the release namespace; a pod can only use a
# ServiceAccount from its own namespace, so the account is created there too
# instead of being pinned to "default".
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name }}
  namespace: {{ $.Release.Namespace | quote }}
---
# Cluster-wide permissions for the health runner. The single rule applies every
# listed verb to every listed resource in every listed API group.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ $unique_name }}
rules:
  - apiGroups: ["", "apps", "rbac.authorization.k8s.io", "batch"]
    resources:
      - "daemonsets"
      - "serviceaccounts"
      - "clusterrolebindings"
      - "clusterroles"
      - "nodes"
      - "jobs"
      - "pods"
      - "services"
      - "secrets"
      - "jobs/status"
    verbs: ["list", "get", "create", "delete", "watch", "patch"]
---
# Grants the ClusterRole above to the ServiceAccount above. A
# ClusterRoleBinding is cluster-scoped, so it has no metadata.namespace; only
# the subject names the namespace in which the ServiceAccount lives.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ $unique_name }}
subjects:
  - kind: ServiceAccount
    name: {{ $unique_name }}
    namespace: {{ $.Release.Namespace | quote }}
roleRef:
  kind: ClusterRole
  name: {{ $unique_name }}
  apiGroup: rbac.authorization.k8s.io
{{- end }}{{- /* if .run_check */}}
{{- end }}{{- /* iteration over .Values.health_checks */}}