deploy/helm/health_runner/templates/health_runner.yaml (105 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# yamllint disable
{{- /*
Health Runner chart template.

For every entry under .Values.health_checks whose run_check is truthy, this
renders four resources that all share one generated name:
  - a batch/v1 Job that runs /app/health_runner.py next to a logging sidecar,
  - a ServiceAccount used by that Job,
  - a ClusterRole and a ClusterRoleBinding attached to that ServiceAccount.

Naming inputs (all optional, read from .Values.health_runner):
  guid        defaults to 8 random lowercase alphanumeric characters
  check_time  defaults to the current Unix epoch
  base_name   defaults to "chs-hr"

guid and check_time are evaluated once, outside the loop, so every health
check rendered in the same pass shares the same suffix.

NOTE(review): guid and check_time are formatted with %s, so they are
presumably supplied as strings when overridden; confirm against values.yaml.
*/}}
{{ $guid := default (lower (randAlphaNum 8)) .Values.health_runner.guid }}
{{- $check_time := default (printf "%s" (now | unixEpoch)) .Values.health_runner.check_time -}}
{{- $base_name := default "chs-hr" .Values.health_runner.base_name -}}
{{- $unique_suffix := printf "%s-%s" $guid $check_time -}}

{{- /* Inside the loop "." is the configuration of the current health check. */}}
{{- range $hc_name, $config := .Values.health_checks }}
{{- if .run_check }}
{{- $extra_info := printf "%s" $hc_name }}
{{- /*
Final name is base_name-check_name-guid-check_time. Underscores anywhere in
the result are replaced with dashes, presumably to keep it usable as a
Kubernetes object name.
*/}}
{{- $unique_name := printf "%s-%s-%s" $base_name $extra_info $unique_suffix | replace "_" "-" }}
{{- /* Image tag fallback: trimmed content of the chart's version.txt. */}}
{{- $version := $.Files.Get "version.txt" | trim }}
---
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $unique_name }}
  labels:
    app: {{ $unique_name }}
spec:
  completions: 1
  parallelism: 1
  completionMode: Indexed
  template:
    spec:
      restartPolicy: OnFailure
      securityContext:
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      serviceAccountName: {{ $unique_name }}
      shareProcessNamespace: true
      containers:
        - name: {{ $unique_name | quote }}
          image: "{{ .image.repo }}:{{ .image.tag | default $version }}"
          imagePullPolicy: {{ .image.pull_policy }}
          command: ["/bin/sh", "-c"]
          # Output is duplicated into the shared /var/log volume so the
          # sidecar container below can read it.
          args: ["python3 /app/health_runner.py 2>&1 | tee /var/log/healthrunner.log"]
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: varlog
              mountPath: /var/log
          env:
            {{- range $key, $value := .env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}  # iteration over .env
            {{- with .blast_mode }}
            {{- if .blast_mode_enabled }}
            - name: BLAST_MODE_ENABLED
              value: "true"  # "true" or "1"
            {{- range $key, $value := .env }}
            - name: {{ $key | quote }}
              value: {{ $value | quote }}
            {{- end }}  # iteration over .blast_mode.env
            {{- end }}  # if .blast_mode.blast_mode_enabled
            {{- end }}  # if .blast_mode scope
        # NOTE(review): "latest" is a mutable tag; consider pinning a version.
        - name: google-logging
          image: "us-docker.pkg.dev/gce-ai-infra/health-check/logging:latest"
          volumeMounts:
            - name: varlog
              mountPath: /var/log
            - name: fluentbit-key
              mountPath: /var/secrets/google
              readOnly: true
      volumes:
        - name: varlog
          emptyDir: {}
        # Optional: the pod still starts when this secret does not exist.
        - name: fluentbit-key
          secret:
            secretName: fluentbit-key
            optional: true
---
## Below should be the same for all health checks
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $unique_name }}
  # The Job above sets no namespace, so it is created in the release
  # namespace. A pod can only use a ServiceAccount from its own namespace,
  # so the ServiceAccount must be created there as well.
  namespace: {{ $.Release.Namespace | quote }}
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ $unique_name }}
# NOTE(review): this grants create/delete on secrets and on RBAC objects
# cluster-wide; confirm the runner needs every resource and verb listed.
rules:
  - apiGroups: ["", "apps", "rbac.authorization.k8s.io", "batch"]
    resources: ["daemonsets", "serviceaccounts", "clusterrolebindings", "clusterroles", "nodes", "jobs", "pods", "services", "secrets", "jobs/status"]
    verbs: ["list", "get", "create", "delete", "watch", "patch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  # ClusterRoleBinding is cluster-scoped, so it carries no namespace.
  name: {{ $unique_name }}
subjects:
  - kind: ServiceAccount
    name: {{ $unique_name }}
    # Must be the namespace the ServiceAccount above is created in.
    namespace: {{ $.Release.Namespace | quote }}
roleRef:
  kind: ClusterRole
  name: {{ $unique_name }}
  apiGroup: rbac.authorization.k8s.io
{{- end }}  # if .run_check
{{- end }}  # iteration over .Values.health_checks