deploy/helm/health_checks/gpu_healthcheck/values.yaml (30 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for the GPU health-check chart.

# Naming and identity of the job.
job:
  base_name: "chs-hc"
  # Optional overrides; uncomment to pin them.
  # guid: "xckd"  # A random GUID is generated when this is not specified.
  # check_time: "1590303600"  # Set automatically when not given.

# Settings for the health check itself.
health_check:
  name: "gpu"

  # Label name/value pair associated with this health check.
  test_label:
    name: "aiinfra/gpu-healthcheck-test"
    value: "true"

  # Image reference (repository, tag) and pull policy.
  image:
    repo: "us-docker.pkg.dev/gce-ai-infra/health-check/gpu-healthcheck"
    tag: "a3-megagpu-8g_4.4.0"
    pull_policy: "Always"

  # Key/value settings grouped under `env`. All values are kept as quoted
  # strings so they are not re-typed as integers or booleans by the parser.
  env:
    R_LEVEL: "2"
    HEALTH_VALIDITY_HOURS: "24"
    DRY_RUN: "true"

  # Mount points; each `name` matches an entry in `volumes` below.
  volumeMounts:
    - name: nvidia-install-dir-host
      mountPath: /usr/local/nvidia
    - name: varlog
      mountPath: /var/log

  # Volume definitions: a host path, an emptyDir, and a secret that is
  # marked optional.
  volumes:
    - name: nvidia-install-dir-host
      hostPath:
        path: /home/kubernetes/bin/nvidia
    - name: varlog
      emptyDir: {}
    - name: fluentbit-key
      secret:
        secretName: fluentbit-key
        optional: true