deploy/helm/health_runner/a4.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Helm values for the health runner on A4 (a4-highgpu-8g) machines.
#
# Each entry under `health_checks` describes one health check: whether it
# runs (`run_check`), the runner image, the environment passed to the runner
# (`env`), and its blast-mode settings (`blast_mode`).
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose whitespace had been collapsed. `health_checks` is assumed to be a
# top-level key (sibling of `health_runner`) -- confirm against the chart
# templates.

health_runner:
  name: "health-runner"

# Multiple health checks can be added below.
# Note that only the NCCL health check (A4) is enabled by default.
health_checks:
  nccl_healthcheck:
    run_check: true
    runner_name: "nccl-health-runner-a4"
    image:
      repo: "us-docker.pkg.dev/gce-ai-infra/health-check/health-runner"
      tag: ""  # This will be populated by the version in the version.txt file
      pull_policy: "Always"
    env:
      HC_IMAGE_TAG: "4.4-latest"
      MACHINE_TYPE: "a4-highgpu-8g"
      FILTER_LABEL_NAME: "aiinfra/nccl-healthcheck-test"
      FILTER_LABEL_VALUE: "true"
      DRY_RUN: "true"
      SLEEP_TIME_MINUTES: "10"
      HELM_CHART: "/app/health_checks/nccl_healthcheck"  # Path to Helm chart in container
      # Specific to A4: points at the A4 values file of the NCCL chart.
      HELM_INSTALL_FLAGS: "-f /app/health_checks/nccl_healthcheck/a4.yaml --set health_check.image.tag=${MACHINE_TYPE}_${HC_IMAGE_TAG}"
      ACCELERATOR_TYPE: "nvidia-b200-141gb"
      HEALTH_APP: "nccl"
      PAIRING_MODE: "random"
      SECOND_PASS_ENABLED: "true"
    # Blast Mode
    blast_mode:
      blast_mode_enabled: false
      env:
        # BLAST_MODE_NUM_TESTS_LIMIT: "10"
        NODES_CHECKED_PER_TEST: "2"

  gpu_healthcheck:
    run_check: false
    runner_name: "gpu-health-runner"
    image:
      repo: "us-docker.pkg.dev/gce-ai-infra/health-check/health-runner"
      tag: ""  # This will be populated by the version in the version.txt file
      pull_policy: "Always"
    env:
      HC_IMAGE_TAG: "4.4-latest"
      MACHINE_TYPE: "a4-highgpu-8g"
      DRY_RUN: "true"
      SLEEP_TIME_MINUTES: "15"
      HELM_CHART: "/app/health_checks/gpu_healthcheck"  # Path to Helm chart in container
      HELM_INSTALL_FLAGS: "--set health_check.image.tag=${MACHINE_TYPE}_${HC_IMAGE_TAG}"
      ACCELERATOR_TYPE: "nvidia-b200-141gb"
      # Quoted so that it is a string like every other env value in this file.
      HC_ENV_R_LEVEL: "1"
      # HC_ENV_DCGM_PARAMS: "pcie.dont_bind_numa=1"  # Add this for HC_ENV_R_LEVEL: 2/3/4
      ADJUST_SLEEP_TIME: "true"
    # Blast Mode
    blast_mode:
      blast_mode_enabled: true  # Defaults to run multiple health checks in parallel
      env:
        # BLAST_MODE_NUM_TESTS_LIMIT: "10"  # Number of health checks to run in parallel
        NODES_CHECKED_PER_TEST: "1"

  neper_healthcheck:
    run_check: false
    runner_name: "neper-health-runner"
    image:
      repo: "us-docker.pkg.dev/gce-ai-infra/health-check/health-runner"
      tag: ""  # This will be populated by the version in the version.txt file
      pull_policy: "Always"
    env:
      HC_IMAGE_TAG: "4.4-latest"
      MACHINE_TYPE: "a4-highgpu-8g"
      DRY_RUN: "true"
      SLEEP_TIME_MINUTES: "10"
      HELM_CHART: "/app/health_checks/neper_healthcheck"  # Path to Helm chart in container
      HELM_INSTALL_FLAGS: "--set health_check.image.tag=${MACHINE_TYPE}_${HC_IMAGE_TAG}"
      ACCELERATOR_TYPE: "nvidia-b200-141gb"
    # Blast Mode
    blast_mode:
      blast_mode_enabled: true  # Defaults to run multiple health checks in parallel
      env:
        # BLAST_MODE_NUM_TESTS_LIMIT: "200"  # Number of health checks to run in parallel
        NODES_CHECKED_PER_TEST: "2"

deploy/helm/health_runner/a4.yaml (67 lines of code) (raw):