# deploy/helm/health_runner/a3ultra.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Helm values for the health runner targeting a3-ultragpu-8g machines.
# Every value under an `env:` mapping is a quoted string so that no YAML
# parser re-types it (e.g. "true" stays a string, not a boolean).
# NOTE(review): nesting levels were reconstructed from key order; confirm
# them against the chart templates that read these values.
health_runner:
  name: "health-runner"

# Multiple health checks can be added below.
# Note that only the NCCL health check (A3Ultra) is enabled by default.
health_checks:
  nccl_healthcheck:
    run_check: true
    runner_name: "nccl-health-runner-a3ultra"
    image:
      repo: "us-docker.pkg.dev/gce-ai-infra/health-check/health-runner"
      tag: ""  # This will be populated by the version in the version.txt file
      pull_policy: "Always"
    env:
      HC_IMAGE_TAG: "4.4-latest"
      MACHINE_TYPE: "a3-ultragpu-8g"
      FILTER_LABEL_NAME: "aiinfra/nccl-healthcheck-test"
      FILTER_LABEL_VALUE: "true"
      DRY_RUN: "true"
      TIMEOUT_MINUTES: "30"
      SLEEP_TIME_MINUTES: "10"
      # Path to Helm chart in container
      HELM_CHART: "/app/health_checks/nccl_healthcheck"
      # Specific to A3Ultra
      HELM_INSTALL_FLAGS: "-f /app/health_checks/nccl_healthcheck/a3ultra.yaml --set health_check.image.tag=${MACHINE_TYPE}_${HC_IMAGE_TAG}"
      ACCELERATOR_TYPE: "nvidia-h200-141gb"
      HEALTH_APP: "nccl"
      PAIRING_MODE: "random"
      SECOND_PASS_ENABLED: "true"
    # Blast Mode
    blast_mode:
      blast_mode_enabled: false
      env:
        # BLAST_MODE_NUM_TESTS_LIMIT: "10"
        NODES_CHECKED_PER_TEST: "2"

  gpu_healthcheck:
    run_check: false
    runner_name: "gpu-health-runner"
    image:
      repo: "us-docker.pkg.dev/gce-ai-infra/health-check/health-runner"
      tag: ""  # This will be populated by the version in the version.txt file
      pull_policy: "Always"
    env:
      HC_IMAGE_TAG: "4.4-latest"
      MACHINE_TYPE: "a3-ultragpu-8g"
      DRY_RUN: "true"
      TIMEOUT_MINUTES: "30"
      SLEEP_TIME_MINUTES: "15"
      # Path to Helm chart in container
      HELM_CHART: "/app/health_checks/gpu_healthcheck"
      HELM_INSTALL_FLAGS: "--set health_check.image.tag=${MACHINE_TYPE}_${HC_IMAGE_TAG}"
      ACCELERATOR_TYPE: "nvidia-h200-141gb"
    # Blast Mode
    blast_mode:
      blast_mode_enabled: true  # Defaults to run multiple health checks in parallel
      env:
        # Number of health checks to run in parallel
        # BLAST_MODE_NUM_TESTS_LIMIT: "10"
        NODES_CHECKED_PER_TEST: "1"

  neper_healthcheck:
    run_check: false
    runner_name: "neper-health-runner"
    image:
      repo: "us-docker.pkg.dev/gce-ai-infra/health-check/health-runner"
      tag: ""  # This will be populated by the version in the version.txt file
      pull_policy: "Always"
    env:
      HC_IMAGE_TAG: "4.4-latest"
      MACHINE_TYPE: "a3-ultragpu-8g"
      DRY_RUN: "true"
      TIMEOUT_MINUTES: "30"
      SLEEP_TIME_MINUTES: "10"
      # Path to Helm chart in container
      HELM_CHART: "/app/health_checks/neper_healthcheck"
      HELM_INSTALL_FLAGS: "--set health_check.image.tag=${MACHINE_TYPE}_${HC_IMAGE_TAG}"
      ACCELERATOR_TYPE: "nvidia-h200-141gb"
      # Add this for HC_ENV_R_LEVEL: 2/3/4
      # NOTE(review): a DCGM parameter on the neper check looks carried over
      # from a GPU check; confirm it is actually consumed here.
      HC_ENV_DCGM_PARAMS: "pcie.dont_bind_numa=1"
      ADJUST_SLEEP_TIME: "true"
    blast_mode:
      blast_mode_enabled: true  # Defaults to run multiple health checks in parallel
      env:
        # Number of health checks to run in parallel
        # BLAST_MODE_NUM_TESTS_LIMIT: "200"
        NODES_CHECKED_PER_TEST: "2"

  straggler_healthcheck:
    run_check: false
    image:
      repo: "us-docker.pkg.dev/gce-ai-infra/health-check/health-runner"
      tag: ""  # This will be populated by the version in the version.txt file
      pull_policy: "Always"
    env:
      HC_IMAGE_TAG: "4.6-latest"
      MACHINE_TYPE: "a3-ultragpu-8g"
      DRY_RUN: "true"
      SLEEP_TIME_MINUTES: "30"
      # Path to Helm chart in container
      HELM_CHART: "/app/health_checks/straggler_healthcheck"
      HELM_INSTALL_FLAGS: "-f /app/health_checks/straggler_healthcheck/a3ultra.yaml --set health_check.image.tag=${MACHINE_TYPE}_${HC_IMAGE_TAG}"
      ACCELERATOR_TYPE: "nvidia-h200-141gb"
      # Plain `nil` is a string in YAML, not null; the two values below are
      # quoted to make that explicit. The string passed on is unchanged.
      HOSTS_CSV: "nil"  # Allow health runner to identify the nodes
      N_NODES: "nil"  # Default to run on all nodes in the cluster
      HC_ENV_STRAGGLER_THRESHOLD_MS: "4"
      GCS_BUCKET_NAME: "straggler-healthcheck-logs"