# deploy/helm/health_checks/straggler_healthcheck/a3.yaml (64 lines of code) (raw)
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Job-level values; the two commented-out keys below are optional overrides.
job:
  base_name: "chs-hc"
  # guid: "xckd"  # Can specify a GUID if desired. Otherwise, a random GUID will be generated.
  # check_time: "1590303600"  # Will automatically be set if not given
# Values for the "straggler" health check on a3-highgpu-8g.
# NOTE(review): the nesting below (all sections under `health_check`, `envs`
# under `ncclPlugin`) is reconstructed from how the keys group together —
# confirm against the chart templates that read these values.
health_check:
  name: "straggler"
  image:
    repo: "us-docker.pkg.dev/gce-ai-infra/health-check/straggler-healthcheck"
    tag: "a3-highgpu-8g_4.6.2"
    pull_policy: "Always"
  params:
    bm_wait_time: "60"
    max_run_time: "900"
    n_batch: 50
    n_microbatch: 100
    interesting_event_offset: 4
    # NOTE(review): `use_fastrak` is a YAML boolean while `bidirectional` is the
    # string "false"; left as-is because the consuming template is not visible.
    use_fastrak: false
    # "INFO" is the verbose level intended for debugging; it is already enabled.
    debug: "INFO"
    # Subsystem list intended for debugging; it is already enabled.
    debug_subsys: "INIT,GRAPH,ENV,TUNING"
    bidirectional: "false"
    message_sizes_mb: "16,32"
  test_label:
    name: "aiinfra/straggler-healthcheck-test"
    value: "true"
  cluster:
    gcloud_override: ""
  rxdm:
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev"
    tag: "v2.0.12"
  ncclPlugin:
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev"
    tag: "v3.1.9-2.19.4-12.0"
    envs:
      NCCL_SOCKET_IFNAME: "eth0"
      NCCL_CROSS_NIC: "0"
      NCCL_ALGO: "Ring"
      NCCL_PROTO: "Simple"
      NCCL_NSOCKS_PERTHREAD: "4"
      NCCL_SOCKET_NTHREADS: "1"
      NCCL_DYNAMIC_CHUNK_SIZE: "524288"
      NCCL_BUFFSIZE: "4194304"
      NCCL_GPUDIRECTTCPX_CTRL_DEV: "eth0"
      NCCL_NET_GDR_LEVEL: "PIX"
      NCCL_P2P_PXN_LEVEL: "0"
      NCCL_DEBUG: "INFO"
      NCCL_DEBUG_SUBSYS: "ENV"
      NCCL_GPUDIRECTTCPX_SOCKET_IFNAME: "eth1,eth2,eth3,eth4"
      NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX: "/run/tcpx"
      NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS: "500000"
      NCCL_GPUDIRECTTCPX_FORCE_ACK: "1"
      # Each RX range starts right after the matching TX range
      # (8-21 -> 22-35, 112-125 -> 126-139, 60-73 -> 74-87, 164-177 -> 178-191),
      # so no range is listed in both the TX and RX bindings.
      NCCL_GPUDIRECTTCPX_TX_BINDINGS: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
      NCCL_GPUDIRECTTCPX_RX_BINDINGS: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
  env:
    INSTANCE_TYPE: "a3-highgpu-8g"
    GOOD_THROUGHPUT: "70000000000"
    HEALTH_VALIDITY_HOURS: "5"
    DRY_RUN: "true"
    # `nil` is not a YAML null; it is the literal string "nil", quoted here so
    # that is explicit.
    HOSTS_CSV: "nil"  # Set by Health Runner
    N_NODES: 2  # Set by Health Runner
    GCS_BUCKET_NAME: ""  # Set by Health Runner
    UNIX_CLIENT_PREFIX: "/run/tcpx"
    USE_TCPX: "true"
    USE_FASTRAK: "false"
    NCCL_DEBUG_SUBSYS: "INIT,NET"
    NCCL_SOCKET_IFNAME: "eth0"
    # Literal string "NIL" (not a YAML null).
    # NOTE(review): casing differs from the "nil" placeholder on HOSTS_CSV —
    # confirm which spelling the consumer checks for.
    STRAGGLER_THRESHOLD_MS: "NIL"