# deploy/helm/health_checks/straggler_healthcheck/a3.yaml (64 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Values for the "straggler" health check on a3-highgpu-8g nodes.
#
# NOTE(review): this layout was restored from a whitespace-collapsed copy of
# the file, so the nesting of `cluster`, `rxdm`, `ncclPlugin` and `env` under
# `health_check` is an assumption. Confirm it against the `.Values.*` paths
# used by the chart templates before merging.

job:
  base_name: "chs-hc"
  # guid: "xckd"  # Can specify a GUID if desired. Otherwise, a random GUID will be generated.
  # check_time: "1590303600"  # Will automatically be set if not given

health_check:
  name: "straggler"

  # Container image that runs the check.
  image:
    repo: "us-docker.pkg.dev/gce-ai-infra/health-check/straggler-healthcheck"
    tag: "a3-highgpu-8g_4.6.2"
    pull_policy: "Always"

  # Benchmark parameters. Quoted values are strings; unquoted values keep
  # their native YAML type (integer / boolean).
  params:
    bm_wait_time: "60"
    max_run_time: "900"
    n_batch: 50
    n_microbatch: 100
    interesting_event_offset: 4
    use_fastrak: false
    # NOTE(review): both debug settings below already hold the values their
    # comments recommend for debugging; confirm these are the intended
    # defaults.
    debug: "INFO"  # set "INFO" when debugging
    debug_subsys: "INIT,GRAPH,ENV,TUNING"  # set "INIT,GRAPH,ENV,TUNING" when debugging
    bidirectional: "false"
    message_sizes_mb: "16,32"

  # Label name/value pair associated with this check.
  test_label:
    name: "aiinfra/straggler-healthcheck-test"
    value: "true"

  cluster:
    gcloud_override: ""

  # GPUDirect-TCPX images (see the image paths below).
  rxdm:
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev"
    tag: "v2.0.12"

  ncclPlugin:
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev"
    tag: "v3.1.9-2.19.4-12.0"
    # NCCL settings for the plugin. All values are strings.
    envs:
      NCCL_SOCKET_IFNAME: "eth0"
      NCCL_CROSS_NIC: "0"
      NCCL_ALGO: "Ring"
      NCCL_PROTO: "Simple"
      NCCL_NSOCKS_PERTHREAD: "4"
      NCCL_SOCKET_NTHREADS: "1"
      NCCL_DYNAMIC_CHUNK_SIZE: "524288"
      NCCL_BUFFSIZE: "4194304"
      NCCL_GPUDIRECTTCPX_CTRL_DEV: "eth0"
      NCCL_NET_GDR_LEVEL: "PIX"
      NCCL_P2P_PXN_LEVEL: "0"
      NCCL_DEBUG: "INFO"
      NCCL_DEBUG_SUBSYS: "ENV"
      NCCL_GPUDIRECTTCPX_SOCKET_IFNAME: "eth1,eth2,eth3,eth4"
      NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX: "/run/tcpx"
      NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS: "500000"
      NCCL_GPUDIRECTTCPX_FORCE_ACK: "1"
      NCCL_GPUDIRECTTCPX_TX_BINDINGS: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
      NCCL_GPUDIRECTTCPX_RX_BINDINGS: "eth1:22-35,124-137;eth2:22-35,124-137;eth3:74-87,178-191;eth4:74-87,178-191"

  # Settings for the health check itself. Entries marked "Set by Health
  # Runner" are placeholders here.
  env:
    INSTANCE_TYPE: "a3-highgpu-8g"
    GOOD_THROUGHPUT: "70000000000"
    HEALTH_VALIDITY_HOURS: "5"
    DRY_RUN: "true"
    # NOTE(review): YAML has no `nil` keyword, so this value is the literal
    # string "nil" (quoted to make that explicit; the parsed value is
    # unchanged). Confirm the consumer expects that sentinel, not null.
    HOSTS_CSV: "nil"  # Set by Health Runner
    # NOTE(review): the only unquoted (integer) entry in this mapping;
    # left as-is because the template's handling is not visible here.
    N_NODES: 2  # Set by Health Runner
    GCS_BUCKET_NAME: ""  # Set by Health Runner
    UNIX_CLIENT_PREFIX: "/run/tcpx"
    USE_TCPX: "true"
    USE_FASTRAK: "false"
    # NOTE(review): differs from ncclPlugin.envs.NCCL_DEBUG_SUBSYS ("ENV")
    # and params.debug_subsys; confirm which one takes effect.
    NCCL_DEBUG_SUBSYS: "INIT,NET"
    NCCL_SOCKET_IFNAME: "eth0"
    # NOTE(review): literal string "NIL" (upper case, unlike HOSTS_CSV);
    # quoted for explicitness, parsed value unchanged.
    STRAGGLER_THRESHOLD_MS: "NIL"