deploy/helm/health_checks/tinymax_healthcheck/a3plus.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Helm values for the tinymax health check on A3+ (a3-megagpu-8g) nodes.

job:
  base_name: "chs-hc"
  # guid: "xckd"  # Can specify a GUID if desired. Otherwise, a random GUID will be generated.
  # check_time: "1590303600"  # Will automatically be set if not given

health_check:
  name: "tinymax"
  image:
    repo: "us-docker.pkg.dev/gce-ai-infra/health-check/tinymax-healthcheck"
    tag: "a3-megagpu-8g_4.4.0"
    pull_policy: "Always"
  test_label:
    name: "aiinfra/tinymax-healthcheck-test"
    value: "true"
  # All env values are kept as quoted strings so that numeric- and
  # boolean-looking settings are not retyped by the YAML parser.
  env:
    # A3+ instance type is a3-megagpu-8g
    INSTANCE_TYPE: "a3-megagpu-8g"
    HEALTH_VALIDITY_HOURS: "24"
    DRY_RUN: "true"
    NHOSTS: "1"
    nr: "8"
    # Specific to A3+
    LD_LIBRARY_PATH: /usr/local/nvidia/lib64
    USE_TCPX: "false"
    USE_FASTRAK: "true"
    # Note A3+ (a3-megagpu-8g) has no UNIX_CLIENT_PREFIX
    NCCL_LIB_DIR: /usr/local/nvidia/lib64
    NCCL_FASTRAK_USE_SNAP: "1"
    NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL: "0"
    NCCL_FASTRAK_NUM_FLOWS: "2"
    NCCL_DEBUG: "INFO"
    NCCL_DEBUG_SUBSYS: "INIT,NET"
    JAX_COORDINATOR_ADDRESS: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
    USE_GPUDIRECT: "fastrak"
    GPUS_PER_NODE: "8"
    JAX_COORDINATOR_PORT: "6002"
    NCCL_FASTRAK_CTRL_DEV: "eth0"
    NCCL_FASTRAK_IFNAME: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8"
    NCCL_SOCKET_IFNAME: "eth0"
    NCCL_CROSS_NIC: "0"
    NCCL_ALGO: "Ring,Tree"
    NCCL_PROTO: "Simple"
    NCCL_MIN_NCHANNELS: "4"
    NCCL_DYNAMIC_CHUNK_SIZE: "524288"
    NCCL_P2P_NET_CHUNKSIZE: "524288"
    NCCL_P2P_PCI_CHUNKSIZE: "524288"
    NCCL_P2P_NVL_CHUNKSIZE: "1048576"
    NCCL_BUFFSIZE: "8388608"
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    NCCL_NET_GDR_LEVEL: "PIX"
    NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING: "0"
    NCCL_FASTRAK_USE_LLCM: "1"
    TF_CPP_VMODULE: "profile_guided_latency_estimator=10"
    TF_CPP_MIN_LOG_LEVEL: "0"
    TF_CPP_MAX_LOG_LEVEL: "100"
    XLA_PYTHON_CLIENT_MEM_FRACTION: "0.94"
    CUDA_DEVICE_MAX_CONNECTIONS: "1"
    NVTE_FUSED_ATTN: "1"
    NCCL_NVLS_ENABLE: "0"
    NCCL_TUNER_PLUGIN: "libnccl-tuner.so"
    NCCL_TUNER_CONFIG_PATH: "/usr/local/nvidia/lib64/a3plus_tuner_config.textproto"
    NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE: "/usr/local/nvidia/lib64/a3plus_guest_config.textproto"
    NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS: "600000"
  # Each mount name below refers to an entry in `volumes`.
  volumeMounts:
    - name: nvidia-install-dir-host
      mountPath: /usr/local/nvidia/lib64
    - name: tcpxo-nccl-plugin-volume
      mountPath: /usr/local/tcpxo
    - name: tcpd-socket
      mountPath: /tmp
    - name: workload-terminated-volume
      mountPath: /usr/share/nemo
    - name: shared-memory
      mountPath: /dev/shm
    - name: varlog
      mountPath: /var/log

# Configuration specific to a3plus
volumes:
  - name: nvidia-install-dir-host
    hostPath:
      path: /home/kubernetes/bin/nvidia/lib64
  - name: tcpd-socket
    hostPath:
      path: /run/tcpxo
  - name: tcpxo-nccl-plugin-volume
    emptyDir: {}
  - name: workload-terminated-volume
    emptyDir: {}
  - name: shared-memory
    emptyDir:
      medium: "Memory"
      sizeLimit: 250Gi
  - name: varlog
    emptyDir: {}
  - name: fluentbit-key
    secret:
      secretName: fluentbit-key
      optional: true

initContainers:
  nccl_plugin_installer:
    name: "nccl-plugin-installer"
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.6"
    imagePullPolicy: "Always"
    # NOTE(review): this is a single mapping, not a list like the
    # volumeMounts of tcpd_daemon below - presumably the chart template
    # reads .name / .mountPath directly; verify before changing the shape.
    volumeMounts:
      name: "tcpxo-nccl-plugin-volume"
      mountPath: "/usr/local/tcpxo"
  tcpd_daemon:
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.12"
    imagePullPolicy: "Always"
    command:
      - "bash"
    # The script starts the rxdm entrypoint in the background, then polls
    # once per second for /usr/share/nemo/workload_terminated; when that file
    # appears, or on SIGTERM, it kills tcpgpudmarxd and exits 0.
    # NOTE(review): cleanup() ends with `exit 0`, so the trailing `sleep 30`
    # is never reached - confirm whether that is intended.
    args:
      - "-c"
      - |
        set -ex
        chmod 755 /fts/entrypoint_rxdm_container.sh
        /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr &
        cleanup() {
          echo "Received SIGTERM or workload terminated, starting cleanup..."
          pkill -e "^"tcpgpudmarxd || true
          exit 0
        }
        trap cleanup SIGTERM
        while [ ! -e "/usr/share/nemo/workload_terminated" ]; do sleep 1; done
        cleanup
        sleep 30
    volumeMounts:
      - name: nvidia-install-dir-host
        mountPath: /usr/local/nvidia/lib64
      - name: tcpd-socket
        mountPath: /tmp
      - name: workload-terminated-volume
        mountPath: /usr/share/nemo

deploy/helm/health_checks/tinymax_healthcheck/a3plus.yaml (126 lines of code) (raw):