# deploy/helm/health_checks/tinymax_healthcheck/a3plus.yaml (126 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Job settings; base_name is the only value set by default.
job:
  base_name: "chs-hc"
  # Optional: uncomment to specify a GUID. Otherwise, a random GUID is generated.
  # guid: "xckd"
  # Optional: uncomment to specify the check time. It is set automatically if not given.
  # check_time: "1590303600"
# Values for the "tinymax" health check: container image, test label,
# container environment, and volume mounts.
health_check:
  name: "tinymax"
  image:
    repo: "us-docker.pkg.dev/gce-ai-infra/health-check/tinymax-healthcheck"
    tag: "a3-megagpu-8g_4.4.0"
    pull_policy: "Always"
  test_label:
    name: "aiinfra/tinymax-healthcheck-test"
    value: "true"
  # Every value is a string, including numeric- and boolean-looking ones.
  env:
    # a3-megagpu-8g is the instance type for A3+.
    INSTANCE_TYPE: "a3-megagpu-8g"
    HEALTH_VALIDITY_HOURS: "24"
    DRY_RUN: "true"
    NHOSTS: "1"
    nr: "8"
    # A3+-specific settings.
    LD_LIBRARY_PATH: "/usr/local/nvidia/lib64"
    USE_TCPX: "false"
    USE_FASTRAK: "true"
    # No UNIX_CLIENT_PREFIX here: A3+ (a3-megagpu-8g) does not have one.
    NCCL_LIB_DIR: "/usr/local/nvidia/lib64"
    NCCL_FASTRAK_USE_SNAP: "1"
    NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL: "0"
    NCCL_FASTRAK_NUM_FLOWS: "2"
    NCCL_DEBUG: "INFO"
    NCCL_DEBUG_SUBSYS: "INIT,NET"
    # NOTE(review): $(JOBSET_NAME) and $(REPLICATED_JOB_NAME) are not defined
    # in this file; presumably they are provided where the chart renders the
    # container env - confirm against the template.
    JAX_COORDINATOR_ADDRESS: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
    USE_GPUDIRECT: "fastrak"
    GPUS_PER_NODE: "8"
    JAX_COORDINATOR_PORT: "6002"
    NCCL_FASTRAK_CTRL_DEV: "eth0"
    NCCL_FASTRAK_IFNAME: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8"
    NCCL_SOCKET_IFNAME: "eth0"
    NCCL_CROSS_NIC: "0"
    NCCL_ALGO: "Ring,Tree"
    NCCL_PROTO: "Simple"
    NCCL_MIN_NCHANNELS: "4"
    NCCL_DYNAMIC_CHUNK_SIZE: "524288"
    NCCL_P2P_NET_CHUNKSIZE: "524288"
    NCCL_P2P_PCI_CHUNKSIZE: "524288"
    NCCL_P2P_NVL_CHUNKSIZE: "1048576"
    NCCL_BUFFSIZE: "8388608"
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    NCCL_NET_GDR_LEVEL: "PIX"
    NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING: "0"
    NCCL_FASTRAK_USE_LLCM: "1"
    TF_CPP_VMODULE: "profile_guided_latency_estimator=10"
    TF_CPP_MIN_LOG_LEVEL: "0"
    TF_CPP_MAX_LOG_LEVEL: "100"
    XLA_PYTHON_CLIENT_MEM_FRACTION: "0.94"
    CUDA_DEVICE_MAX_CONNECTIONS: "1"
    NVTE_FUSED_ATTN: "1"
    NCCL_NVLS_ENABLE: "0"
    NCCL_TUNER_PLUGIN: "libnccl-tuner.so"
    # Both config files live under the /usr/local/nvidia/lib64 mount below.
    NCCL_TUNER_CONFIG_PATH: "/usr/local/nvidia/lib64/a3plus_tuner_config.textproto"
    NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE: "/usr/local/nvidia/lib64/a3plus_guest_config.textproto"
    NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS: "600000"
  # Each name matches an entry in the `volumes` list in this file.
  volumeMounts:
    - name: "nvidia-install-dir-host"
      mountPath: "/usr/local/nvidia/lib64"
    - name: "tcpxo-nccl-plugin-volume"
      mountPath: "/usr/local/tcpxo"
    - name: "tcpd-socket"
      mountPath: "/tmp"
    - name: "workload-terminated-volume"
      mountPath: "/usr/share/nemo"
    - name: "shared-memory"
      mountPath: "/dev/shm"
    - name: "varlog"
      mountPath: "/var/log"
# a3plus-specific configuration follows.
# Volumes referenced by name from the volumeMounts entries in this file.
volumes:
  # Host directories.
  - name: "nvidia-install-dir-host"
    hostPath:
      path: "/home/kubernetes/bin/nvidia/lib64"
  - name: "tcpd-socket"
    hostPath:
      path: "/run/tcpxo"
  # Scratch volumes.
  - name: "tcpxo-nccl-plugin-volume"
    emptyDir: {}
  - name: "workload-terminated-volume"
    emptyDir: {}
  # Memory-backed emptyDir, capped at 250Gi.
  - name: "shared-memory"
    emptyDir:
      medium: "Memory"
      sizeLimit: "250Gi"
  - name: "varlog"
    emptyDir: {}
  # Optional secret.
  # NOTE(review): no volumeMounts entry in this file references fluentbit-key;
  # presumably it is mounted elsewhere in the chart - confirm.
  - name: "fluentbit-key"
    secret:
      secretName: "fluentbit-key"
      optional: true
# Containers that accompany the health check on a3plus.
initContainers:
  # Mounts tcpxo-nccl-plugin-volume at /usr/local/tcpxo, the same path used
  # by the health check's volumeMounts.
  nccl_plugin_installer:
    name: "nccl-plugin-installer"
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.6"
    imagePullPolicy: "Always"
    # NOTE(review): a single mapping, not a list like the other volumeMounts
    # in this file; presumably the chart template reads .name and .mountPath
    # directly - confirm before changing the shape.
    volumeMounts:
      name: "tcpxo-nccl-plugin-volume"
      mountPath: "/usr/local/tcpxo"
  # Starts /fts/entrypoint_rxdm_container.sh in the background, then waits for
  # either SIGTERM or the file /usr/share/nemo/workload_terminated to appear.
  # On either event it kills processes whose name starts with "tcpgpudmarxd"
  # and exits with status 0.
  tcpd_daemon:
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.12"
    imagePullPolicy: "Always"
    command:
      - "bash"
    # cleanup ends with `exit 0`, which terminates the script, so the final
    # `cleanup` call must stay the last command: anything placed after it
    # would never run.
    args:
      - "-c"
      - |
        set -ex
        chmod 755 /fts/entrypoint_rxdm_container.sh
        /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr &
        cleanup() {
          echo "Received SIGTERM or workload terminated, starting cleanup..."
          pkill -e "^"tcpgpudmarxd || true
          exit 0
        }
        trap cleanup SIGTERM
        while [ ! -e "/usr/share/nemo/workload_terminated" ]; do sleep 1; done
        cleanup
    # /usr/share/nemo is where the script above looks for the
    # workload_terminated marker file.
    volumeMounts:
      - name: "nvidia-install-dir-host"
        mountPath: "/usr/local/nvidia/lib64"
      - name: "tcpd-socket"
        mountPath: "/tmp"
      - name: "workload-terminated-volume"
        mountPath: "/usr/share/nemo"