# deploy/helm/health_checks/nccl_healthcheck/a3.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Identity of a health-check run: a fixed base name plus an optional GUID and
# timestamp (both auto-generated/auto-set when omitted, per the notes below).
job:
  base_name: "chs-hc"
  # guid: "xckd" # Can specify a GUID if desired. Otherwise, a random GUID will be generated.
  # check_time: "1590303600" # Will automatically be set if not given
# NCCL bandwidth health check for A3 (a3-highgpu-8g) nodes using GPUDirect-TCPX.
health_check:
  name: "nccl"
  image:
    repo: "us-docker.pkg.dev/gce-ai-infra/health-check/nccl-healthcheck"
    tag: "a3-highgpu-8g_4.4.0"
    pull_policy: "Always"
  # Environment for the health-check container. Every value is a quoted string
  # so the YAML parser never retypes boolean/number-looking values.
  env:
    # A3 instance type is a3-highgpu-8g
    INSTANCE_TYPE: "a3-highgpu-8g"
    ENABLE_TWO_PASS_STRATEGY: "true"
    HEALTH_VALIDITY_HOURS: "24"
    DRY_RUN: "true"
    START_MESSAGE_SIZE: "2G"
    END_MESSAGE_SIZE: "8G"
    TEST_ITERATIONS: "3"
    NHOSTS: "2"
    # NOTE(review): lone lower-case key among upper-case env names — presumably
    # consumed verbatim by the chart/test driver; confirm the expected casing
    # before renaming.
    nr: "8"
    # Specific to A3
    LD_LIBRARY_PATH: "/usr/local/nvidia/lib64"  # quoted for consistency; same parsed value
    BANDWIDTH_THRESHOLD: "60"
    USE_TCPX: "true"
    USE_FASTRAK: "false"
    UNIX_CLIENT_PREFIX: "/run/tcpx"
    NCCL_DEBUG: "INFO"
    NCCL_DEBUG_SUBSYS: "INIT,NET"
    # Note A3 (a3-highgpu-8g) has no NCCL_LIB_DIR
    # Note A3 (a3-highgpu-8g) has no NCCL_FASTRAK_USE_SNAP
    # Note A3 (a3-highgpu-8g) has no NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL
    # Note A3 (a3-highgpu-8g) has no NCCL_FASTRAK_NUM_FLOWS
  # Mount points the health-check container expects; each name must match an
  # entry in the top-level volumes list.
  volumeMounts:
    - name: nvidia-install-dir-host
      mountPath: /usr/local/nvidia/lib64
    - name: tcpx-nccl-plugin-volume
      mountPath: /usr/local/tcpx
    - name: tcpd-socket
      mountPath: /run/tcpx
    - name: workload-terminated-volume
      mountPath: /usr/share/nemo
    - name: shared-memory
      mountPath: /dev/shm
    - name: varlog
      mountPath: /var/log
# Volume configuration specific to A3 (a3-highgpu-8g)
volumes:
  # Host-installed NVIDIA libraries (GKE driver install path).
  - name: nvidia-install-dir-host
    hostPath:
      path: /home/kubernetes/bin/nvidia/lib64
  # Unix-domain sockets shared with the tcpgpudmarxd daemon (--uds_path /run/tcpx).
  - name: tcpd-socket
    hostPath:
      path: /run/tcpx
  # Scratch volume the nccl-plugin-installer init container fills for /usr/local/tcpx.
  - name: tcpx-nccl-plugin-volume
    emptyDir: {}
  # Sentinel dir: the daemon script waits for /usr/share/nemo/workload_terminated here.
  - name: workload-terminated-volume
    emptyDir: {}
  # Memory-backed /dev/shm (200Gi cap) for shared-memory transport.
  - name: shared-memory
    emptyDir:
      medium: "Memory"
      sizeLimit: 200Gi
  # Scratch space mounted at /var/log in the health-check container.
  - name: varlog
    emptyDir: {}
  # Optional secret; tolerated when absent (optional: true).
  - name: fluentbit-key
    secret:
      secretName: fluentbit-key
      optional: true
initContainers:
  # Stages the GPUDirect-TCPX NCCL plugin into tcpx-nccl-plugin-volume so the
  # health-check container can load it from /usr/local/tcpx.
  nccl_plugin_installer:
    name: "nccl-plugin-installer"
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.7"
    imagePullPolicy: "Always"
    # NOTE(review): a single mapping here, while tcpd_daemon.volumeMounts is a
    # list — presumably the template renders this one mount itself; confirm
    # against the consuming template before changing the shape.
    volumeMounts:
      name: "tcpx-nccl-plugin-volume"
      mountPath: "/usr/local/tcpx"
  # Runs the tcpgpudmarxd receive-datapath daemon in the background, then
  # blocks until either SIGTERM arrives or the workload drops the
  # workload_terminated sentinel file, and kills the daemon on the way out.
  tcpd_daemon:
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.12"
    imagePullPolicy: "Always"
    command:
      - "bash"
    args:
      - "-c"
      - |
        /tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm --gpu_shmem_type fd --setup_param "--verbose 128 2 0" --uds_path /run/tcpx &
        cleanup() {
          echo "Received SIGTERM or workload terminated, starting cleanup..."
          pkill -e "^"tcpgpudmarxd || true
          exit 0
        }
        trap cleanup SIGTERM
        while [ ! -e "/usr/share/nemo/workload_terminated" ]; do sleep 1; done
        cleanup
    volumeMounts:
      - name: nvidia-install-dir-host
        mountPath: /usr/local/nvidia/lib64
      - name: tcpx-nccl-plugin-volume
        mountPath: /usr/local/tcpx
      - name: tcpd-socket
        mountPath: /run/tcpx
      - name: workload-terminated-volume
        mountPath: /usr/share/nemo