# deploy/helm/health_checks/nccl_healthcheck/a3ultra.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Job-level settings. Only base_name is set here; the optional keys below are
# left commented out so that they are filled in automatically.
job:
  base_name: "chs-hc"
  # guid: "xckd"  # Can specify a GUID if desired. Otherwise, a random GUID will be generated.
  # check_time: "1590303600"  # Will automatically be set if not given
# NCCL health check settings: container image, env values and volume mounts.
health_check:
  name: "nccl"
  image:
    repo: "us-docker.pkg.dev/gce-ai-infra/health-check/nccl-healthcheck"
    tag: "a3-ultragpu-8g_4.4.0"
    pull_policy: "Always"
  env:
    # A3 Ultra instance type is a3-ultragpu-8g
    INSTANCE_TYPE: "a3-ultragpu-8g"
    ENABLE_TWO_PASS_STRATEGY: "true"
    HEALTH_VALIDITY_HOURS: "24"
    DRY_RUN: "true"
    START_MESSAGE_SIZE: "2G"
    END_MESSAGE_SIZE: "8G"
    TEST_ITERATIONS: "3"
    NHOSTS: "2"
    # presumably the number of ranks/GPUs per host - TODO confirm
    nr: "8"
    # Specific to A3 Ultra (a3-ultragpu-8g): both directories below are
    # provided by the usr-local-gib and library-dir-host volume mounts.
    LD_LIBRARY_PATH: "/usr/local/gib/lib64:/usr/local/nvidia/lib64"
    BANDWIDTH_THRESHOLD: "250"
    USE_TCPX: "false"
    USE_FASTRAK: "false"
    # NOTE(review): UNIX_CLIENT_PREFIX is not set here. The original note said
    # that A3+ (a3-megagpu-8g) has no UNIX_CLIENT_PREFIX - confirm the same
    # holds for a3-ultragpu-8g.
    NCCL_LIB_DIR: "/usr/local/nvidia/lib64"
    # NOTE(review): FasTrak settings are present although USE_FASTRAK is
    # "false" - confirm they are still required for this instance type.
    NCCL_FASTRAK_USE_SNAP: "1"
    NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL: "0"
    NCCL_FASTRAK_NUM_FLOWS: "2"
    NCCL_DEBUG: "INFO"
    NCCL_DEBUG_SUBSYS: "INIT,NET"
  # Each name must match an entry in the top-level `volumes` list.
  volumeMounts:
    - name: usr-local-gib
      mountPath: /usr/local/gib
    - name: library-dir-host
      mountPath: /usr/local/nvidia
    - name: shared-memory
      mountPath: /dev/shm
    - name: varlog
      mountPath: /var/log
# Configuration specific to a3ultra
# Pod volumes. The two hostPath entries expose directories from the node;
# the remaining entries are emptyDir or secret volumes.
volumes:
  - name: library-dir-host
    hostPath:
      path: /home/kubernetes/bin/nvidia
  # Memory-backed emptyDir, mounted at /dev/shm by the health check.
  - name: shared-memory
    emptyDir:
      medium: "Memory"
      sizeLimit: 250Gi
  - name: usr-local-gib
    hostPath:
      path: /home/kubernetes/bin/gib
  # NOTE(review): run-sshd and fluentbit-key have no matching volumeMounts in
  # this file - presumably they are mounted by the chart templates; confirm.
  - name: run-sshd
    emptyDir: {}
  - name: varlog
    emptyDir: {}
  - name: fluentbit-key
    secret:
      secretName: fluentbit-key
      # The secret is optional (optional: true).
      optional: true
# Init container settings, keyed by an identifier for each container.
initContainers:
  nccl_plugin_installer:
    name: "nccl-plugin-installer"
    image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.2"
    imagePullPolicy: "Always"
    # NOTE(review): this is a single mapping, not a list of mounts -
    # presumably the chart template reads .name and .mountPath directly.
    # Confirm against the template before converting it to a list.
    volumeMounts:
      name: "usr-local-gib"
      mountPath: "/usr/local/gib"