infra/4-training/nemo-example/values.yaml (35 lines of code) (raw):
# Copyright 2024 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Optional Kueue local queue; null leaves it unset. Setting a queue requires
# that Kueue is installed and a local queue has been pre-provisioned (see the
# previous guide steps).
queue: null
# Volume mounts used by the example.
volumes:
# Mount path of the SSD volume; the gcsDownload target ("/ssd/.cache/") sits
# under this path.
ssdMountPath: "/ssd"
# Claim mounts, one entry per claim: `name` plus the `mountPath` it is mounted
# at (presumably `name` is an existing PersistentVolumeClaim — confirm against
# the chart templates).
pvcMounts:
- name: cluster-filestore
mountPath: "/nfs"
# Optional GCS bucket mounts, disabled by default (commented-out example):
# gcsMounts:
# - bucketName: "nemo-megatron-demo"
# mountPath: "/gcs"
# Downloads or synchronizes the contents of a GCS bucket folder on initialization.
gcsDownload:
# GCS bucket folder to read from.
source: "gs://nemo-megatron-demo/training-data/tokenized/bpe2gpt/wikipedia/"
# Destination directory. It sits under the "/ssd" mount path, and the
# "model.data.data_prefix" workload argument points at data inside it.
target: "/ssd/.cache/"
# Training workload definition.
workload:
# Container image for the workload. Empty here — presumably it must be supplied
# at install time (TODO confirm against the guide / chart templates).
image: ""
# Path of the NeMo pretraining script to run (the key name suggests it is
# launched through torch distributed — confirm in the chart templates).
torchDistributedTarget: "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
gpus: 8 # Must be either at most 8, or a multiple of 8
arguments:
# Each argument name will be prefixed with '+' (see https://hydra.cc/docs/advanced/override_grammar/basic/)
# Experiment directory under "/nfs", the mount path of the cluster-filestore claim.
- name: "exp_manager.exp_dir"
value: "/nfs/nemo-experiments/"
# Data prefix inside "/ssd/.cache/", the gcsDownload target directory.
- name: "model.data.data_prefix"
value: "[1.0,/ssd/.cache/wikipedia-tokenized-for-gpt2]"
# If not 'null', launches a Tensorboard server on the first node. By design, the job will then not exit on the first node.
# This is primarily intended for debugging purposes, when a shared file-system or external Tensorboard is unavailable.
embeddedTensorboardTarget: null
# Networking (TCPX) and NCCL configuration.
networking:
# NOTE(review): this is the quoted string "true", not a YAML boolean —
# presumably the chart compares against the string; confirm before changing
# the type.
enableTcpx: "true" # required for optimal performance
# Image repository, followed by the "<image>:<tag>" names of the TCPX daemon
# and the NCCL TCPX plugin (presumably joined as <repository>/<version> —
# confirm in the chart templates).
tcpxRepository: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx"
tcpxDaemonVersion: "tcpgpudmarxd-dev:v2.0.9"
tcpxPluginVersion: "nccl-plugin-gpudirecttcpx-dev:v3.1.6_2023_10_06"
# Name/value pairs named after NCCL environment variables; every value is a
# quoted string (presumably exported into the training containers — confirm).
ncclSettings:
- name: NCCL_DEBUG
value: "VERSION"
# The following NCCL settings are recommended (but tunable):
- name: NCCL_MIN_NCHANNELS
value: "8"
- name: NCCL_MAX_NCHANNELS
value: "8"
- name: NCCL_SOCKET_NTHREADS
value: "1"
- name: NCCL_NSOCKS_PERTHREAD
value: "4"