infra/4-training/nemo-example/values.yaml (35 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for the NeMo training example.
#
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file. In particular, confirm that the consuming templates read
# `workload.embeddedTensorboardTarget` and `networking.ncclSettings` (rather
# than top-level keys of the same name).

# Optional (must have installed Kueue and pre-provisioned a local queue,
# see previous guide steps).
queue: null

volumes:
  ssdMountPath: "/ssd"

  pvcMounts:
    - name: cluster-filestore
      mountPath: "/nfs"

  # gcsMounts:
  #   - bucketName: "nemo-megatron-demo"
  #     mountPath: "/gcs"

  # Downloads or synchronizes contents of a GCS bucket folder on
  # initialization.
  gcsDownload:
    source: "gs://nemo-megatron-demo/training-data/tokenized/bpe2gpt/wikipedia/"
    target: "/ssd/.cache/"

workload:
  # Left empty here; must be supplied by the user (e.g. via an override).
  # NOTE(review): assumed from the empty default -- confirm.
  image: ""
  torchDistributedTarget: "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"

  gpus: 8  # This should be one of: {<= 8, multiple of 8}

  arguments:
    # These argument names will be prefixed with '+'
    # (see https://hydra.cc/docs/advanced/override_grammar/basic/)
    - name: "exp_manager.exp_dir"
      value: "/nfs/nemo-experiments/"
    - name: "model.data.data_prefix"
      value: "[1.0,/ssd/.cache/wikipedia-tokenized-for-gpt2]"

  # If not 'null', launches a Tensorboard server on first node. By design, the
  # job will then not exit on first node. This is primarily intended for
  # debugging purposes, when a shared file-system or external Tensorboard is
  # unavailable.
  embeddedTensorboardTarget: null

networking:
  # Kept as the quoted string "true" (not a YAML boolean) exactly as in the
  # original, in case the consumer compares against a string.
  enableTcpx: "true"  # required for optimal performance
  tcpxRepository: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx"
  tcpxDaemonVersion: "tcpgpudmarxd-dev:v2.0.9"
  tcpxPluginVersion: "nccl-plugin-gpudirecttcpx-dev:v3.1.6_2023_10_06"

  ncclSettings:
    - name: NCCL_DEBUG
      value: "VERSION"

    # The following NCCL settings are recommended (but tunable):
    - name: NCCL_MIN_NCHANNELS
      value: "8"
    - name: NCCL_MAX_NCHANNELS
      value: "8"
    - name: NCCL_SOCKET_NTHREADS
      value: "1"
    - name: NCCL_NSOCKS_PERTHREAD
      value: "4"