sample_workloads/megatron-gke/helm/values.yaml:
queue: "multislice-queue"
volumes:
# The VM host path for SSDs is assumed to be /mnt/stateful_partition/kube-ephemeral-ssd
ssdMountPath: "/ssd"
# This mounts any persistent volume claims present in the cluster:
# pvcMounts:
# - name: <shared-file-system>
# mountPath: "/nfs"
# CHANGE the `bucketName` value below to a user-defined value.
# This requires the GCS FUSE driver to be installed on the cluster.
gcsMounts:
- bucketName: $USER-test-megatron
mountPath: "/gcs"
gcsDownload: # downloads or synchronizes contents of a GCS bucket folder on initialization
source: "gs://nemo-megatron-demo/training-data/tokenized/sentencepiece-llama2/wikipedia"
target: "/ssd/.cache/"
workload:
# CHANGE the `image` value below to a user-defined value.
image: $REGION-docker.pkg.dev/$PROJECT_ID/$USER-test-megatron/pytorch-megatron:23.11-py3
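# For example (registry, project, and user are placeholders), the image above is
# expected to have been pushed beforehand with something like:
#   docker push us-central1-docker.pkg.dev/my-project/alice-test-megatron/pytorch-megatron:23.11-py3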
torchDistributedTarget: "/workspace/Megatron-LM/pretrain_gpt.py"
# CHANGE the `gcsBucketForDataCataPath` value to a user-defined value.
# It will be mounted at /nfs on container startup using GCS FUSE.
gcsBucketForDataCataPath: $USER-test-megatron
gpus: 128 # Must be either <= 8 or a multiple of 8
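# For example, gpus: 128 on A3 machines (8 GPUs per node) corresponds to 16 nodes.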
arguments:
# Each argument name will be prefixed with '--' (see the rendered example after this list)
- name: "data-path"
value: "/ssd/.cache/wikipedia-tokenized-for-llama2"
- name: "data-cache-path"
value: "/gcs/.cache/wikipedia-tokenized-for-llama2"
- name: "tokenizer-model"
value: "/ssd/.cache/llama-2-7b-megatron-checkpoint/tokenizer.model"
# If not 'null', launches a TensorBoard server on the first node. By design, the job will then not exit on the first node.
# This is primarily intended for debugging purposes, when a shared file system or external TensorBoard is unavailable.
embeddedTensorboardTarget: null
network:
# Do not change. This template is designed for A3+ machines.
stack: "tcpxo" # one of {"tcp", "tcpx", "tcpxo"}
# NCCL binaries
ncclRepository: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev"
ncclVersion: "v1.0.1"
# Receive daemon (tcpx or tcpxo)
netRxDaemonRepository: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev"
netRxDaemonVersion: "v1.0.8"
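# A sketch of the full image references the values above combine into
# (presumably joined by the chart templates):
#   us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.1
#   us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.8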
ncclSettings:
- name: NCCL_DEBUG
value: "VERSION"
# The following NCCL settings are recommended for tcpxo only (but are tunable):
- name: NCCL_MIN_NCHANNELS
value: "16"
- name: NCCL_MAX_NCHANNELS
value: "16"