infra/4-training/nemo-example/templates/nemo-example.yaml (317 lines of code) (raw):
# Copyright 2024 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{ $timestamp := now | unixEpoch }}
{{ $jobsuffix := randAlphaNum 2 | lower }}
{{ $nodes := div .Values.workload.gpus 8 | max 1 }}
{{ $gpusPerNode := min .Values.workload.gpus 8 }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Release.Name }}"
data:
nemo-configuration.yaml: |-
{{ .Files.Get "selected-configuration.yaml" | nindent 4 }}
---
apiVersion: v1
kind: Service
metadata:
name: "{{ .Release.Name }}"
spec:
clusterIP: None
selector:
job-name: "{{ .Release.Name }}"
---
{{- $root := . -}}
apiVersion: batch/v1
kind: Job
metadata:
name: "{{ .Release.Name }}"
namespace: default
labels:
{{- if $root.Values.queue }}
kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}"
{{- end }}
spec:
{{- if $root.Values.queue }}
suspend: true
{{- end }}
parallelism: {{ $nodes }}
completions: {{ $nodes }}
completionMode: Indexed
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: nemo
networking.gke.io/default-interface: 'eth0'
networking.gke.io/interfaces: |
[
{"interfaceName":"eth0","network":"default"},
{"interfaceName":"eth1","network":"vpc1"},
{"interfaceName":"eth2","network":"vpc2"},
{"interfaceName":"eth3","network":"vpc3"},
{"interfaceName":"eth4","network":"vpc4"}
]
{{- if $root.Values.volumes.gcsMounts }}
gke-gcsfuse/volumes: "true"
{{- end}}
spec:
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
subdomain: "{{.Release.Name}}"
restartPolicy: Never
tolerations:
- operator: "Exists"
key: nvidia.com/gpu
- operator: "Exists"
key: cloud.google.com/impending-node-termination
volumes:
- name: nvidia-install-dir-host
hostPath:
path: /home/kubernetes/bin/nvidia
- name: lib64
hostPath:
path: /lib64
- name: tcpx-nccl-plugin-volume
emptyDir: {}
- name: tcpx-daemon-socket
hostPath:
path: /run/tcpx
- name: workload-terminated-volume
emptyDir: {}
- name: local-ssd
hostPath:
path: /mnt/stateful_partition/kube-ephemeral-ssd
{{- range $pvc := $root.Values.volumes.pvcMounts }}
- name: "{{ $pvc.name }}"
persistentVolumeClaim:
claimName: "{{ $pvc.name }}"
{{- end }}
{{- range $gcs := $root.Values.volumes.gcsMounts }}
- name: "{{ $gcs.bucketName }}"
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: "{{ $gcs.bucketName }}"
{{- end}}
- name: shared-memory
emptyDir:
medium: "Memory"
sizeLimit: 200Gi
- name: workload-configuration
configMap:
name: "{{.Release.Name}}"
initContainers:
- name: training-data-downloader
image: gcr.io/google.com/cloudsdktool/google-cloud-cli
volumeMounts:
- name: local-ssd
mountPath: "{{ $root.Values.volumes.ssdMountPath }}"
{{- range $pvc := $root.Values.volumes.pvcMounts }}
- name: "{{ $pvc.name }}"
mountPath: "{{ $pvc.mountPath }}"
{{- end }}
{{- range $gcs := $root.Values.volumes.gcsMounts }}
- name: "{{ $gcs.bucketName }}"
mountPath: "{{ $gcs.mountPath }}"
{{- end }}
env:
- name: GCS_DATA_SOURCE
value: "{{ $root.Values.gcsDownload.source }}"
- name: GCS_DATA_TARGET
value: "{{ $root.Values.gcsDownload.target }}"
command:
- /bin/sh
- -c
- |
echo "Caching training data from $GCS_DATA_SOURCE to $GCS_DATA_TARGET"
mkdir -p $GCS_DATA_TARGET
SECONDS=0
gcloud storage rsync \
--recursive \
$GCS_DATA_SOURCE $GCS_DATA_TARGET
duration=$SECONDS
echo "Transferred or synchronized $GCS_DATA_SOURCE to $GCS_DATA_TARGET in $duration seconds."
{{- if $root.Values.networking.enableTcpx }}
- name: tcpx-nccl-plugin-installer
image: "{{$root.Values.networking.tcpxRepository}}/{{$root.Values.networking.tcpxPluginVersion}}"
imagePullPolicy: Always
volumeMounts:
- name: tcpx-nccl-plugin-volume
mountPath: /var/lib/tcpx
command:
- /bin/sh
- -c
- |
/scripts/container_entry.sh install
{{- end }}
containers:
{{- if $root.Values.networking.enableTcpx }}
- name: tcpd-daemon
image: "{{$root.Values.networking.tcpxRepository}}/{{$root.Values.networking.tcpxDaemonVersion}}"
imagePullPolicy: Always
command:
- "bash"
- "-c"
- |
/tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm --gpu_shmem_type fd --setup_param "--verbose 128 5 0" &
while [ ! -e "/semaphore/workload_terminated" ]; do sleep 10; done
pkill -e "^"tcpgpudmarxd || true
sleep 15
securityContext:
privileged: true
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia
- name: tcpx-daemon-socket
mountPath: /tmp
- name: workload-terminated-volume
mountPath: /semaphore
env:
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
{{- end }}
- name: nemo
image: "{{ $root.Values.workload.image }}"
imagePullPolicy: Always
env:
- name: JOB_IDENTIFIER
value: "{{ .Release.Name }}-{{ $timestamp }}-{{ $jobsuffix }}"
# The following settings are specific to the Torch distributed launcher:
- name: TORCH_DISTRIBUTED_TARGET
value: "{{ $root.Values.workload.torchDistributedTarget }}"
- name: MASTER_ADDR
value: "{{.Release.Name}}-0.{{.Release.Name}}.default.svc.cluster.local"
- name: MASTER_PORT
value: "6002"
- name: WORLD_SIZE
value: "{{ $root.Values.workload.gpus }}"
- name: NNODES
value: "{{ $nodes }}"
- name: GPUS_PER_NODE
value: "{{ $gpusPerNode }}"
- name: GLOO_SOCKET_IFNAME
value: "eth0"
# The leader node can launch an embedded Tensorboard server (if needed)
{{- if $root.Values.workload.embeddedTensorboardTarget }}
- name: EMBEDDED_TENSORBOARD_TARGET
value: "{{ $root.Values.workload.embeddedTensorboardTarget}}"
{{- end }}
# The following arguments are passed to the Workload:
{{- range $environment_variable := $root.Values.workload.arguments }}
- name: "WORKLOAD_{{ $environment_variable.name }}"
value: "{{ $environment_variable.value }}"
{{- end }}
# Mount paths for volumes:
- name: SSD_MOUNT_PATH
value: "{{ $root.Values.volumes.ssdMountPath }}"
# The following NCCL settings should likely not be adjusted:
- name: NCCL_SOCKET_IFNAME
value: "eth0"
- name: NCCL_CHECK_POINTERS
value: "0"
- name: NCCL_DYNAMIC_CHUNK_SIZE
value: "524288"
- name: NCCL_P2P_NET_CHUNKSIZE
value: "524288"
- name: NCCL_P2P_PCI_CHUNKSIZE
value: "524288"
- name: NCCL_P2P_NVL_CHUNKSIZE
value: "1048576"
- name: NCCL_CROSS_NIC
value: "0"
- name: NCCL_ALGO
value: "Ring"
- name: NCCL_PROTO
value: "Simple"
- name: NCCL_NET_GDR_LEVEL
value: "PIX"
- name: NCCL_P2P_PXN_LEVEL
value: "0"
{{- range $environment_variable := $root.Values.networking.ncclSettings }}
- name: {{ $environment_variable.name }}
value: "{{ $environment_variable.value }}"
{{- end }}
{{- if $root.Values.networking.enableTcpx }}
# The following TCPx settings should likely not be adjusted:
- name: NCCL_GPUDIRECTTCPX_CTRL_DEV
value: "eth0"
- name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
value: "eth1,eth2,eth3,eth4"
- name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
- name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
- name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
value: "1000000"
- name: NCCL_GPUDIRECTTCPX_FORCE_ACK
value: "0"
- name: NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP
value: "1000"
{{- range $environment_variable := $root.Values.networking.tcpxSettings }}
- name: {{ $environment_variable.name }}
value: "{{ $environment_variable.value }}"
{{- end }}
{{- end }}
command:
- bash
- -c
- |
function on_script_completion {
# Note: This semaphore is used to terminate the TCPx side-car
touch /semaphore/workload_terminated
}
trap on_script_completion EXIT
echo "Pod on $(hostname --fqdn) is running"
echo "Pod is assigned job index of $JOB_COMPLETION_INDEX"
echo "Job ID is $JOB_IDENTIFIER"
# export LD_LIBRARY_PATH="/usr/local/tcpx/lib64:${LD_LIBRARY_PATH}"
export LD_LIBRARY_PATH="/usr/local/cuda/lib64/:/lib/x86_64-linux-gnu/:/usr/local/tcpx/lib64:$LD_LIBRARY_PATH"
# sleep infinity
echo "Warning: Set LD_LIBRARY_PATH=$LD_LIBRARY_PATH to override the NCCL library"
touch $SSD_MOUNT_PATH/hello-from-$HOSTNAME.txt
echo "Local SSD contents (path $SSD_MOUNT_PATH):"; ls $SSD_MOUNT_PATH | sed 's/^/ /'
echo "NeMo configuration file:"
cat /etc/workload-configuration/nemo-configuration.yaml | sed 's/^/| /'
echo ""
readarray -d "" workload_arguments < <(env | grep -e "^WORKLOAD_" | sed 's/^WORKLOAD_/+/' | tr '\n' '\0')
echo "Detected the following additional workload arguments:"
for workload_argument in "${workload_arguments[@]}"; do
echo " $workload_argument"
done
sleep 10 # <- Hack to allow some time for service to boot
export NODE_RANK=$JOB_COMPLETION_INDEX
echo "Launching Torch distributed as node rank $NODE_RANK out of $NNODES nodes"
for ((LOCAL_RANK=0; LOCAL_RANK <= $((GPUS_PER_NODE - 1)); LOCAL_RANK++)); do
RANK=$((8*$NODE_RANK + $LOCAL_RANK))
OMP_NUM_THREADS=12 RANK=$RANK LOCAL_RANK=$LOCAL_RANK \
/usr/bin/python3 $TORCH_DISTRIBUTED_TARGET \
--config-path="/etc/workload-configuration" \
--config-name="nemo-configuration.yaml" \
+trainer.num_nodes="$NNODES" \
+exp_manager.version="$JOB_IDENTIFIER" \
${workload_arguments[@]} &
TORCH_PIDS[$LOCAL_RANK]=$!
done
if [ "$NODE_RANK" -eq "0" ] && { ! [ -z ${EMBEDDED_TENSORBOARD_TARGET} ]; }; then
echo "Launching an embedded Tensorboard against log directory $EMBEDDED_TENSORBOARD_TARGET"
tensorboard --logdir /nfs/nemo-experiments &
fi
wait # <-- This will wait on Tensorboard (if it exists)
echo "Pod on $(hostname --fqdn) is exiting"
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia
- name: tcpx-nccl-plugin-volume
mountPath: /usr/local/tcpx
- name: tcpx-daemon-socket
mountPath: /tmp
- name: workload-terminated-volume
mountPath: /semaphore
- name: workload-configuration
mountPath: /etc/workload-configuration
- name: shared-memory
mountPath: /dev/shm
- name: local-ssd
mountPath: "{{ $root.Values.volumes.ssdMountPath }}"
{{- range $pvc := $root.Values.volumes.pvcMounts }}
- name: "{{ $pvc.name }}"
mountPath: "{{ $pvc.mountPath }}"
{{- end }}
{{- range $gcs := $root.Values.volumes.gcsMounts }}
- name: "{{ $gcs.bucketName }}"
mountPath: "{{ $gcs.mountPath }}"
{{- end }}
resources:
limits:
nvidia.com/gpu: {{ $gpusPerNode }}
---