sample_workloads/megatron-gke/helm/templates/megatron-example.yaml (431 lines of code) (raw):

{{ $timestamp := now | unixEpoch }} {{ $jobSuffix := randAlphaNum 4 | lower }} {{ $netRxRepository := "" }} {{ $netNcclRepository := "" }} {{ if eq .Values.network.stack "tcpx" }} {{ $netRxRepository = "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev" }} {{ $netNcclRepository = "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev" }} {{ end }} {{ if eq .Values.network.stack "tcpxo" }} {{ $netRxRepository = "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev" }} {{ $netNcclRepository = "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.1" }} {{ end }} {{ $nodes := div .Values.workload.gpus 8 | max 1 }} {{ $gpusPerNode := min .Values.workload.gpus 8 }} --- apiVersion: v1 kind: ConfigMap metadata: name: "{{ .Release.Name }}" data: megatron-configuration.sh: |- {{ .Files.Get "selected-configuration.sh" | nindent 4 }} --- apiVersion: v1 kind: Service metadata: name: "{{ .Release.Name }}" spec: clusterIP: None selector: job-name: "{{ .Release.Name }}" --- {{- $root := . -}} apiVersion: batch/v1 kind: Job metadata: name: "{{ .Release.Name }}" namespace: default labels: {{- if $root.Values.queue }} kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" {{- end }} spec: {{- if $root.Values.queue }} suspend: true {{- end }} parallelism: {{ $nodes }} completions: {{ $nodes }} completionMode: Indexed template: metadata: annotations: kubectl.kubernetes.io/default-container: megatron {{- if $root.Values.volumes.gcsMounts }} gke-gcsfuse/volumes: "true" {{- end}} spec: schedulingGates: - name: "gke.io/topology-aware-auto-scheduling" hostNetwork: true dnsPolicy: ClusterFirstWithHostNet subdomain: "{{.Release.Name}}" restartPolicy: Never # This is leveraged for topology compact GKE launches {{ if $root.Values.targetNodes }} affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname operator: In values: {{- range $hostname := $root.Values.targetNodes }} - {{ $hostname }} {{- end }} {{ end }} tolerations: - operator: "Exists" key: nvidia.com/gpu - operator: "Exists" key: cloud.google.com/impending-node-termination volumes: - name: nvidia-install-dir-host hostPath: path: /home/kubernetes/bin/nvidia - name: nccl-plugin-volume emptyDir: {} {{ if ne $root.Values.network.stack "tcp" }} - name: tcpx-daemon-socket hostPath: path: /run/tcpx {{ end }} - name: workload-configuration configMap: name: "{{.Release.Name}}" - name: workload-terminated-volume emptyDir: {} - name: local-ssd hostPath: path: /mnt/stateful_partition/kube-ephemeral-ssd - name: shared-memory emptyDir: medium: "Memory" sizeLimit: 250Gi - name: dmabuf hostPath: path: /dev/dmabuf_import_helper type: CharDevice {{- range $pvc := $root.Values.volumes.pvcMounts }} - name: "{{ $pvc.name }}" persistentVolumeClaim: claimName: "{{ $pvc.name }}" {{- end }} {{- range $gcs := $root.Values.volumes.gcsMounts }} - name: "{{ $gcs.bucketName }}" csi: driver: gcsfuse.csi.storage.gke.io volumeAttributes: bucketName: "{{ $gcs.bucketName }}" {{- end}} initContainers: - name: training-data-downloader image: gcr.io/google.com/cloudsdktool/google-cloud-cli volumeMounts: - name: local-ssd mountPath: "{{ $root.Values.volumes.ssdMountPath }}" {{- range $pvc := $root.Values.volumes.pvcMounts }} - name: "{{ $pvc.name }}" mountPath: "{{ $pvc.mountPath }}" {{- end }} {{- range $gcs := $root.Values.volumes.gcsMounts }} - name: "{{ $gcs.bucketName }}" mountPath: "{{ $gcs.mountPath }}" {{- end }} env: - name: GCS_DATA_SOURCE value: "{{ $root.Values.gcsDownload.source }}" - name: GCS_DATA_TARGET value: "{{ $root.Values.gcsDownload.target }}" command: - /bin/sh - -c - | echo "Caching training data from $GCS_DATA_SOURCE to $GCS_DATA_TARGET" mkdir -p $GCS_DATA_TARGET SECONDS=0 gcloud storage rsync \ --recursive \ $GCS_DATA_SOURCE $GCS_DATA_TARGET duration=$SECONDS echo "Transferred or synchronized $GCS_DATA_SOURCE to $GCS_DATA_TARGET in $duration seconds." {{ if ne $root.Values.network.stack "tcp" }} - name: nccl-plugin-installer image: "{{ $root.Values.network.ncclRepository }}:{{ $root.Values.network.ncclVersion }}" imagePullPolicy: Always volumeMounts: - name: nccl-plugin-volume mountPath: /usr/local/nccl-plugin command: - /bin/sh - -c - | mkdir -p /var/lib/tcpxo ln -s /var/lib/tcpxo /var/lib/tcpx /scripts/container_entry.sh install --install-nccl # cp -r /var/lib/tcpxo/lib64/. /usr/local/nccl-plugin/lib64 cp -r /var/lib/tcpxo/* /usr/local/nccl-plugin/ echo "Installed NCCL plugin to pod-wide, shared NCCL plug-in volume" echo "Contents (mounted at /usr/local/nccl-plugin/lib64):" ls /usr/local/nccl-plugin/lib64 | sed 's/^/ /' echo "Contents (mounted at /usr/local/nccl-plugin/):" ls /usr/local/nccl-plugin/ | sed 's/^/ /' {{ end }} containers: # Either the tcpx or tcpxo receive daemon {{ if ne $root.Values.network.stack "tcp" }} - name: network-rx-daemon image: "{{ $root.Values.network.netRxDaemonRepository }}:{{ $root.Values.network.netRxDaemonVersion }}" imagePullPolicy: Always securityContext: privileged: true volumeMounts: - name: tcpx-daemon-socket mountPath: /tmp - name: workload-terminated-volume mountPath: /semaphore - name: nvidia-install-dir-host mountPath: "/usr/local/nvidia" - name: dmabuf mountPath: /dev/dmabuf_import_helper {{ if eq $root.Values.network.stack "tcpx" }} env: - name: LD_LIBRARY_PATH value: /usr/local/nvidia/lib64 command: - bash - -c - | /tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm --gpu_shmem_type fd --setup_param "--verbose 128 2 0" & while [ ! -e "/semaphore/workload_terminated" ]; do sleep 10; done pkill -e "^"tcpgpudmarxd || true sleep 15 {{ end }} {{ if eq $root.Values.network.stack "tcpxo" }} env: - name: LD_LIBRARY_PATH value: /usr/local/nvidia/lib64 command: - bash - -c - | /fts/entrypoint_rxdm_container.sh --num_hops 2 --num_nics 8 --uid= --alsologtostderr & while [ ! -e "/semaphore/workload_terminated" ]; do sleep 10; done pkill -e "^"entrypoint_rxdm_container.sh || true sleep 15 {{ end }} {{ end }} - name: megatron image: "{{ $root.Values.workload.image }}" imagePullPolicy: Always securityContext: privileged: true env: - name: JOB_IDENTIFIER value: "{{ .Release.Name }}-{{ $timestamp }}-{{ $jobSuffix }}" - name: SSD_MOUNT_PATH value: "{{ $root.Values.volumes.ssdMountPath }}" # The following settings are specific to the Torch distributed launcher: - name: GCS_FUSE_BUCKET value: "{{ $root.Values.workload.gcsBucketForDataCataPath }}" - name: TORCH_DISTRIBUTED_TARGET value: "{{ $root.Values.workload.torchDistributedTarget }}" - name: MASTER_ADDR value: "{{.Release.Name}}-0.{{.Release.Name}}.default.svc.cluster.local" - name: MASTER_PORT value: "6002" - name: WORLD_SIZE value: "{{ $root.Values.workload.gpus }}" - name: NNODES value: "{{ $nodes }}" - name: GPUS_PER_NODE value: "{{ $gpusPerNode }}" - name: GLOO_SOCKET_IFNAME value: "eth0" # The leader node can launch an embedded Tensorboard server (if needed) {{- if $root.Values.workload.embeddedTensorboardTarget }} - name: EMBEDDED_TENSORBOARD_TARGET value: "{{ $root.Values.workload.embeddedTensorboardTarget}}" {{- end }} # The following arguments are passed to the Workload: {{- range $environment_variable := $root.Values.workload.arguments }} - name: "WORKLOAD_{{ $environment_variable.name }}" value: "{{ $environment_variable.value }}" {{- end }} # The following is needed to prevent send-receive stalling execution - name: NVTE_FWD_LAYERNORM_SM_MARGIN value: "8" - name: NVTE_BWD_LAYERNORM_SM_MARGIN value: "8" {{ if ne $root.Values.network.stack "tcp" }} # The following TCPxo settings should likely not be adjusted: {{ if eq $root.Values.network.stack "tcpxo" }} - name: NCCL_BUFFSIZE value: "8388608" - name: NCCL_FASTRAK_CTRL_DEV value: "eth0" - name: NCCL_FASTRAK_IFNAME value: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8" - name: NCCL_FASTRAK_NUM_FLOWS value: "2" - name: NCCL_FASTRAK_NUM_FLOWS_PER_GROUP value: "1" - name: NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL value: "0" - name: NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING value: "0" - name: NCCL_FASTRAK_USE_SNAP value: "1" - name: NCCL_FASTRAK_USE_LLCM value: "1" # The following NCCL tuner settings should likely not be adjusted: - name: NCCL_TUNER_PLUGIN value: "libnccl-tuner.so" - name: NCCL_TUNER_CONFIG_PATH value: "/usr/local/nccl-plugin/lib64/a3plus_tuner_config.textproto" {{ end }} {{ if eq $root.Values.network.stack "tcpx" }} - name: NCCL_GPUDIRECTTCPX_CTRL_DEV value: "eth0" - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME value: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8" - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS value: "500000" {{ end }} # The following NCCL settings should likely not be adjusted: - name: NCCL_SOCKET_IFNAME value: "eth0" - name: NCCL_DYNAMIC_CHUNK_SIZE value: "524288" - name: NCCL_P2P_NET_CHUNKSIZE value: "524288" - name: NCCL_P2P_PCI_CHUNKSIZE value: "524288" - name: NCCL_P2P_NVL_CHUNKSIZE value: "1048576" - name: NCCL_CROSS_NIC value: "0" - name: NCCL_ALGO value: "Tree" - name: NCCL_PROTO value: "Simple" - name: NCCL_NET_GDR_LEVEL value: "PIX" - name: NCCL_P2P_PXN_LEVEL value: "0" - name: NCCL_NVLS_ENABLE value: "0" {{- range $environment_variable := $root.Values.network.ncclSettings }} - name: {{ $environment_variable.name }} value: "{{ $environment_variable.value }}" {{- end }} {{ end }} command: - bash - -c - | function on_script_completion { # Note: This semaphore is used to terminate the TCPx side-car touch /semaphore/workload_terminated } trap on_script_completion EXIT echo "Pod on $(hostname --fqdn) is running" echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" echo "Job ID is $JOB_IDENTIFIER" echo "Running nvidia-smi" nvidia-smi mkdir -p /gcs gcsfuse --client-protocol http2 $GCS_FUSE_BUCKET /gcs export LD_LIBRARY_PATH="/usr/local/nccl-plugin/lib64:/usr/local/cuda-12.3/lib64:/usr/local/nvidia/lib64/:${LD_LIBRARY_PATH}" echo "Warning: Set LD_LIBRARY_PATH=$LD_LIBRARY_PATH to override the NCCL library" ldconfig /usr/local/nvidia/lib64/ echo "Added /usr/local/nvidia/lib64/ to ldconfig:" ldconfig -p | grep libcuda | sed 's/^/ /' echo "Contents of /usr/local/nccl-plugin/lib64:" ls /usr/local/nccl-plugin/lib64 | sed 's/^/ /' touch $SSD_MOUNT_PATH/hello-from-$HOSTNAME.txt echo "Local SSD contents (path $SSD_MOUNT_PATH):"; ls $SSD_MOUNT_PATH | sed 's/^/ /' echo "Megatron source configuration file:" cat /etc/workload-configuration/megatron-configuration.sh | sed 's/^/| /' echo "" echo "Megatron arguments:" source /etc/workload-configuration/megatron-configuration.sh for megatron_argument in "${megatron_arguments[@]}"; do if [[ "${megatron_argument:0:2}" == "--" ]]; then echo ""; echo -n " " fi echo -n "$megatron_argument " done echo "" readarray -d "" workload_arguments < <(env | grep -e "^WORKLOAD_" | sed 's/^WORKLOAD_/--/' | tr '\n' '\0') echo "Detected the following additional workload arguments:" for index in "${!workload_arguments[@]}"; do workload_argument=${workload_arguments[$index]} key_value=(${workload_argument//=/ }) if [[ ${key_value[1]} == "" ]]; then workload_arguments[$index]="${key_value[0]}" fi echo " ${workload_arguments[$index]}" done sleep 10 # <- Hack to allow some time for service to boot mount /tmp -o remount,exec chmod -R a+rwx /tmp echo "Checking for presence of nsys:" which nsys echo "Nsight profiling will go to /gcs/megatron-experiments/$JOB_IDENTIFIER/." mkdir -p /gcs/megatron-experiments/$JOB_IDENTIFIER/ export NODE_RANK=$JOB_COMPLETION_INDEX echo "Launching Torch distributed as node rank $NODE_RANK out of $NNODES nodes" nsys_pids_to_wait_on=() # <- Ensure we let these complete before exiting for ((LOCAL_RANK=0; LOCAL_RANK <= $((GPUS_PER_NODE - 1)); LOCAL_RANK++)); do RANK=$((8*$NODE_RANK + $LOCAL_RANK)) OMP_NUM_THREADS=12 RANK=$RANK LOCAL_RANK=$LOCAL_RANK \ nsys profile -s none -t nvtx,cuda \ --capture-range=cudaProfilerApi \ --capture-range-end=stop \ -o /gcs/megatron-experiments/$JOB_IDENTIFIER/rank-$RANK \ python $TORCH_DISTRIBUTED_TARGET \ ${megatron_arguments[@]} \ ${workload_arguments[@]} & # Keep the pid of the nsys process in array nsys_pids_to_wait_on+=($!) TORCH_PIDS[$LOCAL_RANK]=$! echo "Launched rank $LOCAL_RANK as PID $!" done if [ "$NODE_RANK" -eq "0" ]; then nvidia-smi -l 60 & # Get job id (for reference not to wait on) nvidia_smi_pid=$! echo "PID of nvidia-smi: $nvidia_smi_pid" fi if [ "$NODE_RANK" -eq "0" ] && { ! [ -z ${EMBEDDED_TENSORBOARD_TARGET} ]; }; then echo "Launching an embedded Tensorboard against log directory $EMBEDDED_TENSORBOARD_TARGET" tensorboard --logdir $EMBEDDED_TENSORBOARD_TARGET & wait $! # <-- This will wait on Tensorboard (if it exists) fi # Let nsys jobs complete echo "Waiting for nsys jobs to complete" wait $nsys_pids_to_wait_on echo "Pod on $(hostname --fqdn) is exiting" volumeMounts: - name: nvidia-install-dir-host mountPath: /usr/local/nvidia - name: nccl-plugin-volume mountPath: /usr/local/nccl-plugin - name: dmabuf mountPath: /dev/dmabuf_import_helper {{ if ne $root.Values.network.stack "tcp" }} - name: tcpx-daemon-socket mountPath: /tmp {{ end }} - name: workload-terminated-volume mountPath: /semaphore - name: workload-configuration mountPath: /etc/workload-configuration - name: shared-memory mountPath: /dev/shm - name: local-ssd mountPath: "{{ $root.Values.volumes.ssdMountPath }}" {{- range $pvc := $root.Values.volumes.pvcMounts }} - name: "{{ $pvc.name }}" mountPath: "{{ $pvc.mountPath }}" {{- end }} {{- range $gcs := $root.Values.volumes.gcsMounts }} - name: "{{ $gcs.bucketName }}" mountPath: "{{ $gcs.mountPath }}" {{- end }} resources: limits: nvidia.com/gpu: {{ $gpusPerNode }} ---