launcher/nemo/k8s_templates/training/train-script-trn.yaml
{{ $config := .Values.trainingConfig }}
apiVersion: v1
kind: ConfigMap
metadata:
name: train-script-trn-{{ $config.jobName }}
data:
train-script.sh: |
#!/usr/bin/env bash
set -o pipefail
set -ex
{{- if $config.git.repo_url_or_path }}
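# Clone the training repo into a per-host scratch dir (keyed by $HOSTNAME) so each worker gets its own clean checkout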
mkdir -p $HOME/tmp
GIT_CLONE_DIR=$HOME/tmp/$HOSTNAME
[[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR
git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR
GIT_CLONE_DIR=${GIT_CLONE_DIR}/
cd $GIT_CLONE_DIR
rm -rf __pycache__
{{- if $config.git.branch }}
git checkout {{ $config.git.branch }}
{{- end }}
{{- if $config.git.commit }}
git fetch origin {{ $config.git.commit }}
git reset --hard {{ $config.git.commit }}
{{- end }}
{{- if $config.git.update_adapter }}
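# update_adapter: reinstall the checked-out package so the run picks up its code changes (dependencies are left untouched)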
pip install . --force-reinstall --no-deps
{{- end }}
{{- else }}
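# No repo configured: leave GIT_CLONE_DIR empty so scriptPath below is used as given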
GIT_CLONE_DIR=""
{{- end }}
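# User-supplied pre-run commands from the Launcher config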
{{- range $config.pre_script }}
{{ . }}
{{- end }}
{{- if gt (int $config.nodes) 1 }}
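# Multi-node job: derive this worker's rank from its pod hostname ("{{ $config.jobName }}-worker-<rank>") and use worker 0 as the rendezvous master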
hostname=$(hostname)
prefix="{{ $config.jobName }}-worker-"
echo "prefix is $prefix"
node_id=${hostname#"$prefix"}
export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --node_rank $node_id --master_addr={{ $config.jobName }}-worker-0 --master_port 41000"
{{- else }}
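# Single-node job: torchrun only needs the per-node process count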
export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}"
{{- end }}
{{- if $config.customScript }}
# Custom script provided
echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS"
torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }}
{{- if $config.scriptArgs -}} \
{{ $config.scriptArgs }}
{{- end }}
{{- else }}
# Recipe provided
# Adapted from NeuronxDistributedTraining's train_setup.sh (https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train_setup.sh)
# and train.sh (https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train.sh)
ulimit -n 65535
# The `sudo` prefixes used in the reference scripts are removed in this block since they lead to permission errors
sysctl -w net.ipv4.ip_local_reserved_ports=41000
if which lctl >/dev/null 2>&1; then
lctl set_param 'osc.*.max_dirty_mb=64' # Cap the max space each connection to FSx reserves so we avoid OOMs
fi
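# Libfabric/EFA settings for inter-node communication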
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_EFA_FORK_SAFE=1
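# Framework knobs: XLA functionalization, full Hydra error traces, glibc malloc arenas, and the training TensorBoard logger / checkpoint callback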
export XLA_DISABLE_FUNCTIONALIZATION=0
export HYDRA_FULL_ERROR=1
export MALLOC_ARENA_MAX=128
export CREATE_TB_LOGGER=True
export CHECKPOINT_CALLBACK=True
# Place cache on shared storage to reduce redundant compilations
export NEURON_COMPILE_CACHE_URL="/{{ (index $config.persistentVolumeClaims 0).mountPath }}/neuron_cache"
mkdir -p $NEURON_COMPILE_CACHE_URL
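# compile=1 wraps the launch in neuron_parallel_compile to pre-populate the Neuron compile cache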
{{- if eq (int $config.compile) 1 }}
MAYBE_COMPILE="neuron_parallel_compile"
{{- end }}
echo "env MAYBE_COMPILE=$MAYBE_COMPILE"
echo "env DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS"
# End of block
{{- if $config.git.repo_url_or_path }}
# copy the training config generated by the Launcher into the cloned repo's examples/conf directory
cp -f /config/config.yaml examples/conf/launcher_config.yaml
# cd into the directory containing training_orchestrator.py and the conf/ directory
cd examples
{{- end }}
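# Launch the NeuronxDistributedTraining recipe: Hydra reads launcher_config.yaml from conf/, with device and node counts overridden from the Launcher values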
$MAYBE_COMPILE torchrun $DISTRIBUTED_ARGS training_orchestrator.py \
--config-path=conf \
--config-name=launcher_config \
trainer.devices={{ $config.ntasksPerNode | default 32 }} \
trainer.num_nodes={{ $config.nodes }}
# return to top-level directory
cd ..
{{- end }}
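# User-supplied post-run commands from the Launcher config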
{{- range $config.post_script }}
{{ . }}
{{- end }}
{{- if $config.git.repo_url_or_path }}
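# Clean up the per-host checkout and the Neuron compile cache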
cd $HOME
rm -rf $GIT_CLONE_DIR
rm -rf $NEURON_COMPILE_CACHE_URL
{{- end }}