launcher/nemo/k8s_templates/training/train-script-trn.yaml
{{ $config := .Values.trainingConfig }}
apiVersion: v1
kind: ConfigMap
metadata:
name: train-script-trn-{{ $config.jobName }}
data:
train-script.sh: |
#!/usr/bin/env bash
set -o pipefail
set -ex
{{- if $config.git.repo_url_or_path }}
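# Clone the training repo into a per-host scratch dir (keyed by $HOSTNAME) so each worker gets its own clean checkout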
mkdir -p $HOME/tmp
GIT_CLONE_DIR=$HOME/tmp/$HOSTNAME
[[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR
git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR
GIT_CLONE_DIR=${GIT_CLONE_DIR}/
cd $GIT_CLONE_DIR
rm -rf __pycache__
{{- if $config.git.branch }}
git checkout {{ $config.git.branch }}
{{- end }}
{{- if $config.git.commit }}
git fetch origin {{ $config.git.commit }}
git reset --hard {{ $config.git.commit }}
{{- end }}
{{- if $config.git.update_adapter }}
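# update_adapter: reinstall the checked-out package so the run picks up its code changes (dependencies are left untouched)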
pip install . --force-reinstall --no-deps
{{- end }}
{{- else }}
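# No repo configured: leave GIT_CLONE_DIR empty so scriptPath below is used as given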
GIT_CLONE_DIR=""
{{- end }}
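# User-supplied pre-run commands from the Launcher config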
{{- range $config.pre_script }}
{{ . }}
{{- end }}
{{- if gt (int $config.nodes) 1 }}
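# Multi-node job: derive this worker's rank from its pod hostname ("{{ $config.jobName }}-worker-<rank>") and use worker 0 as the rendezvous master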
hostname=$(hostname)
prefix="{{ $config.jobName }}-worker-"
echo "prefix is $prefix"
node_id=${hostname#"$prefix"}
export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --node_rank $node_id --master_addr={{ $config.jobName }}-worker-0 --master_port 41000"
{{- else }}
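# Single-node job: torchrun only needs the per-node process count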
export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}"
{{- end }}
{{- if $config.customScript }}
# Custom script provided
echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS"
torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }}
{{- if $config.scriptArgs -}} \
{{ $config.scriptArgs }}
{{- end }}
{{- else }}
# Recipe provided
# Adapted from NeuronxDistributedTraining's train_setup.sh (https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train_setup.sh)
# and train.sh (https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train.sh)
ulimit -n 65535
# The `sudo` prefixes used in the reference scripts are removed in this block since they lead to permission errors
sysctl -w net.ipv4.ip_local_reserved_ports=41000
if which lctl >/dev/null 2>&1; then
lctl set_param 'osc.*.max_dirty_mb=64' # Cap the max space each connection to FSx reserves so we avoid OOMs
fi
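# Libfabric/EFA settings for inter-node communication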
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_EFA_FORK_SAFE=1
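# Framework knobs: XLA functionalization, full Hydra error traces, glibc malloc arenas, and the training TensorBoard logger / checkpoint callback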
export XLA_DISABLE_FUNCTIONALIZATION=0
export HYDRA_FULL_ERROR=1
export MALLOC_ARENA_MAX=128
export CREATE_TB_LOGGER=True
export CHECKPOINT_CALLBACK=True
# Place cache on shared storage to reduce redundant compilations
export NEURON_COMPILE_CACHE_URL="/{{ (index $config.persistentVolumeClaims 0).mountPath }}/neuron_cache"
mkdir -p $NEURON_COMPILE_CACHE_URL
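# compile=1 wraps the launch in neuron_parallel_compile to pre-populate the Neuron compile cache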
{{- if eq (int $config.compile) 1 }}
MAYBE_COMPILE="neuron_parallel_compile"
{{- end }}
echo "env MAYBE_COMPILE=$MAYBE_COMPILE"
echo "env DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS"
# End of block
{{- if $config.git.repo_url_or_path }}
# copy the training config generated by the Launcher into the cloned repo's examples/conf directory
cp -f /config/config.yaml examples/conf/launcher_config.yaml
# cd into the directory containing training_orchestrator.py and the conf/ directory
cd examples
{{- end }}
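# Launch the NeuronxDistributedTraining recipe: Hydra reads launcher_config.yaml from conf/, with device and node counts overridden from the Launcher values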
$MAYBE_COMPILE torchrun $DISTRIBUTED_ARGS training_orchestrator.py \
--config-path=conf \
--config-name=launcher_config \
trainer.devices={{ $config.ntasksPerNode | default 32 }} \
trainer.num_nodes={{ $config.nodes }}
# return to top-level directory
cd ..
{{- end }}
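# User-supplied post-run commands from the Launcher config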
{{- range $config.post_script }}
{{ . }}
{{- end }}
{{- if $config.git.repo_url_or_path }}
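# Clean up the per-host checkout and the Neuron compile cache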
cd $HOME
rm -rf $GIT_CLONE_DIR
rm -rf $NEURON_COMPILE_CACHE_URL
{{- end }}