launcher/nemo/k8s_templates/training/train-script-gpu.yaml (49 lines of code) (raw):

# ConfigMap that carries the entry-point shell script (train-script.sh) for a
# GPU training job. Every templated value is read from .Values.trainingConfig.
{{ $config := .Values.trainingConfig }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: train-script-gpu-{{ $config.jobName }}
data:
  train-script.sh: |
    #!/bin/bash
    set -ex

    {{- if $config.git.repo_url_or_path }}
    # Clone the training repository into a per-host directory under $HOME/tmp,
    # discarding any leftover clone from a previous run first.
    mkdir -p "$HOME/tmp"
    GIT_CLONE_DIR="$HOME/tmp/$HOSTNAME"
    rm -rf "$GIT_CLONE_DIR"
    git clone {{ $config.git.repo_url_or_path }} "$GIT_CLONE_DIR"
    # Keep a trailing slash so the variable can be prefixed directly to the
    # script path on the torchrun command line below.
    GIT_CLONE_DIR="${GIT_CLONE_DIR}/"
    cd "$GIT_CLONE_DIR"
    rm -rf __pycache__

    {{- if $config.git.branch }}
    git checkout {{ $config.git.branch }}
    {{- end }}

    {{- if $config.git.commit }}
    git fetch origin {{ $config.git.commit }}
    git reset --hard {{ $config.git.commit }}
    {{- end }}

    {{- if $config.git.update_adapter }}
    pip install . --force-reinstall --no-deps
    {{- end }}
    {{- else }}
    # No repository configured: the script path is used as given.
    GIT_CLONE_DIR=""
    {{- end }}

    {{- range $config.pre_script }}
    {{ . }}
    {{- end }}

    {{- if gt (int $config.nodes) 1 }}
    # Multi-node run: rendezvous through the c10d backend at <jobName>-worker-0.
    export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --rdzv_backend=c10d --rdzv_endpoint={{ $config.jobName }}-worker-0"
    {{- else }}
    export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}"
    {{- end }}

    echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS"
    # DISTRIBUTED_ARGS is intentionally unquoted so it splits into separate
    # torchrun options. Optional script arguments are appended to this same
    # rendered line by the conditional below; no shell line continuation is
    # used, so an empty scriptArgs cannot swallow the command that follows.
    torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }}
    {{- if $config.scriptArgs }} {{ $config.scriptArgs }}{{- end }}

    {{- range $config.post_script }}
    {{ . }}
    {{- end }}

    {{- if $config.git.repo_url_or_path }}
    # Leave the clone directory before deleting it.
    cd "$HOME"
    rm -rf "$GIT_CLONE_DIR"
    {{- end }}