{{/* launcher/nemo/k8s_templates/training/train-script-gpu.yaml — ConfigMap holding the GPU training launch script */}}
{{ $config := .Values.trainingConfig }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: "train-script-gpu-{{ $config.jobName }}"
data:
  # Rendered into the pod as the entrypoint script for GPU training jobs.
  train-script.sh: |
    #!/bin/bash
    set -ex
    {{- if $config.git.repo_url_or_path }}
    # Clone the training repo into a per-pod scratch dir ($HOSTNAME-keyed) so
    # pods sharing a volume do not clobber each other's checkouts.
    mkdir -p $HOME/tmp
    GIT_CLONE_DIR=$HOME/tmp/$HOSTNAME
    [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR
    git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR
    # Trailing slash lets ${GIT_CLONE_DIR}<scriptPath> concatenate cleanly below.
    GIT_CLONE_DIR=${GIT_CLONE_DIR}/
    cd $GIT_CLONE_DIR
    rm -rf __pycache__
    {{- if $config.git.branch }}
    git checkout {{ $config.git.branch }}
    {{- end }}
    {{- if $config.git.commit }}
    git fetch origin {{ $config.git.commit }}
    git reset --hard {{ $config.git.commit }}
    {{- end }}
    {{- if $config.git.update_adapter }}
    pip install . --force-reinstall --no-deps
    {{- end }}
    {{- else }}
    # No repo configured: scriptPath is used as-is (empty prefix).
    GIT_CLONE_DIR=""
    {{- end }}
    {{- range $config.pre_script }}
    {{ . }}
    {{- end }}
    {{- if gt (int $config.nodes) 1 }}
    # Multi-node: rendezvous via c10d on worker 0 of the job.
    export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --rdzv_backend=c10d --rdzv_endpoint={{ $config.jobName }}-worker-0"
    {{- else }}
    export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}"
    {{- end }}
    echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS"
    # scriptArgs is appended inline; no trailing "\" so an absent scriptArgs can
    # never line-continue the torchrun command into the next script statement.
    torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }}{{ if $config.scriptArgs }} {{ $config.scriptArgs }}{{ end }}
    {{- range $config.post_script }}
    {{ . }}
    {{- end }}
    {{- if $config.git.repo_url_or_path }}
    # Clean up the per-pod checkout created above.
    cd $HOME
    rm -rf $GIT_CLONE_DIR
    {{- end }}