# Source: launcher/nemo/k8s_templates/training/training.yaml (178 lines of code, raw)
{{ $config := .Values.trainingConfig }}
# Kubeflow PyTorchJob rendered from .Values.trainingConfig and .Values.image.
#
# Conventions used in this template:
#   * Templated string scalars are piped through `quote` so that values which
#     look like numbers, booleans or null are still emitted as YAML strings.
#   * Integer fields (replicas, device counts, affinity weights) are left
#     unquoted on purpose.
#   * labelSelector is optional: it falls back to an empty dict so a values
#     file without it renders instead of failing on a nil lookup.
{{- $labelSelector := $config.labelSelector | default dict }}
{{- $device := $config.device }}
{{- $efaCount := int $config.numEFADevices }}
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: {{ $config.jobName | quote }}
  namespace: {{ $config.namespace | quote }}
  {{- if $config.annotations }}
  annotations:
    {{- range $key, $value := $config.annotations }}
    {{ $key | quote }}: {{ $value | quote }}
    {{- end }}
  {{- end }}
  labels:
    app: {{ $config.jobName | quote }}
    # Optional user labels; ranging over an unset/empty map emits nothing.
    {{- range $key, $value := $config.customLabels }}
    {{ $key | quote }}: {{ $value | quote }}
    {{- end }}
spec:
  {{- if $config.cleanPodPolicy }}
  runPolicy:
    cleanPodPolicy: {{ $config.cleanPodPolicy | quote }}
  {{- end }}
  pytorchReplicaSpecs:
    Worker:
      replicas: {{ $config.nodes }}
      restartPolicy: {{ $config.restartPolicy | quote }}
      template:
        {{- if $config.customLabels }}
        metadata:
          labels:
            {{- range $key, $value := $config.customLabels }}
            {{ $key | quote }}: {{ $value | quote }}
            {{- end }}
        {{- end }}
        spec:
          {{- if $config.priorityClassName }}
          priorityClassName: {{ $config.priorityClassName | quote }}
          {{- end }}
          {{- if $config.serviceAccountName }}
          serviceAccountName: {{ $config.serviceAccountName | quote }}
          {{- end }}
          containers:
            - name: pytorch
              image: {{ .Values.image.trainingImage | quote }}
              env:
                {{- range $key, $value := $config.envVars }}
                - name: {{ $key | quote }}
                  value: {{ $value | quote }}
                {{- end }}
              command:
                - /etc/config/train-script.sh
              imagePullPolicy: {{ .Values.image.pullPolicy | quote }}
              securityContext:
                capabilities:
                  # NOTE(review): presumably required so the training process
                  # can lock memory (e.g. for EFA/RDMA buffers) - confirm.
                  add:
                    - "IPC_LOCK"
              # The resources stanza is only emitted when at least one device
              # type is requested; requests and limits carry the same values.
              {{- if or (eq $device "gpu") (eq $device "trainium") (gt $efaCount 0) }}
              resources:
                requests:
                  {{- if eq $device "gpu" }}
                  nvidia.com/gpu: {{ $config.ntasksPerNode }}
                  {{- end }}
                  {{- if eq $device "trainium" }}
                  aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }}
                  {{- end }}
                  {{- if gt $efaCount 0 }}
                  vpc.amazonaws.com/efa: {{ $config.numEFADevices }}
                  {{- end }}
                limits:
                  {{- if eq $device "gpu" }}
                  nvidia.com/gpu: {{ $config.ntasksPerNode }}
                  {{- end }}
                  {{- if eq $device "trainium" }}
                  aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }}
                  {{- end }}
                  {{- if gt $efaCount 0 }}
                  vpc.amazonaws.com/efa: {{ $config.numEFADevices }}
                  {{- end }}
              {{- end }}
              volumeMounts:
                # One mount per PVC; the volume name matches the entry
                # generated in the pod-level volumes list.
                {{- range $config.persistentVolumeClaims }}
                - mountPath: {{ .mountPath | quote }}
                  name: {{ .claimName }}-volume
                {{- end }}
                # One mount per user-supplied hostPath volume.
                {{- range $config.volumes }}
                - name: {{ .volumeName | quote }}
                  mountPath: {{ .mountPath | quote }}
                {{- end }}
                # The training-config ConfigMap is mounted only when no
                # custom script is configured.
                {{- if not $config.customScript }}
                - mountPath: /config
                  name: training-config
                {{- end }}
                - mountPath: /etc/config
                  name: train-script
                - mountPath: /dev/shm
                  name: shm
                - mountPath: /var/log/aws/clusters
                  name: aws-clusters-logs
                  readOnly: true
          {{- if or $labelSelector.required $labelSelector.preferred }}
          affinity:
            nodeAffinity:
              {{- if $labelSelector.required }}
              # All required label keys are combined into a single
              # nodeSelectorTerm, so a node must satisfy every expression.
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      {{- range $key, $values := $labelSelector.required }}
                      - key: {{ $key | quote }}
                        operator: In
                        values:
                          {{- range $values }}
                          - {{ . | quote }}
                          {{- end }}
                      {{- end }}
              {{- end }}
              {{- if $labelSelector.preferred }}
              {{- $weightIdx := 0 }}
              # Each preferred key becomes its own weighted term. Weights are
              # looked up by position in labelSelector.weights, following the
              # order in which range visits the preferred map (sorted by key),
              # so weights needs at least one entry per preferred key.
              preferredDuringSchedulingIgnoredDuringExecution:
                {{- range $key, $values := $labelSelector.preferred }}
                - weight: {{ index $labelSelector.weights $weightIdx }}
                  preference:
                    matchExpressions:
                      - key: {{ $key | quote }}
                        operator: In
                        values:
                          {{- range $values }}
                          - {{ . | quote }}
                          {{- end }}
                {{- $weightIdx = add $weightIdx 1 }}
                {{- end }}
              {{- end }}
          {{- end }}
          volumes:
            {{- range $config.persistentVolumeClaims }}
            - name: {{ .claimName }}-volume
              persistentVolumeClaim:
                claimName: {{ .claimName | quote }}
            {{- end }}
            {{- range $config.volumes }}
            - name: {{ .volumeName | quote }}
              hostPath:
                path: {{ .hostPath | quote }}
                type: Directory
            {{- end }}
            {{- if not $config.customScript }}
            - name: training-config
              configMap:
                name: training-config-{{ $config.jobName }}
            {{- end }}
            - name: shm
              hostPath:
                path: /dev/shm
                type: Directory
            - name: aws-clusters-logs
              hostPath:
                path: /var/log/aws/clusters
                type: DirectoryOrCreate
            - name: train-script
              configMap:
                # File modes are decimal here: 420 == 0644 octal.
                defaultMode: 420
                items:
                  - key: train-script.sh
                    # 365 == 0555 octal, so the mounted script is executable.
                    mode: 365
                    path: train-script.sh
                # The script ConfigMap name depends on the target device.
                {{- if eq $device "trainium" }}
                name: train-script-trn-{{ $config.jobName }}
                {{- else }}
                name: train-script-gpu-{{ $config.jobName }}
                {{- end }}