# launcher/nemo/k8s_templates/training/values.yaml (36 lines of code) (raw)

# Default values for the training job chart.
# Keys set to null (or the string "nil") are unset defaults.

image:
  # training image
  # NOTE(review): "cfg.container" looks like a placeholder that the launcher
  # substitutes before deployment -- confirm against the launcher code.
  trainingImage: cfg.container

  # image pulling policy
  pullPolicy: IfNotPresent

trainingConfig:
  # current job name
  jobName: "nil"

  # namespace to launch the job in
  namespace: "default"

  # script path
  scriptPath: null

  # script args
  scriptArgs: null

  # specify whether to use custom scripts
  customScript: null

  # list of custom annotations to apply to jobs
  annotations: null

  # list of custom labels to apply to jobs and pods
  customLabels: null

  # Kueue scheduler priority class name
  priority_class_name: null

  # device type, can be "gpu", "trainium" or "nil"; "nil" means cpu
  device: "nil"

  # number of EFA devices, if the instance type supports EFA
  numEFADevices: 0

  # number of Neuron devices, if the job is for Trainium
  numNeuronDevices: null

  # number of processes per node
  ntasksPerNode: 0

  # number of nodes to run on
  # NOTE(review): "training.trainer.num_nodes" looks like a config-path
  # placeholder resolved by the launcher -- confirm against the launcher code.
  nodes: training.trainer.num_nodes

  # restart policy
  restartPolicy: Never

  # from NeMo, not used currently
  wandbKey: "nil"

  # name of the service account associated with the namespace
  serviceAccountName: null

  # relevant for Trainium chips, either 0 or 1
  compile: 0

  # persistent volume, usually used to mount FSx
  persistentVolumeClaims: null

  # temp volume, usually used to mount temp files from the host
  volumes: null

  # A git repository, if the user wants to use a script inside it
  git:
    repo_url_or_path: null
    branch: null
    commit: null
    token: null
    update_adapter: null

  # Commands to run before training
  pre_script: []

  # Commands to run after training
  post_script: []

  # select preferred and required labels for nodes
  labelSelector:
    required: null  # select nodes with required labels
    preferred: null  # select nodes with priority which have preferred labels
    weights: null  # list of weights for the preferred labels

  # The clean-up policy after the job completes or fails.
  cleanPodPolicy: null