# recipes_collection/cluster/k8s.yaml
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
# Policy used to pull the job container image.
# One of: Always, IfNotPresent, Never.
pullPolicy: Always
# Restart policy for the job.
restartPolicy: Never
# Namespace the job is submitted to.
namespace: default
# Custom labels to add to the PyTorchJob and to the pods deployed for the job.
# Given as a mapping of label key to label value; null configures no labels here.
# Example:
#   custom_labels:
#     label-key-1: label-value-1
#     label-key-2: label-value-2
custom_labels: null
# Custom annotations to add to the jobs.
# Given as a mapping of annotation key to annotation value; null configures
# no annotations here.
# Example:
#   annotations:
#     annotation-key-1: annotation-value-1
#     annotation-key-2: annotation-value-2
annotations: null
# Service account to add to the job pods.
# NOTE(review): presumably rendered into the pod spec's `serviceAccountName`
# field — confirm against the launcher templates.
# Example:
#   service_account_name: service_account
service_account_name: null
# Priority class name (priorityClassName) used by the Kueue scheduler to
# decide the job's priority.
priority_class_name: null
# Temporary volumes, usually used to mount a temp directory.
# Given as a list; each entry sets volumeName, hostPath and mountPath.
# Example:
#   volumes:
#     - volumeName: data1
#       hostPath: "/data"
#       mountPath: "/data"
volumes: null
# Persistent volume claims, usually used to mount FSx.
# Given as a list; each entry sets claimName and mountPath.
# Each claim should be created before running the job.
# Structure:
#   persistent_volume_claims:
#     - claimName: <claim name>
#       mountPath: <mount path>
#     - claimName: <claim name>
#       mountPath: <mount path>
# Example:
#   persistent_volume_claims:
#     - claimName: fsx-claim
#       mountPath: data
# The default below is a list holding a single null placeholder entry.
# NOTE(review): replace the null entry with real claims as shown above; how
# the launcher treats the null entry is not visible here — confirm.
persistent_volume_claims:
- null
# Creates a k8s NodeAffinity so that jobs are deployed on nodes whose labels
# match the required and preferred labels given here.
# Structure:
#   label_selector:
#     required: <required label key-values pairs>
#     preferred: <preferred label key-values pairs>
#     weights: <list of weights used by the preferred labels to rank nodes>
# Example:
#   label_selector:
#     required:
#       example-label-key:
#         - expected-label-value-1
#         - expected-label-value-2
#     preferred:
#       preferred-label-key:
#         - preferred-label-value-1
#         - preferred-label-value-2
#     weights:
#       - 100
label_selector: null
# Pod clean-up policy applied after the job completes or fails.
# NOTE(review): presumably forwarded to the PyTorchJob run policy's
# `cleanPodPolicy` field — confirm the accepted values against the Kubeflow
# Training Operator documentation before setting this.
cleanPodPolicy: null