# launcher_scripts/custom_script/config_k8s.yaml (41 lines of code) (raw):

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Launcher configuration for running a custom training/fine-tuning script on
# a Kubernetes cluster. Values written as `???` are mandatory and must be
# supplied by the user (config file override or command line).

defaults:
  - override hydra/job_logging: stdout

hydra:
  run:
    dir: .
  output_subdir: null

# Optional git source for the entry script.
git:
  repo_url_or_path: null
  branch: null
  commit: null
  token: null

training_cfg:
  # Path to the entry script of training/fine-tuning. This path should be
  # inside the container or a relative path in the git repo.
  entry_script: ???
  # Arguments passed to the entry script, one single-key mapping per argument.
  script_args:
    - "--some_args": "debug"
    - "--some_other_args": 1
  run:
    name: test_custom  # Current run name
    nodes: 8  # Number of nodes to use for current training
    ntasks_per_node: 8  # Number of devices to use per node

cluster:
  # Example k8s cluster
  cluster_type: k8s
  instance_type: ???
  cluster_config:
    namespace: default  # The namespace to submit the job to

    # Create customized labels for the PytorchJob and Pods deployed jobs.
    # Example:
    #   custom_labels:
    #     label-key-1: label-value-1
    #     label-key-2: label-value-2
    custom_labels: null

    # Create customized annotations for the jobs.
    # Example:
    #   annotations:
    #     annotation-key-1: annotation-value-1
    #     annotation-key-2: annotation-value-2
    annotations: null

    # Add a service account to the job pods.
    # Example:
    #   service_account_name: service_account
    service_account_name: null

    # priorityClassName for the Kueue scheduler to decide job priority.
    priority_class_name: null

    # Create a k8s NodeAffinity to select the nodes to deploy jobs on, matching
    # required and preferred labels.
    # Structure:
    #   label_selector:
    #     required: <required label key-values pair>
    #     preferred: <preferred label key-values pair>
    #     weights: <weights list used by preferred labels to get nodes priority>
    # Example:
    #   label_selector:
    #     required:
    #       example-label-key:
    #         - expected-label-value-1
    #         - expected-label-value-2
    #     preferred:
    #       preferred-label-key:
    #         - preferred-label-value-1
    #         - preferred-label-value-2
    #     weights:
    #       - 100
    label_selector: null

    # Persistent volume, usually used to mount FSx.
    # Example:
    #   persistent_volume_claims:
    #     - claimName: null
    #       mountPath: null
    #     - claimName: null
    #       mountPath: null
    persistent_volume_claims: null

    # Temp volume, usually used to mount a temp directory.
    # Example:
    #   volumes:
    #     - volumeName: data1
    #       hostPath: "/data"
    #       mountPath: "/data"
    volumes: null

    # Policy to pull the container; can be Always, IfNotPresent or Never.
    pullPolicy: Always
    restartPolicy: Never  # Restart policy
    # The clean up policy after the job completes or fails.
    cleanPodPolicy: null

base_results_dir: ???  # Location to store the results, checkpoints and logs.

# List of additional paths to mount to the container. They will be mounted to
# the same path.
container_mounts:
  - null

container: ???  # Container to use

env_vars:
  # Logging level for NCCL. Set to "INFO" for debug information.
  # NOTE(review): NCCL documents VERSION, WARN, INFO and TRACE as levels;
  # confirm that DEBUG is the intended value here.
  NCCL_DEBUG: DEBUG