# launcher_scripts/custom_script/config_slurm.yaml
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
# Hydra defaults list: override Hydra's job logging group with `stdout`.
defaults:
  - override hydra/job_logging: stdout
# Hydra runtime settings. The nested keys must be indented under `hydra`;
# otherwise `run`, `dir` and `output_subdir` become unrelated top-level keys.
hydra:
  run:
    # hydra.run.dir: run in the current directory.
    dir: .
  # hydra.output_subdir: null (no output subdirectory configured).
  output_subdir: null
# Git repository settings. Every field is unset (null) in this template.
# Avoid committing a real access token to version control.
git:
  repo_url_or_path: null
  branch: null
  commit: null
  token: null
# Settings for the custom training/fine-tuning job.
training_cfg:
  # Path to the entry script of training/fine-tuning. This must be a path
  # inside the container or a path relative to the git repo.
  entry_script: null
  # Optional arguments passed to the entry script (uncomment to use):
  # script_args:
  #   - "--some_args": "debug"
  #   - "--some_other_args": 1
  # NOTE(review): `run` is placed under `training_cfg` so it does not collide
  # with any other `run` key in this file — confirm the nesting the launcher expects.
  run:
    name: test_custom  # Current run name
    nodes: 2  # Number of nodes to use for the current training
    ntasks_per_node: 8  # Number of devices to use per node
# Example Slurm cluster definition.
cluster:
  cluster_type: slurm
  instance_type: p5.48xlarge
  cluster_config:
    exclusive: true
    job_name_prefix: testcustom_slurm_
    # Set to true to only create the submission file.
    slurm_create_submission_file_only: false
    # Left empty (parsed as null) until an entry below is uncommented.
    srun_args:
      # - "--no-container-mount-home"
# Location to store the results, checkpoints and logs.
base_results_dir: null
# Additional paths to mount into the container; each one is mounted at the
# same path.
container_mounts:
  - null
# Container to use.
container: null
# Docker settings; only used with docker on Slurm.
slurm_docker_cfg:
  # Left empty (parsed as null) until an entry below is uncommented.
  docker_args:
    # - "--runtime=nvidia"  # required if the docker runtime version is low
  # Commands run with bash after the docker container is launched.
  post_launch_commands:
# Environment variables for the job. Entries must be indented under
# `env_vars`; otherwise they become unrelated top-level keys.
env_vars:
  # Logging level for NCCL. Set to "INFO" for debug information.
  # NOTE(review): NCCL documents VERSION, WARN, INFO and TRACE as NCCL_DEBUG
  # values; DEBUG is not among them — confirm this value is intended.
  NCCL_DEBUG: DEBUG