# recipes_collection/config.yaml
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
# Hydra defaults list: composition order matters — `_self_` first means the
# selected cluster/recipes configs override values defined in this file.
defaults:
  - _self_
  - cluster: slurm # set to `slurm`, `k8s` or `sm_jobs`, depending on the desired cluster
  - recipes: training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain # select desired config inside the training directory
  - override hydra/job_logging: stdout

cluster_type: slurm # bcm, bcp, k8s or sm_jobs. If bcm, k8s or sm_jobs, it must match - cluster above.
# If using sm_jobs cluster_type, set sm_jobs_config. See cluster/sm_jobs.yaml for example.
hydra:
  run:
    dir: . # run in the current directory (presumably to avoid Hydra's default timestamped outputs/ dir — see Hydra docs)
  output_subdir: null # null disables writing the .hydra/ config-snapshot subdirectory

debug: false # use canonical lowercase YAML booleans (was `False`; same parsed value, matches `update_adapter` below)
instance_type: p5.48xlarge
base_results_dir: null # Location to store the results, checkpoints and logs.
container: null # container image to run jobs in; null means none is configured here
# Optional git source for the training code; all-null means use the local checkout.
git:
  repo_url_or_path: null
  branch: null
  commit: null
  entry_script: null
  token: null # access token for private repos — avoid committing a real value to VCS
update_adapter: false # if true it will re-install the Adapter code but not its dependencies

# Environment variables exported into the job.
env_vars:
  NCCL_DEBUG: WARN # Logging level for NCCL. Set to "INFO" for debug information

# Do not modify below, use the values above instead.
training_config: ${hydra:runtime.choices.recipes} # resolves to the recipe chosen in `defaults` above