recipes_collection/recipes/training/custom_model/falcon.yaml

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Basic run information configs
run:
  name: falcon-7b
  results_dir: ${base_results_dir}/${.name}
  time_limit: "6-00:00:00"
  model_type: hf # huggingface for our recipes

# Basic PyTorch Lightning trainer config
trainer:
  devices: 8
  num_nodes: 2
  accelerator: gpu
  precision: bf16
  max_steps: 50
  log_every_n_steps: 10
  val_check_interval: 1
  limit_val_batches: 0 # Number of batches per validation run; set to 0 to disable validation.

# Basic PyTorch Lightning experiment config
# Config for checkpoint/tensorboard etc.
exp_manager:
  exp_dir: null
  name: experiment
  # experiment loggers
  create_tensorboard_logger: False
  summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"}
  create_mlflow_logger: False
  mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"}
  create_wandb_logger: False
  wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default
  create_checkpoint_callback: True
  # Configs to save checkpoints at a fixed interval
  # Note: these configs will not work with auto checkpoint mode
  checkpoint_callback_params:
    # Set save_top_k = 0 to disable sharded checkpointing
    save_top_k: 0
    every_n_train_steps: 10
    monitor: "step"
    mode: "max"
    save_last: False
  checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/
  resume_from_checkpoint: null
  # Set auto_checkpoint = False to disable auto resilience checkpointing
  # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint
  auto_checkpoint:
    enabled: False
  export_full_model:
    # Set every_n_train_steps = 0 to disable full checkpointing
    every_n_train_steps: 0
    save_last: True

use_smp_model: False # enable SMP
distributed_backend: nccl

# Start training from pretrained model
model:
  model_type: falcon
  do_finetune: False
  hf_model_name_or_path: "tiiuae/falcon-7b"
  hf_access_token: None
  train_batch_size: 1
  val_batch_size: 1
  seed: 12345
  grad_clip: 1.0
  use_flash_attention: True
  activation_checkpointing: True

  # FSDP configs
  sharding_strategy: hybrid_shard
  forward_prefetch: True
  shard_degree: 16
  backward_fetch_policy: backward_pre
  auto_wrap_policy: transformer_auto_wrap_policy
  limit_all_gathers: true
  use_orig_param: False

  # model architecture
  max_context_width: 2048
  precision: bf16
  lr_decay_iters: 47683
  log_reduced_training_loss: True

  # PEFT
  peft:
    peft_type: null # lora

  # Optimizer
  optim:
    name: adamw
    lr: 2e-4
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 0
      min_lr: 2e-5

  # Data
  data:
    train_dir: null
    val_dir: null
    dataset_type: hf
    use_synthetic_data: False
    zipped_data: False

  # Viztracer
  viztracer:
    enabled: false
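
For quick experimentation it can help to load and override this recipe programmatically before handing it to a launcher. The sketch below is a minimal example, assuming OmegaConf (the config library behind Hydra-style launchers) is available; the file path, override keys, and values are illustrative choices, not part of the recipe or of any launcher API.

# Minimal sketch (assumption): load the recipe with OmegaConf and apply a few
# overrides for a single-node smoke test. Keys and values here are illustrative.
from omegaconf import OmegaConf

# Interpolations such as ${base_results_dir} and ${recipes.exp_manager.exp_dir}
# stay unresolved until a launcher supplies them, so plain loading is safe.
recipe = OmegaConf.load("recipes_collection/recipes/training/custom_model/falcon.yaml")

overrides = OmegaConf.create({
    "trainer": {"num_nodes": 1, "max_steps": 10},
    "model": {"max_context_width": 1024, "train_batch_size": 2},
})
merged = OmegaConf.merge(recipe, overrides)

print(merged.trainer.max_steps)               # 10
print(OmegaConf.to_yaml(merged.model.optim))  # adamw + CosineAnnealing schedule

As configured, devices: 8 and num_nodes: 2 give a world size of 16 GPUs; since shard_degree: 16 matches the world size, hybrid_shard effectively behaves like full sharding across all ranks, and absent gradient accumulation each step processes roughly 16 × train_batch_size sequences of max_context_width tokens.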