# recipes_collection/recipes/training/llama/megatron_llama3_1_8b_nemo.yaml

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
# Referred from https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/llama/llama3_1_8b.yaml

run:
  name: llama3-1-8b
  results_dir: ${base_results_dir}/${.name}
  time_limit: "0-01:30:00"
  dependency: "singleton"

trainer:
  num_nodes: 16
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: null
  max_steps: 300000 # consumed_samples = global_step * global_batch_size
  max_time: "05:23:30:00" # days:hours:minutes:seconds
  log_every_n_steps: 10
  val_check_interval: 2000
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: ${training.run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${training.run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5

model:
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  context_parallel_size: 1
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 14336
  num_attention_heads: 32
  num_query_groups: 8
  init_method_std: 0.01
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  cross_entropy_loss_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
  scale_positional_embedding: true

  tokenizer:
    library: 'sentencepiece'
    type: null
    model: <path_to_my_model>/tokenizer.model
    delimiter: null
    vocab_file: null
    merge_file: null
    sentencepiece_legacy: False

  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false
  deterministic_mode: false
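
  # Note (informational comment, not read by NeMo): with
  # tensor_model_parallel_size, pipeline_model_parallel_size, and
  # context_parallel_size all set to 1, each of the 16 nodes * 8 devices
  # = 128 GPUs is a data-parallel replica. Under the standard Megatron
  # batch decomposition this implies:
  #   gradient-accumulation steps = global_batch_size / (micro_batch_size * DP)
  #                               = 2048 / (1 * 128) = 16
  #   tokens per global step      = global_batch_size * encoder_seq_length
  #                               = 2048 * 8192 = 16,777,216 (~16.8M)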

  ## Transformer Engine
  transformer_engine: true
  fp8: False # enables fp8 in TransformerLayer forward
  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
  use_emha: False
  ub_tp_comm_overlap: False
  use_flash_attention: true

  optim:
    name: distributed_fused_adam
    lr: 1e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 0
      min_lr: 1e-5

  data:
    data_impl: mock
    splits_string: 99990,8,2
    seq_length: ${training.model.encoder_seq_length}
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: true
    reset_attention_mask: true
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: []
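
    # Example (a sketch, not part of the recipe): `data_impl: mock` trains on
    # synthetic data. To train on real data, point at preprocessed Megatron
    # .bin/.idx files; `data_prefix` takes alternating blend weights and file
    # prefixes (the prefix of the .bin/.idx pair produced by NeMo's
    # scripts/nlp_language_modeling/preprocess_data_for_megatron.py). The path
    # below is hypothetical:
    #   data_impl: mmap
    #   data_prefix:
    #     - 1.0
    #     - /fsx/datasets/my_corpus/my_corpus_text_document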