# recipes_collection/recipes/training/llama/megatron_llama3_1_8b_nemo.yaml
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
# Referred from https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/llama/llama3_1_8b.yaml
run:
  name: llama3-1-8b
  results_dir: ${base_results_dir}/${.name}
  time_limit: "0-01:30:00"
  dependency: "singleton"
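# Note: time_limit above uses the Slurm duration format days-HH:MM:SS, so "0-01:30:00"
# requests 1 hour 30 minutes per submitted job.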
trainer:
  num_nodes: 16
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: null
  max_steps: 300000 # consumed_samples = global_step * global_batch_size
  max_time: "05:23:30:00" # days:hours:minutes:seconds
  log_every_n_steps: 10
  val_check_interval: 2000
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
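# With the model settings below (global_batch_size 2048, sequence length 8192), each optimizer
# step consumes 2048 * 8192 ≈ 16.8M tokens, so running the full 300000 max_steps corresponds
# to roughly 5T tokens.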
exp_manager:
  explicit_log_dir: ${training.run.results_dir}/results
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${training.run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5
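# model_parallel_size above resolves to tensor_model_parallel_size * pipeline_model_parallel_size
# (1 * 1 = 1 with the defaults below); the checkpoint callback uses it when managing
# model-parallel checkpoint shards.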
model:
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  context_parallel_size: 1
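  # With the parallelism above and the trainer settings (16 nodes x 8 GPUs = 128 GPUs),
  # data_parallel_size = 128 / (TP * PP * CP) = 128, so each global batch of 2048 samples is
  # built from 2048 / (micro_batch_size 1 * 128 DP ranks) = 16 accumulation micro-batches per rank.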
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 14336
  num_attention_heads: 32
  num_query_groups: 8
  init_method_std: 0.01
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  cross_entropy_loss_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
  scale_positional_embedding: true
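  # The attention settings above give grouped-query attention: 32 query heads sharing
  # 8 KV groups (4 query heads per KV head); kv_channels: null falls back to
  # hidden_size / num_attention_heads = 4096 / 32 = 128 per head.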
  tokenizer:
    library: 'sentencepiece'
    type: null
    model: <path_to_my_model>/tokenizer.model
    delimiter: null
    vocab_file: null
    merge_file: null
    sentencepiece_legacy: False
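  # <path_to_my_model> is a user-supplied placeholder: point `model` at the tokenizer file that
  # matches the base checkpoint before launching; the recipe does not ship one.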
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false
  deterministic_mode: false
  ## Transformer Engine
  transformer_engine: true
  fp8: False # enables fp8 in TransformerLayer forward
  fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
  use_emha: False
  ub_tp_comm_overlap: False
  use_flash_attention: true
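  # A minimal sketch for trying FP8 training on FP8-capable GPUs (e.g. H100), left commented out
  # because bf16 is the default here; these values are an assumption, not a tested setting:
  # fp8: True
  # fp8_hybrid: True # HYBRID = E4M3 in the forward pass, E5M2 for gradients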
  optim:
    name: distributed_fused_adam
    lr: 1e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 0
      min_lr: 1e-5
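  # Net schedule: the learning rate ramps up over the first 500 steps to 1e-4, then follows a
  # cosine decay toward min_lr 1e-5 across the remaining max_steps; constant_steps: 0 means no flat tail.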
  data:
    data_impl: mock
    splits_string: 99990,8,2
    seq_length: ${training.model.encoder_seq_length}
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: true
    reset_attention_mask: true
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: []
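    # data_impl: mock generates synthetic tokens and is useful for benchmarking only. A hedged
    # sketch for pointing at real, preprocessed Megatron data (the path below is a hypothetical placeholder):
    # data_impl: mmap
    # data_prefix:
    #   - 1.0
    #   - /fsx/datasets/my_corpus_text_document # expects the .bin/.idx pair produced by NeMo's preprocessing script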