infra/4-training/nemo-example/nemo-configurations/gpt-5b.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

run:
  name: GPT-5B

trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false # logger provided by exp_manager
  enable_checkpointing: false
  # replace_sampler_ddp: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 50 # consumed_samples = global_step * global_batch_size
  max_time: "05:00:00:00"
  log_every_n_steps: 1
  val_check_interval: 50
  limit_val_batches: 0.0
  limit_test_batches: 10
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  # enable_progress_bar: true

exp_manager:
  explicit_log_dir: null
  name: megatron_gpt
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5

model:
  micro_batch_size: 4
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null # interleaved pipeline
  resume_from_checkpoint: null # manually set the checkpoint file to load from

  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 24
  hidden_size: 4096
  ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size.
  num_attention_heads: 32
  init_method_std: 0.01 # Standard deviation of the zero-mean normal distribution used for weight initialization.
  hidden_dropout: 0.0 # Dropout probability for the hidden state transformer.
  attention_dropout: 0.0
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
  apply_query_key_layer_scaling: true # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: true # add embedding
  post_process: true # add pooler
  persist_layer_norm: true # Use the persistent fused layer norm kernel.
  gradient_as_bucket_view: true # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  # Fusion
  grad_div_ar_fusion: true # Fuse grad division into torch.distributed.all_reduce
  gradient_accumulation_fusion: true # Fuse weight gradient accumulation to GEMMs
  bias_activation_fusion: true # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
  bias_dropout_add_fusion: true # Use a kernel that fuses the bias addition, dropout and residual connection addition.
  masked_softmax_fusion: true # Use a kernel that fuses the attention softmax with its mask.

  ## Activation Checkpointing
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block'
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null

  ## Sequence Parallelism
  sequence_parallel: false

  overlap_p2p_comm: false # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
  batch_p2p_comm: true # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.

  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    delimiter: null # only used for tabular tokenizer
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: false # Move residual connections to fp32
  fp16_lm_cross_entropy: false # Move the unreduced cross-entropy loss calculation for the LM head to fp16

  # Megatron O2-style half-precision
  megatron_amp_O2: true # Enable O2-level automatic mixed precision using master parameters
  grad_allreduce_chunk_size_mb: 125

  mcore_gpt: true

  ## Transformer Engine
  # To use fp8, set `transformer_engine: true` and `fp8: true`.
  # The remaining fp8 knobs configure fp8 training mode and are ignored in non-fp8 training.
  transformer_engine: true
  fp8: false # enables fp8 in TransformerLayer forward
  fp8_e4m3: false # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: true # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
  fp8_wgrad: true
  # use_emha: false
  ub_tp_comm_overlap: false

  # miscellaneous
  seed: 1234
  sync_batch_comm: false
  use_cpu_initialization: false # Init weights on the CPU (slow for large models)
  onnx_safe: false # Use work-arounds for known problems with the Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level; displays logs with severity greater than or equal to this

  # Nsys profiling options
  nsys_profile:
    enabled: false
    trace: [nvtx, cuda]
    start_step: 10 # Global batch to start profiling
    end_step: 12 # Global batch to end profiling
    ranks: [0] # Global rank IDs to profile
    gen_shape: false # Generate model and kernel details including input shapes

  optim:
    name: distributed_fused_adam
    bucket_cap_mb: 400
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    # grad_sync_dtype: bf16
    lr: 1.6e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 115
      constant_steps: 12500
      min_lr: 1.6e-5

  data:
    exchange_indices_distributed: true
    data_impl: mmap
    splits_string: "99990,8,2"
    seq_length: 2048
    skip_warmup: true
    num_workers: 2
    dataloader_type: single # cyclic
    reset_position_ids: false # Reset position ids after end-of-document token
    reset_attention_mask: false # Reset attention mask after end-of-document token
    eod_mask_loss: false # Mask loss for the end-of-document tokens
    # index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
    index_mapping_dir: /nfs
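    # Note: data_prefix is not set in this file and is presumably supplied at launch time
    # (for example, as a Hydra override). The lines below are only an illustrative sketch of
    # one format NeMo accepts (alternating weight and preprocessed dataset prefix); the path
    # is hypothetical and not part of this config:
    # data_prefix:
    #   - 1.0
    #   - /nfs/my-dataset_text_document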