infra/4-training/nemo-example/nemo-configurations/gpt-175b.yaml

# Copyright 2024 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

run:
  name: gpt3_175b_google_tuned
  time_limit: "26-00:00:00"
  dependency: "singleton"

trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false # logger provided by exp_manager
  enable_checkpointing: false
  replace_sampler_ddp: false
  max_epochs: null
  max_steps: 100 # consumed_samples = global_step * global_batch_size
  max_time: "25:23:00:00"
  log_every_n_steps: 1
  val_check_interval: 100
  limit_val_batches: 5
  limit_test_batches: 20
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: null
  name: megatron_gpt
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 5
    mode: min
    always_save_nemo: false # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5

model:
  micro_batch_size: 1
  global_batch_size: 2048
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 8
  virtual_pipeline_model_parallel_size: 2 # interleaved pipeline, set to maximum
  resume_from_checkpoint: null # manually set the checkpoint file to load from
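
  # A rough parallelism sketch for the settings above, assuming the 256-rank
  # (32 nodes x 8 GPUs) job implied by the nsys_profile rank list further below:
  #   model-parallel group size = tensor_model_parallel_size * pipeline_model_parallel_size = 4 * 8 = 32
  #   data-parallel size        = 256 ranks / 32 = 8
  #   micro-batches per step    = global_batch_size / (micro_batch_size * data-parallel size) = 2048 / (1 * 8) = 256
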
  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 96
  hidden_size: 12288
  ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size.
  num_attention_heads: 96
  init_method_std: 0.006 # Standard deviation of the zero-mean normal distribution used for weight initialization.
  hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
  apply_query_key_layer_scaling: true # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: true # add embedding
  post_process: true # add pooler
  persist_layer_norm: true # Use of persistent fused layer norm kernel.
  gradient_as_bucket_view: true # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  # Fusion
  grad_div_ar_fusion: true # Fuse grad division into torch.distributed.all_reduce
  gradient_accumulation_fusion: true # Fuse weight gradient accumulation to GEMMs
  bias_activation_fusion: true # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
  bias_dropout_add_fusion: true # Use a kernel that fuses the bias addition, dropout and residual connection addition.
  masked_softmax_fusion: true # Use a kernel that fuses the attention softmax with its mask.

  ## Activation Checkpointing
  activations_checkpoint_granularity: selective # 'selective' or 'full'
  activations_checkpoint_method: block # 'uniform', 'block'
  activations_checkpoint_num_layers: 0
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null

  ## Sequence Parallelism
  sequence_parallel: true

  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    delimiter: null # only used for tabular tokenizer
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: false # Move residual connections to fp32
  fp16_lm_cross_entropy: false # Move the unreduced cross-entropy loss calculation for the LM head to fp16

  # Megatron O2-style half-precision
  megatron_amp_O2: true # Enable O2-level automatic mixed precision using master parameters
  grad_allreduce_chunk_size_mb: 125

  ## Transformer Engine
  # To use fp8, set `transformer_engine=true` and `fp8=true`.
  # The remaining fp8 knobs configure fp8 training and are ignored in non-fp8 training.
  transformer_engine: true
  fp8: true # enables fp8 in TransformerLayer forward
  fp8_e4m3: false # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: true # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
  use_emha: false

  # ub_tp_comm_overlap: Setting this to true enables pipelined tensor-parallel communication overlap intra-node.
  # In Slurm, it needs the --mpi=pmix setting in srun. In GCE/GKE, it would still require some MPI setup.
  # Setting it to false is expected to add roughly 3% to training time.
  ub_tp_comm_overlap: false
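
  # Background on the Transformer Engine block above, assuming standard TE semantics:
  # fp8_hybrid selects recipe.Format.HYBRID, i.e. E4M3 for forward-pass tensors and
  # E5M2 for backward-pass gradients; per-tensor scaling factors are derived from an
  # amax history of fp8_amax_history_len steps reduced with fp8_amax_compute_algo.
  # FP8 execution requires Hopper-class GPUs (e.g. H100).
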
  # miscellaneous
  seed: 1234
  sync_batch_comm: false
  use_cpu_initialization: false # Init weights on the CPU (slow for large models)
  onnx_safe: false # Use work-arounds for known problems with Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level; displays logs with severity greater than or equal to this
  overlap_p2p_comm: true # Overlap p2p communication with compute
  batch_p2p_comm: false # Batch consecutive inter-peer send/recv operations
  gc_interval: 100 # Interval of the host memory garbage collection

  # Nsys profiling options
  nsys_profile:
    enabled: false
    trace: [nvtx, cuda]
    start_step: 10 # Global batch to start profiling
    end_step: 12 # Global batch to end profiling
    # 1st pipeline group (at tensor parallel index 0): [0, 32, 64, 96, 128, 160, 192, 224]
    # 1st FP8 group (superset of first four DPG groups): [0, 1, ..., 31]
    ranks: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 64, 96, 128, 160, 192, 224] # Global rank IDs to profile
    gen_shape: false # Generate model and kernel details including input shapes

  optim:
    name: distributed_fused_adam
    bucket_cap_mb: 200
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    grad_sync_dtype: bf16
    lr: 0.9e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 115
      constant_steps: 12500
      min_lr: 0.9e-5

  data:
    data_impl: mmap
    splits_string: "99990,8,2"
    seq_length: 2048
    skip_warmup: true
    num_workers: 4
    exchange_indices_distributed: true
    dataloader_type: single # cyclic
    reset_position_ids: false # Reset position ids after end-of-document token
    reset_attention_mask: false # Reset attention mask after end-of-document token
    eod_mask_loss: false # Mask loss for the end of document tokens
    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
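
    # A few derived numbers for the optim/data settings above, assuming standard
    # Megatron/NeMo semantics:
    #   - splits_string "99990,8,2" splits the dataset into train/validation/test
    #     in roughly 99.99% / 0.008% / 0.002% proportions.
    #   - each training step consumes global_batch_size * seq_length = 2048 * 2048
    #     (about 4.2M) tokens, so the 100-step run above is a short benchmark rather
    #     than a full pretraining schedule.
    #   - distributed_fused_adam is the distributed (state-sharded) fused Adam
    #     optimizer; gradient/parameter synchronization is overlapped with compute
    #     via the overlap_grad_sync / overlap_param_sync flags.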