# Copyright 2024 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
run:
  name: GPT-5B
trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false # logger provided by exp_manager
  enable_checkpointing: false
  # replace_sampler_ddp: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 50 # consumed_samples = global_step * global_batch_size
  max_time: "05:00:00:00"
  log_every_n_steps: 1
  val_check_interval: 50
  limit_val_batches: 0.0
  limit_test_batches: 10
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  # enable_progress_bar: true
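  # A quick sizing check derived only from the values in this file (simple
  # arithmetic, not additional configuration): with max_steps = 50 and
  # model.global_batch_size = 2048, the run consumes 50 * 2048 = 102,400
  # sequences; at a sequence length of 2048 tokens that is about 210M tokens.
  # Note that limit_val_batches: 0.0 effectively disables validation batches.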
exp_manager:
  explicit_log_dir: null
  name: megatron_gpt
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5
model:
  micro_batch_size: 4
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null # interleaved pipeline
  resume_from_checkpoint: null # manually set the checkpoint file to load from
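  # How these batch settings compose (an illustrative calculation that assumes
  # a single node, so that trainer.devices = 8 is the total GPU count;
  # num_nodes is not set in this file): with tensor and pipeline parallel
  # sizes of 1, all 8 GPUs are data-parallel replicas, so each global step
  # runs global_batch_size / (micro_batch_size * 8) = 2048 / 32 = 64
  # micro-batches per replica, accumulated internally by the Megatron loop
  # (trainer.accumulate_grad_batches stays at 1).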
  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 24
  hidden_size: 4096
  ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size (4 * hidden_size).
  num_attention_heads: 32
  init_method_std: 0.01 # Standard deviation of the zero-mean normal distribution used for weight initialization.
  hidden_dropout: 0.0 # Dropout probability for transformer hidden states.
  attention_dropout: 0.0
  kv_channels: null # Projection weight dimension in multi-head attention. Set to hidden_size // num_attention_heads if null.
  apply_query_key_layer_scaling: true # Scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computational efficiency.
  pre_process: true # add embedding
  post_process: true # add pooler
  persist_layer_norm: true # Use the persistent fused layer norm kernel.
  gradient_as_bucket_view: true # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
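  # Rough parameter count implied by this architecture (a back-of-the-envelope
  # estimate, ignoring biases, layer norms, and position embeddings): each
  # layer holds about 4*hidden_size^2 attention weights plus 8*hidden_size^2
  # MLP weights, so 24 layers * 12 * 4096^2 ≈ 4.8B, and the padded GPT-2
  # vocabulary embedding (50,304 * 4096 ≈ 0.2B) brings the total to roughly
  # 5B, matching the run name GPT-5B.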
  # Fusion
  grad_div_ar_fusion: true # Fuse grad division into torch.distributed.all_reduce
  gradient_accumulation_fusion: true # Fuse weight gradient accumulation into GEMMs
  bias_activation_fusion: true # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
  bias_dropout_add_fusion: true # Use a kernel that fuses the bias addition, dropout, and residual connection addition.
  masked_softmax_fusion: true # Use a kernel that fuses the attention softmax with its mask.
  ## Activation Checkpointing
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block'
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  ## Sequence Parallelism
  sequence_parallel: false
  overlap_p2p_comm: false # Overlap p2p communication with computation. Only valid when `virtual_pipeline_model_parallel_size` is larger than 1.
  batch_p2p_comm: true # Batch consecutive inter-peer send/recv operations. Only valid when `virtual_pipeline_model_parallel_size` is larger than 1.
  num_query_groups: null # Number of query groups for grouped-query attention. If null, standard multi-head attention is used.
  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    delimiter: null # only used for tabular tokenizer
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt
  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: false # Move residual connections to fp32
  fp16_lm_cross_entropy: false # Move the unreduced cross-entropy loss calculation for the LM head to fp16
  # Megatron O2-style half-precision
  megatron_amp_O2: true # Enable O2-level automatic mixed precision using master parameters
  grad_allreduce_chunk_size_mb: 125
  mcore_gpt: true
  ## Transformer Engine
  # To use fp8, set `transformer_engine: true` and `fp8: true`.
  # The remaining fp8 knobs configure fp8 training and are ignored when fp8 is disabled.
  transformer_engine: true
  fp8: false # enables fp8 in TransformerLayer forward
  fp8_e4m3: false # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: true # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
  fp8_wgrad: true
  # use_emha: false
  ub_tp_comm_overlap: false
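  # Background on the fp8 settings above (informational; they are inert while
  # fp8: false): fp8_hybrid selects Transformer Engine's HYBRID recipe, which
  # uses the E4M3 format for forward-pass tensors and E5M2 for gradients in
  # the backward pass, with per-tensor scaling factors derived from the last
  # fp8_amax_history_len amax values via fp8_amax_compute_algo.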
  # miscellaneous
  seed: 1234
  sync_batch_comm: false
  use_cpu_initialization: false # Init weights on the CPU (slow for large models)
  onnx_safe: false # Use work-arounds for known problems with the Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level; displays logs with severity greater than or equal to this
  # Nsys profiling options
  nsys_profile:
    enabled: false
    trace: [nvtx, cuda]
    start_step: 10 # Global batch to start profiling
    end_step: 12 # Global batch to end profiling
    ranks: [0] # Global rank IDs to profile
    gen_shape: false # Generate model and kernel details including input shapes
  optim:
    name: distributed_fused_adam
    bucket_cap_mb: 400
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    # grad_sync_dtype: bf16
    lr: 1.6e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 115
      constant_steps: 12500
      min_lr: 1.6e-5
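    # Schedule shape, for reference (an observation about these numbers, not a
    # behavioural change): the learning rate warms up linearly to 1.6e-4 over
    # warmup_steps, then follows a cosine decay towards min_lr = 1.6e-5.
    # warmup_steps (115) and constant_steps (12500) appear to be sized for a
    # much longer production run; with trainer.max_steps set to 50 above, this
    # short benchmark run ends while still in warmup.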
  data:
    exchange_indices_distributed: true
    data_impl: mmap
    splits_string: "99990,8,2"
    seq_length: 2048
    skip_warmup: true
    num_workers: 2
    dataloader_type: single # cyclic
    reset_position_ids: false # Reset position ids after end-of-document token
    reset_attention_mask: false # Reset attention mask after end-of-document token
    eod_mask_loss: false # Mask loss for end-of-document tokens
    # index_mapping_dir: null # path to save index mapping .npy files; by default saved in the same location as data_prefix
    index_mapping_dir: /nfs
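    # splits_string gives relative weights for the train/validation/test split
    # of the tokenized corpus: "99990,8,2" sums to 100,000, i.e. roughly
    # 99.99% train, 0.008% validation, and 0.002% test (simple arithmetic on
    # the value above).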