# infra/4-training/nemo-example/nemo-configurations/gpt-175b.yaml
# Copyright 2024 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
run:
  name: gpt3_175b_google_tuned
  time_limit: "26-00:00:00"
  dependency: "singleton"
trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false # logger provided by exp_manager
  enable_checkpointing: false
  replace_sampler_ddp: false
  max_epochs: null
  max_steps: 100 # consumed_samples = global_step * global_batch_size
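  # With global_batch_size = 2048 and seq_length = 2048, 100 steps consume roughly
  # 100 * 2048 * 2048 ≈ 0.42B tokens, so this is a short benchmark-style run rather
  # than a full pretraining schedule.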
  max_time: "25:23:00:00"
  log_every_n_steps: 1
  val_check_interval: 100
  limit_val_batches: 5
  limit_test_batches: 20
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: null
  name: megatron_gpt
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 5
    mode: min
    always_save_nemo: false # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: false # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
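    # Resolves to 4 * 8 = 32 with the model settings below, i.e. each model replica
    # spans 32 GPUs (four 8-GPU nodes).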
  log_step_timing: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5
model:
  micro_batch_size: 1
  global_batch_size: 2048
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 8
  virtual_pipeline_model_parallel_size: 2 # interleaved pipeline, set to maximum
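  # Parallelism sketch (assuming the 256-GPU layout implied by the nsys_profile ranks below):
  # TP(4) x PP(8) = 32 GPUs per replica, so the data-parallel size is 256 / 32 = 8 and each
  # replica accumulates global_batch_size / (data_parallel * micro_batch_size) = 2048 / 8 = 256
  # micro-batches per optimizer step.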
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 96
  hidden_size: 12288
  ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size.
  num_attention_heads: 96
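  # These match the published GPT-3 175B dimensions: head dim = 12288 / 96 = 128, and
  # roughly 12 * num_layers * hidden_size^2 ≈ 174B transformer weights plus embeddings ≈ 175B.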
  init_method_std: 0.006 # Standard deviation of the zero-mean normal distribution used for weight initialization.
  hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
  apply_query_key_layer_scaling: true # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: true # add embedding
  post_process: true # add pooler
  persist_layer_norm: true # Use of persistent fused layer norm kernel.
  gradient_as_bucket_view: true # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
  # Fusion
  grad_div_ar_fusion: true # Fuse grad division into torch.distributed.all_reduce
  gradient_accumulation_fusion: true # Fuse weight gradient accumulation to GEMMs
  bias_activation_fusion: true # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
  bias_dropout_add_fusion: true # Use a kernel that fuses the bias addition, dropout and residual connection addition.
  masked_softmax_fusion: true # Use a kernel that fuses the attention softmax with its mask.
  ## Activation Checkpointing
  activations_checkpoint_granularity: selective # 'selective' or 'full'
  activations_checkpoint_method: block # 'uniform', 'block'
  activations_checkpoint_num_layers: 0
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  ## Sequence Parallelism
  sequence_parallel: true
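  # With sequence_parallel enabled, layernorm and dropout activations are sharded along the
  # sequence dimension across the 4 tensor-parallel ranks, reducing per-GPU activation memory;
  # it pairs with 'selective' checkpointing, which recomputes only the core-attention activations.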
  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    delimiter: null # only used for tabular tokenizer
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt
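    # Standard GPT-2 BPE artifacts (~50k-token vocab, padded per make_vocab_size_divisible_by);
    # these relative paths are assumed to be resolvable from the job's working directory or
    # overridden at launch.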
  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: false # Move residual connections to fp32
  fp16_lm_cross_entropy: false # Move the cross entropy unreduced loss calculation for lm head to fp16
  # Megatron O2-style half-precision
  megatron_amp_O2: true # Enable O2-level automatic mixed precision using master parameters
  grad_allreduce_chunk_size_mb: 125
  ## Transformer Engine
  # To use fp8, please set `transformer_engine=true` and `fp8=true`.
  # The remaining fp8 knobs apply only when fp8 training is enabled and are ignored otherwise.
  transformer_engine: true
  fp8: true # enables fp8 in TransformerLayer forward
  fp8_e4m3: false # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: true # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
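  # With these settings Transformer Engine uses the HYBRID recipe (E4M3 for forward
  # activations/weights, E5M2 for gradients) with delayed scaling: each tensor's scale is
  # derived from the maximum amax recorded over the previous 1024 steps.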
  use_emha: false
  # ub_tp_comm_overlap: setting this to true enables pipelined tensor-parallel communication overlap intra-node.
  # On Slurm it requires --mpi=pmix in srun; on GCE/GKE it would still need some MPI setup.
  # Leaving it false is expected to add roughly 3% to training time.
  ub_tp_comm_overlap: false
  # miscellaneous
  seed: 1234
  sync_batch_comm: false
  use_cpu_initialization: false # Init weights on the CPU (slow for large models)
  onnx_safe: false # Use work-arounds for known problems with Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level; only messages with severity greater than or equal to this are shown
  overlap_p2p_comm: true # Overlap p2p communication with compute
  batch_p2p_comm: false # Batch consecutive inter-peer send/recv operations
  gc_interval: 100 # Interval (in steps) between explicit host garbage collection runs
  # Nsys profiling options
  nsys_profile:
    enabled: false
    trace: [nvtx, cuda]
    start_step: 10 # Global batch to start profiling
    end_step: 12 # Global batch to end profiling
    # 1st pipeline group (at tensor parallel index 0): [0, 32, 64, 96, 128, 160, 192, 224]
    # 1st FP8 group (superset of first four DPG groups): [0, 1, ..., 31]
    ranks: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 64, 96, 128, 160, 192, 224] # Global rank IDs to profile
    gen_shape: false # Generate model and kernel details including input shapes
  optim:
    name: distributed_fused_adam
    bucket_cap_mb: 200
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    grad_sync_dtype: bf16
    lr: 0.9e-4
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 115
      constant_steps: 12500
      min_lr: 0.9e-5
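      # Note: trainer.max_steps (100) is below warmup_steps (115), so this benchmark run stays
      # in the linear-warmup phase; a full pretraining run would raise max_steps well past
      # warmup_steps + constant_steps so the cosine decay toward min_lr actually takes effect.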
  data:
    data_impl: mmap
    splits_string: "99990,8,2"
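    # Split weights sum to 100000: ~99.99% train, 0.008% validation, 0.002% test.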
    seq_length: 2048
    skip_warmup: true
    num_workers: 4
    exchange_indices_distributed: true
    dataloader_type: single # cyclic
    reset_position_ids: false # Reset position ids after end-of-document token
    reset_attention_mask: false # Reset attention mask after end-of-document token
    eod_mask_loss: false # Mask loss for the end of document tokens
    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
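    # data_prefix is not set here; it is assumed to be supplied as a launch-time override
    # pointing at the preprocessed Megatron .bin/.idx dataset.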