megatron_patch/model/qwen2_vl/layer_specs.py (95 lines of code) (raw):

# Copyright (c) 2023 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.extensions.transformer_engine import (
    TEDotProductAttention,
    TELayerNormColumnParallelLinear,
    TENorm,
    TERowParallelLinear,
    TEColumnParallelLinear,
)

# Non-TE parallel linear layers; required by the use_te=False branch of
# get_mlp_module_spec below, which referenced them without importing them.
from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import (
    TransformerLayer,
    TransformerLayerSubmodules,
)
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.attention import SelfAttentionSubmodules

from .attention_vision import SelfAttention
from .attention import SelfAttention as Qwen2VLSelfAttention


# Use this spec to use lower-level Transformer Engine modules (required for fp8 training).
def get_gpt_layer_with_transformer_engine_spec(
    qk_layernorm: bool = False,
) -> ModuleSpec:
    """Layer spec for the language-model (decoder) layers, built from TE modules."""
    mlp = get_mlp_module_spec(use_te=True, num_experts=None, moe_grouped_gemm=False)
    return ModuleSpec(
        module=TransformerLayer,
        submodules=TransformerLayerSubmodules(
            self_attention=ModuleSpec(
                module=Qwen2VLSelfAttention,
                params={"attn_mask_type": AttnMaskType.causal},
                submodules=SelfAttentionSubmodules(
                    linear_qkv=TELayerNormColumnParallelLinear,
                    core_attention=TEDotProductAttention,
                    linear_proj=TERowParallelLinear,
                    q_layernorm=TENorm if qk_layernorm else IdentityOp,
                    k_layernorm=TENorm if qk_layernorm else IdentityOp,
                ),
            ),
            self_attn_bda=get_bias_dropout_add,
            pre_mlp_layernorm=IdentityOp,
            mlp=mlp,
            mlp_bda=get_bias_dropout_add,
        ),
    )


def get_qwen2vl_vision_model_spec(is_vit=False) -> ModuleSpec:
    """Layer spec for the Qwen2-VL vision transformer blocks (unmasked attention)."""
    attn_mask_type = AttnMaskType.no_mask  # THD --> causal_pad
    mlp = ModuleSpec(
        module=MLP,
        submodules=MLPSubmodules(
            linear_fc1=TELayerNormColumnParallelLinear,
            linear_fc2=TERowParallelLinear,
        ),
    )
    return ModuleSpec(
        module=TransformerLayer,
        submodules=TransformerLayerSubmodules(
            self_attention=ModuleSpec(
                module=SelfAttention,
                params={"attn_mask_type": attn_mask_type},
                submodules=SelfAttentionSubmodules(
                    linear_qkv=TELayerNormColumnParallelLinear,
                    core_attention=TEDotProductAttention,
                    linear_proj=TERowParallelLinear,
                    q_layernorm=IdentityOp,
                    k_layernorm=IdentityOp,
                ),
            ),
            self_attn_bda=get_bias_dropout_add,
            pre_mlp_layernorm=IdentityOp,
            mlp=mlp,
            mlp_bda=get_bias_dropout_add,
        ),
    )


# Helper function to get the module spec for the MLP/MoE block.
def get_mlp_module_spec(
    use_te: bool = True,
    num_experts: int = None,
    moe_grouped_gemm: bool = False,
    add_norm: bool = True,
) -> ModuleSpec:
    if num_experts is None:
        # Dense MLP w/ or w/o TE modules.
        if add_norm:
            # With TE, linear_fc1 carries the fused pre-MLP layernorm.
            return ModuleSpec(
                module=MLP,
                submodules=MLPSubmodules(
                    linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear,
                    linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
                ),
            )
        else:
            return ModuleSpec(
                module=MLP,
                submodules=MLPSubmodules(
                    linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
                    linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
                ),
            )
    else:
        # Mixture of experts with modules in Megatron core.
        raise NotImplementedError()
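

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, with assumptions): the functions above only
# build ModuleSpec descriptions; Megatron-Core instantiates the actual layers
# from them together with a TransformerConfig. The snippet below merely
# constructs the decoder spec and inspects its wiring, which needs no GPU or
# parallel-state initialization.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    spec = get_gpt_layer_with_transformer_engine_spec(qk_layernorm=True)
    print(spec.module.__name__)                                   # TransformerLayer
    print(spec.submodules.self_attention.module.__name__)         # SelfAttention (Qwen2-VL decoder variant)
    print(spec.submodules.self_attention.submodules.q_layernorm)  # TENorm, because qk_layernorm=True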