in deepseek_vl2/models/modeling_deepseek.py
def __init__(self, config: DeepseekV2Config, layer_idx: int):
    super().__init__()
    self.hidden_size = config.hidden_size

    # Select the attention backend: Multi-head Latent Attention (MLA) or
    # standard multi-head attention (MHA), prefixed onto the configured
    # kernel implementation (e.g. "eager" or "flash_attention_2").
    if config.use_mla:
        attn_implementation = "mla_" + config._attn_implementation
    else:
        attn_implementation = "mha_" + config._attn_implementation
    self.self_attn = ATTENTION_CLASSES[attn_implementation](
        config=config, layer_idx=layer_idx
    )

    # Use a Mixture-of-Experts FFN on this layer only when routed experts
    # are configured, the first `first_k_dense_replace` layers have been
    # passed, and the layer index lands on the `moe_layer_freq` cadence;
    # otherwise fall back to a dense MLP.
    self.mlp = (
        DeepseekV2MoE(config)
        if (
            config.n_routed_experts is not None
            and layer_idx >= config.first_k_dense_replace
            and layer_idx % config.moe_layer_freq == 0
        )
        else DeepseekV2MLP(config)
    )

    # RMSNorm applied before attention and before the MLP (pre-norm layout).
    self.input_layernorm = DeepseekV2RMSNorm(
        config.hidden_size, eps=config.rms_norm_eps
    )
    self.post_attention_layernorm = DeepseekV2RMSNorm(
        config.hidden_size, eps=config.rms_norm_eps
    )
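
A minimal sketch of the MoE placement rule used above: with routed experts configured, a layer gets DeepseekV2MoE only once layer_idx reaches first_k_dense_replace, and then only on every moe_layer_freq-th layer. The config values below are hypothetical, chosen to make the cadence visible, not DeepSeek-VL2's actual defaults.

n_layers = 8
n_routed_experts = 64       # hypothetical; any non-None value enables routing
first_k_dense_replace = 1   # hypothetical: keep layer 0 dense
moe_layer_freq = 2          # hypothetical: MoE on every second eligible layer

for layer_idx in range(n_layers):
    use_moe = (
        n_routed_experts is not None
        and layer_idx >= first_k_dense_replace
        and layer_idx % moe_layer_freq == 0
    )
    print(f"layer {layer_idx}: {'DeepseekV2MoE' if use_moe else 'DeepseekV2MLP'}")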