in src/nanotron/config/models_config.py [0:0]
def __post_init__(self):
    # NOTE: the user does not set self._init_method; ModelArgs sets it,
    # and then we only pass LlamaConfig around
    self._is_using_mup: bool = False
    # self._init_method: Optional[Union[RandomInit, SpectralMupInit, ExistingCheckpointInit]] = None
    # For backward compatibility, default num_key_value_heads to num_attention_heads
    if self.num_key_value_heads is None:
        self.num_key_value_heads = self.num_attention_heads
    # The sentinel value [-1] means every layer should be an MoE layer
    if self.moe_config and self.moe_config.layers == [-1]:
        self.moe_config.layers = list(range(self.num_hidden_layers))
    # Validate that the attention implementation is valid
    if self._attn_implementation is not None:
        assert (
            self._attn_implementation in ALL_ATTENTION_FUNCTIONS
        ), f"Invalid attention implementation: {self._attn_implementation}. Available options are: {ALL_ATTENTION_FUNCTIONS.keys()}"
    if self.sliding_window_size is not None:
        assert self._attn_implementation in [
            "flex_attention",
            "flash_attention_2",
            "llama3_ring_attention",
        ], "Sliding window is only supported for Flex Attention, Flash Attention 2, and Llama3 ring attention"
    if self.flex_attention_mask is not None:
        assert (
            self._attn_implementation == "flex_attention"
        ), "Flex attention mask is only supported for flex attention"
        assert self.flex_attention_mask in [
            "sliding_window",
            "document",
            "sliding_window_document",
        ], "Flex attention mask must be one of ['sliding_window', 'document', 'sliding_window_document']"
    if self.no_rope_layer is not None:
        assert (
            self.num_hidden_layers % self.no_rope_layer == 0
        ), "num_hidden_layers must be a multiple of no_rope_layer"
if self._attn_implementation == "llama3_ring_attention":
assert self.ring_attn_heads_k_stride is not None, "ring_attn_heads_k_stride must be specified for llama3 ring attention"
else:
assert self.ring_attn_heads_k_stride is None, f"ring_attn_heads_k_stride must be None for non-llama3 ring attention, got attn_implementation={self._attn_implementation}"
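
# --- Usage sketch (illustrative only, not part of the class above) --------
# A minimal, self-contained mock of the defaults applied in __post_init__,
# so the behaviour can be exercised without importing nanotron. Names with
# the "_Sketch" prefix are hypothetical and only mirror the fields touched
# by the validation above.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class _SketchMoEConfig:
    layers: List[int] = field(default_factory=lambda: [-1])


@dataclass
class _SketchConfig:
    num_hidden_layers: int
    num_attention_heads: int
    num_key_value_heads: Optional[int] = None
    moe_config: Optional[_SketchMoEConfig] = None
    no_rope_layer: Optional[int] = None

    def __post_init__(self):
        # Same defaults/validation as LlamaConfig.__post_init__ above
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
        if self.moe_config and self.moe_config.layers == [-1]:
            self.moe_config.layers = list(range(self.num_hidden_layers))
        if self.no_rope_layer is not None:
            assert self.num_hidden_layers % self.no_rope_layer == 0


cfg = _SketchConfig(
    num_hidden_layers=12,
    num_attention_heads=16,
    moe_config=_SketchMoEConfig(),  # layers=[-1] -> expanded to all layers
    no_rope_layer=4,                # OK: 12 % 4 == 0
)
assert cfg.num_key_value_heads == 16             # defaulted to num_attention_heads
assert cfg.moe_config.layers == list(range(12))  # [-1] expanded to all 12 layers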