in optimum/neuron/models/training/llama/modeling_llama.py
def __init__(self, config, trn_config: TrainingNeuronConfig):
    nn.Module.__init__(self)
    self.config = config
    self.trn_config = trn_config
    self.hidden_size = config.hidden_size
    self.intermediate_size = config.intermediate_size
    self.act_fn = ACT2FN[config.hidden_act]

    # The gate/up projections are sharded column-wise, so the intermediate size
    # must split evenly across tensor-parallel ranks.
    tp_size = get_tensor_model_parallel_size()
    if self.intermediate_size % tp_size != 0:
        raise RuntimeError(
            f"Intermediate size {self.intermediate_size} must be divisible by the tensor model parallel size "
            f"{tp_size}."
        )
    self.split_size = self.intermediate_size // tp_size

    # Normal initialization scaled by the configured initializer range.
    init_method = partial(_init_normal, config.initializer_range)

    # Defines the MLP weight transformation specs: the gate_proj and up_proj
    # weights from the original checkpoint are fused column-wise into a single
    # gate_up_proj weight.
    self.specs = ModelWeightTransformationSpecs(
        specs=FusedLinearsSpec(
            fused_linear_name="gate_up_proj",
            linear_names=["gate_proj", "up_proj"],
            bias=False,
            fuse_axis="column",
            original_dims=[self.intermediate_size] * 2,
        )
    )

    # Fused gate/up projection, sharded along the output (column) dimension.
    # gather_output=False keeps each rank's shard local so it can feed the
    # row-parallel down_proj directly.
    self.gate_up_proj = ColumnParallelLinear(
        self.hidden_size,
        2 * self.intermediate_size,
        stride=2,
        bias=False,
        gather_output=False,
        init_method=init_method,
        sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled,
        sequence_dimension=0,
        dtype=self.config.torch_dtype,
    )

    # Down projection, sharded along the input (row) dimension; it consumes the
    # still-parallel output of gate_up_proj (input_is_parallel=True).
    self.down_proj = RowParallelLinear(
        self.intermediate_size,
        self.hidden_size,
        bias=False,
        input_is_parallel=True,
        init_method=init_method,
        sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled,
        sequence_dimension=0,
        dtype=self.config.torch_dtype,
    )
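
For reference, a minimal sketch of how a fused gate_up_proj layout like this is typically consumed in the forward pass: the local shard is split into gate and up halves of split_size each, combined with the SwiGLU activation, and reduced back through down_proj. This is an illustrative sketch under those assumptions, not necessarily the exact forward method in this file.

# Illustrative sketch (assumed, not copied from the file): standard SwiGLU
# forward over the fused, tensor-parallel projections defined above.
def forward(self, hidden_states):
    # Each rank holds a [..., 2 * split_size] shard of the fused projection.
    gate_up = self.gate_up_proj(hidden_states)
    # Split the local shard back into its gate and up halves.
    gate, up = gate_up.split(self.split_size, dim=-1)
    # SwiGLU: act_fn(gate) * up; the row-parallel down_proj then reduces the
    # partial results across tensor-parallel ranks.
    return self.down_proj(self.act_fn(gate) * up)

Fusing the two projections into a single column-parallel GEMM keeps the gate and up shards on the same rank, so the elementwise SwiGLU needs no communication; the only forward all-reduce of the block happens inside the row-parallel down_proj.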