def __init__()

in optimum/neuron/models/training/llama/modeling_llama.py


    def __init__(self, config, trn_config: TrainingNeuronConfig):
        nn.Module.__init__(self)
        self.config = config
        self.trn_config = trn_config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.act_fn = ACT2FN[config.hidden_act]

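        # The intermediate dimension is sharded across tensor-parallel ranks,
        # so it must divide evenly by the tensor parallel size.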
        tp_size = get_tensor_model_parallel_size()
        if self.intermediate_size % tp_size != 0:
            raise RuntimeError(
                f"Intermediate size {self.intermediate_size} must be divisible by the tensor model parallel size "
                f"{tp_size}."
            )
        self.split_size = self.intermediate_size // tp_size

        init_method = partial(_init_normal, config.initializer_range)

        # Weight transformation specs: map the fused gate_up_proj weight back to the
        # original gate_proj / up_proj parameters when checkpoints are converted.
        self.specs = ModelWeightTransformationSpecs(
            specs=FusedLinearsSpec(
                fused_linear_name="gate_up_proj",
                linear_names=["gate_proj", "up_proj"],
                bias=False,
                fuse_axis="column",
                original_dims=[self.intermediate_size] * 2,
            )
        )
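        # gate_proj and up_proj are fused into a single column-parallel linear;
        # stride=2 partitions the fused weight so every rank holds a slice of both
        # projections, and gather_output=False keeps the output sharded for down_proj.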
        self.gate_up_proj = ColumnParallelLinear(
            self.hidden_size,
            2 * self.intermediate_size,
            stride=2,
            bias=False,
            gather_output=False,
            init_method=init_method,
            sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled,
            sequence_dimension=0,
            dtype=self.config.torch_dtype,
        )
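        # down_proj consumes the already-sharded intermediate activations
        # (input_is_parallel=True) and reduces the partial outputs across ranks.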
        self.down_proj = RowParallelLinear(
            self.intermediate_size,
            self.hidden_size,
            bias=False,
            input_is_parallel=True,
            init_method=init_method,
            sequence_parallel_enabled=self.trn_config.sequence_parallel_enabled,
            sequence_dimension=0,
            dtype=self.config.torch_dtype,
        )
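
For context, here is a minimal sketch of the forward pass these projections imply (a SwiGLU-style MLP). It assumes the per-rank fused output lays out the gate shard followed by the up shard, which is what self.split_size is used for; it is an illustrative sketch, not a verbatim copy of the library's forward:

    def forward(self, x):
        # One fused matmul yields the local shards of both projections.
        gate, up = self.gate_up_proj(x).split(self.split_size, dim=-1)
        # SwiGLU: activated gate times the up projection, then the
        # row-parallel down_proj reduces partial results across TP ranks.
        return self.down_proj(self.act_fn(gate) * up)

Fusing gate_proj and up_proj into one ColumnParallelLinear means both projections are computed with a single matmul over the same input, rather than two separate calls.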