backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py [283:408]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Starcoder2MLP(nn.Module):
    def __init__(self, prefix, config, weights, index):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Non-gated MLP: a single c_fc up-projection followed by c_proj (no gate/up fusion here)
        c_fc = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.c_fc",
            weights=weights,
            bias=config.use_bias,
        )
        c_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=config.use_bias,
        )

        self.c_fc = TensorParallelMultiAdapterLinear.load(
            c_fc,
            layer_id=index,
            layer_names=[f"{prefix}.c_fc"],
            sizes=[config.intermediate_size, config.intermediate_size],
            process_group=weights.process_group,
        )

        self.c_proj = TensorParallelAdapterRowLinear.load(
            c_proj,
            index,
            "c_proj",
            process_group=weights.process_group,
        )

    def forward(self, hidden_states, adapter_data):
        hidden_states = self.c_fc(hidden_states, adapter_data)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states, adapter_data)


class Starcoder2GatedMLP(nn.Module):
    def __init__(self, index, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"]
        sizes = [
            config.intermediate_size,
            config.intermediate_size,
        ]
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            weights=weights,
            dim=0,
            bias=config.use_bias,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            index,
            layer_names=prefixes,
            sizes=sizes,
            process_group=weights.process_group,
        )
        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=config.use_bias,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            index,
            "down_proj",
            process_group=weights.process_group,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
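        # gate_proj weights come first in the fused load (dim=0 concat), so after the
        # view below, [:, 0] is the gate projection and [:, 1] is the up projection.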
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


STARCODER2_NORMALIZATION_CLASSES = {
    "layer_norm": FastLayerNorm,
    "rms_norm": FastRMSNorm,
}

STARCODER2_MLP_CLASSES = {
    "default": Starcoder2MLP,
    "gated": Starcoder2GatedMLP,
}


class Starcoder2Layer(nn.Module):
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py [295:420]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Starcoder2MLP(nn.Module):
    def __init__(self, prefix, config, weights, index):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Non-gated MLP: a single c_fc up-projection followed by c_proj (no gate/up fusion here)
        c_fc = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.c_fc",
            weights=weights,
            bias=config.use_bias,
        )
        c_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=config.use_bias,
        )

        self.c_fc = TensorParallelMultiAdapterLinear.load(
            c_fc,
            layer_id=index,
            layer_names=[f"{prefix}.c_fc"],
            sizes=[config.intermediate_size, config.intermediate_size],
            process_group=weights.process_group,
        )

        self.c_proj = TensorParallelAdapterRowLinear.load(
            c_proj,
            index,
            "c_proj",
            process_group=weights.process_group,
        )

    def forward(self, hidden_states, adapter_data):
        hidden_states = self.c_fc(hidden_states, adapter_data)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states, adapter_data)


class Starcoder2GatedMLP(nn.Module):
    def __init__(self, index, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"]
        sizes = [
            config.intermediate_size,
            config.intermediate_size,
        ]
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            weights=weights,
            dim=0,
            bias=config.use_bias,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            index,
            layer_names=prefixes,
            sizes=sizes,
            process_group=weights.process_group,
        )
        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=config.use_bias,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            index,
            "down_proj",
            process_group=weights.process_group,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
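        # gate_proj weights come first in the fused load (dim=0 concat), so after the
        # view below, [:, 0] is the gate projection and [:, 1] is the up projection.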
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


STARCODER2_NORMALIZATION_CLASSES = {
    "layer_norm": FastLayerNorm,
    "rms_norm": FastRMSNorm,
}

STARCODER2_MLP_CLASSES = {
    "default": Starcoder2MLP,
    "gated": Starcoder2GatedMLP,
}


class Starcoder2Layer(nn.Module):
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
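
Starcoder2GatedMLP fuses gate_proj and up_proj into one column-parallel matmul and splits the result with a single view. The following is a minimal standalone sketch (plain PyTorch, not part of either file; the tanh-approximate GELU stands in for config.hidden_act and the toy dimensions are assumptions) showing why that split reproduces two separate projections:

import torch


def act(t):
    # Stand-in for the configured activation (assumed gelu_pytorch_tanh).
    return torch.nn.functional.gelu(t, approximate="tanh")


# Toy dimensions; the real model uses config.hidden_size / config.intermediate_size.
hidden_size, intermediate_size = 8, 16
x = torch.randn(4, hidden_size)
w_gate = torch.randn(intermediate_size, hidden_size)
w_up = torch.randn(intermediate_size, hidden_size)

# load_multi(..., dim=0) concatenates weights along the output dimension, so the
# fused output holds the gate activations first, then the up activations.
w_fused = torch.cat([w_gate, w_up], dim=0)
fused = (x @ w_fused.T).view(-1, 2, intermediate_size)

gated = act(fused[:, 0]) * fused[:, 1]
reference = act(x @ w_gate.T) * (x @ w_up.T)
assert torch.allclose(gated, reference, atol=1e-5)

Under tensor parallelism the same split still works per shard, because each rank holds its own slice of both gate_proj and up_proj and self.intermediate_size is already divided by the process-group size.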



