optimum/habana/transformers/models/deepseek_v2/modeling_deepseek_v2.py [667:715]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        else:
            final_hidden_states = torch.zeros(
                (batch * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
            )
            # changes to support hpu fused dynamic MoE op -- replacement for moe_infer()
            # loop through expert slices due to limits on max. experts supported by mixture_of_experts op
            for idx in range(self.expert_slice):
                experts_min = (self.ep_rank * self.experts_per_rank) + (self.expert_chunk * idx)
                experts_max = min((experts_min + self.expert_chunk), (self.ep_rank + 1) * self.experts_per_rank)
                experts_range = range(experts_min, experts_max)
                gate_proj_list = [self.experts[i].gate_proj.weight.squeeze() for i in experts_range]
                down_proj_list = [self.experts[i].down_proj.weight.squeeze() for i in experts_range]
                up_proj_list = [self.experts[i].up_proj.weight.squeeze() for i in experts_range]
                hidden_states_slice = torch.ops.hpu.mixture_of_experts(
                    hidden_states=hidden_states,
                    expert_routing_table=topk_idx,
                    router_weights=topk_weight,
                    w1=gate_proj_list,
                    w2=up_proj_list,
                    w3=down_proj_list,
                    permuted_weights=True,
                    activation="silu",
                    experts_min=experts_min,
                    experts_max=experts_max - 1,
                )
                final_hidden_states = final_hidden_states + hidden_states_slice
                htcore.mark_step()

            if self.ep_size > 1:
                final_hidden_states = _all_reduce(final_hidden_states)
            elif is_deepspeed_available():
                from deepspeed import comm as dist

                if dist.is_initialized():
                    dist.all_reduce(final_hidden_states, op=dist.ReduceOp.SUM)

            final_hidden_states = final_hidden_states.type(hidden_states.dtype)
            final_hidden_states = final_hidden_states.reshape(-1, sequence_length, hidden_dim)

        if self.config.n_shared_experts is not None:
            final_hidden_states = final_hidden_states + self.shared_experts(identity)

        return final_hidden_states


# Functional apis need to be wrapped in classes for quantization on hpu
class Matmul(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        return torch.matmul(x, y)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
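A minimal sketch of the expert-slice indexing used in the loop above, with hypothetical values for experts_per_rank, expert_chunk, expert_slice and ep_rank (in the modules these are instance attributes set elsewhere); it only reproduces the experts_min / experts_max arithmetic so the per-slice window handed to mixture_of_experts is easy to check:

# Sketch only: hypothetical configuration values, not taken from the model config.
experts_per_rank = 64   # experts owned by this expert-parallel rank (assumed)
expert_chunk = 16       # max experts handled per mixture_of_experts call (assumed)
expert_slice = (experts_per_rank + expert_chunk - 1) // expert_chunk
ep_rank = 1             # example rank

for idx in range(expert_slice):
    # Same arithmetic as the loop above: a half-open [experts_min, experts_max)
    # window, clamped to the experts owned by this rank.
    experts_min = (ep_rank * experts_per_rank) + (expert_chunk * idx)
    experts_max = min(experts_min + expert_chunk, (ep_rank + 1) * experts_per_rank)
    print(idx, experts_min, experts_max - 1)  # experts_max - 1 is passed to the op (inclusive)

With these example values the windows are [64, 79], [80, 95], [96, 111], [112, 127], covering the rank's 64 experts in four calls.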



optimum/habana/transformers/models/deepseek_v3/modeling_deepseek_v3.py [540:592]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        else:
            final_hidden_states = torch.zeros(
                (batch * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
            )
            # changes to support hpu fused dynamic MoE op -- replacement for moe_infer()
            # loop through expert slices due to limits on max. experts supported by mixture_of_experts op
            for idx in range(self.expert_slice):
                experts_min = (self.ep_rank * self.experts_per_rank) + (self.expert_chunk * idx)
                experts_max = min((experts_min + self.expert_chunk), (self.ep_rank + 1) * self.experts_per_rank)
                experts_range = range(experts_min, experts_max)
                gate_proj_list = [self.experts[i].gate_proj.weight.squeeze() for i in experts_range]
                down_proj_list = [self.experts[i].down_proj.weight.squeeze() for i in experts_range]
                up_proj_list = [self.experts[i].up_proj.weight.squeeze() for i in experts_range]

                hidden_states_slice = torch.ops.hpu.mixture_of_experts(
                    hidden_states=hidden_states,
                    expert_routing_table=topk_idx,
                    router_weights=topk_weight,
                    w1=gate_proj_list,
                    w2=up_proj_list,
                    w3=down_proj_list,
                    permuted_weights=True,
                    activation="silu",
                    experts_min=experts_min,
                    experts_max=experts_max - 1,
                )
                final_hidden_states = final_hidden_states + hidden_states_slice
                htcore.mark_step()

            if self.ep_size > 1:
                final_hidden_states = _all_reduce(final_hidden_states)
            elif is_deepspeed_available():
                from deepspeed import comm as dist

                if dist.is_initialized():
                    dist.all_reduce(final_hidden_states, op=dist.ReduceOp.SUM)

            final_hidden_states = final_hidden_states.type(hidden_states.dtype)
            final_hidden_states = final_hidden_states.reshape(-1, sequence_length, hidden_dim)

        if self.config.n_shared_experts is not None:
            final_hidden_states = final_hidden_states + self.shared_experts(identity)

        return final_hidden_states


# Functional apis need to be wrapped in classes for quantization on hpu
class Matmul(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        return torch.matmul(x, y)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
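The Matmul wrapper in both files turns the functional torch.matmul call into a module, which (per the comment in the deepseek_v3 file) is what quantization tooling on HPU can hook. A minimal usage sketch, with illustrative shapes only:

import torch

# Hypothetical usage of the Matmul wrapper defined above; shapes are illustrative.
matmul_qk = Matmul()                # module instance that quantization tools can patch
q = torch.randn(1, 8, 128, 64)      # (batch, heads, seq, head_dim)
k = torch.randn(1, 8, 64, 128)      # key already transposed for the score matmul
attn_scores = matmul_qk(q, k)       # equivalent to torch.matmul(q, k)
print(attn_scores.shape)            # torch.Size([1, 8, 128, 128])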



