in src/hyperpod_nemo_adapter/collections/model/nlp/custom_models/modeling_deepseek.py [0:0]
def forward(self, hidden_states):
bsz, seq_len, h = hidden_states.shape
### compute gating score
hidden_states = hidden_states.view(-1, h)
logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None)
if self.scoring_func == "sigmoid":
scores = logits.sigmoid()
else:
raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}")
### select top-k experts
if self.topk_method == "noaux_tc":
topk_idx = self.get_topk_indices(scores, bsz, seq_len)
topk_weight = scores.gather(1, topk_idx)
else:
raise NotImplementedError(f"insupportable TopK function for MoE gating: {self.topk_method}")
### norm gate to sum 1
if self.top_k > 1 and self.norm_topk_prob:
denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
topk_weight = topk_weight / denominator
topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
return topk_idx, topk_weight