in codes/attention.py [0:0]
import math

import torch
import torch.nn.functional as F

def compute_selfattention(transformer_encoder, x, i_layer, d_model, num_heads):
    # x: (batch, seq_len, d_model); assumes the encoder was built with batch_first=True
    self_attn = transformer_encoder.layers[i_layer].self_attn
    h = F.linear(x, self_attn.in_proj_weight, bias=self_attn.in_proj_bias)  # (batch, seq_len, 3*d_model)
    # in_proj_weight stacks W_q, W_k, W_v along dim 0, so split q/k/v before splitting heads
    q, k, _v = h.chunk(3, dim=-1)  # each (batch, seq_len, d_model); v is not needed for the weights
    head_dim = d_model // num_heads
    q = q.reshape(x.shape[0], x.shape[1], num_heads, head_dim).transpose(1, 2)  # (batch, heads, seq, head_dim)
    k = k.reshape(x.shape[0], x.shape[1], num_heads, head_dim).transpose(1, 2)
    attn_logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim)  # scaled dot products
    attn_probs = F.softmax(attn_logits, dim=-1)  # (batch, heads, seq_len, seq_len)
    return attn_probs
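
# Minimal usage sketch (illustrative, not part of the original file): the module sizes,
# batch shape, and i_layer below are assumptions chosen only to exercise the function.
if __name__ == "__main__":
    import torch.nn as nn

    d_model, num_heads = 16, 4
    encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, batch_first=True)
    transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

    x = torch.randn(2, 5, d_model)  # (batch=2, seq_len=5, d_model)
    attn = compute_selfattention(transformer_encoder, x, i_layer=0, d_model=d_model, num_heads=num_heads)
    print(attn.shape)  # torch.Size([2, 4, 5, 5]) -> (batch, num_heads, seq_len, seq_len)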