models/compressive.py [170:184]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        out = out.view(B, self.args.nheads, M, self.args.head_dim)  # B x K x M x D
        out = out.transpose(1, 2).contiguous()  # B x M x K x D
        out = out.view(B, M, -1)  # B x M x K_D
        out = self.proj_out(out)  # B x M x H
        return out, aux_loss


class TransformerSeqLayer(nn.Module):
    def __init__(self, args, layer_ind):
        super().__init__()
        self.args = args
        self.attn = MultiHeadSeqAttention(args)
        self.ff = FeedForwardLayer(args)
        self.norm1 = nn.LayerNorm(args.hid_sz)
        self.norm2 = nn.LayerNorm(args.hid_sz)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
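
Both excerpts end with the same head-merging sequence: the per-head attention outputs (stacked along the batch dimension) are reshaped to B x K x M x D, transposed so the K heads for each position sit together, flattened to B x M x (K*D), and mixed by proj_out back to the hidden size H. A minimal runnable sketch of that pattern, with dummy sizes and a stand-in Linear for proj_out (all names below are illustrative, not the repo's API):

import torch
import torch.nn as nn

B, M, nheads, head_dim = 2, 8, 4, 16        # batch, seq length, heads K, per-head dim D
hid_sz = nheads * head_dim                  # merged hidden size H = K * D

proj_out = nn.Linear(hid_sz, hid_sz)        # stand-in for self.proj_out

out = torch.randn(B * nheads, M, head_dim)  # per-head outputs stacked on the batch dim
out = out.view(B, nheads, M, head_dim)      # B x K x M x D
out = out.transpose(1, 2).contiguous()      # B x M x K x D: heads adjacent per position
out = out.view(B, M, -1)                    # B x M x (K * D): concatenate the heads
out = proj_out(out)                         # B x M x H: mix information across heads
assert out.shape == (B, M, hid_sz)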



models/transformer_seq.py [142:156]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        out = out.view(B, self.args.nheads, M, self.args.head_dim)  # B x K x M x D
        out = out.transpose(1, 2).contiguous()  # B x M x K x D
        out = out.view(B, M, -1)  # B x M x K_D
        out = self.proj_out(out)  # B x M x H
        return out, aux_loss


class TransformerSeqLayer(nn.Module):
    def __init__(self, args, layer_ind):
        super().__init__()
        self.args = args
        self.attn = MultiHeadSeqAttention(args)
        self.ff = FeedForwardLayer(args)
        self.norm1 = nn.LayerNorm(args.hid_sz)
        self.norm2 = nn.LayerNorm(args.hid_sz)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
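
The TransformerSeqLayer constructor is identical in models/compressive.py and models/transformer_seq.py. One way to remove the duplication is to hoist the shared __init__ into a common base class that both files subclass. A hedged sketch under that assumption; the stub attention/feed-forward modules, the BaseTransformerSeqLayer name, and keeping layer_ind as an attribute are inventions for illustration, not the repo's actual layout:

from types import SimpleNamespace

import torch.nn as nn

# Stand-ins so the sketch runs on its own; in the repo these would be the
# shared MultiHeadSeqAttention and FeedForwardLayer implementations.
class MultiHeadSeqAttention(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.proj = nn.Linear(args.hid_sz, args.hid_sz)

class FeedForwardLayer(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.fc = nn.Linear(args.hid_sz, args.hid_sz)

class BaseTransformerSeqLayer(nn.Module):
    """One shared constructor that both duplicated classes could inherit."""

    def __init__(self, args, layer_ind):
        super().__init__()
        self.args = args
        self.layer_ind = layer_ind  # kept for subclasses that need their depth
        self.attn = MultiHeadSeqAttention(args)
        self.ff = FeedForwardLayer(args)
        self.norm1 = nn.LayerNorm(args.hid_sz)
        self.norm2 = nn.LayerNorm(args.hid_sz)

# Usage with a minimal args object (hypothetical field values):
layer = BaseTransformerSeqLayer(SimpleNamespace(hid_sz=64), layer_ind=0)

Each file would then keep only its own forward (and any file-specific fields) in a thin subclass, so a change to the shared construction happens in one place.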



