def forward()

in code/src/model/attention.py


    def forward(self, encoded, y, attr, one_hot=False):
        """
        Input:
            - encoded: encoder output; encoded.dec_input is a FloatTensor of
              size (x_len_max, bs, emb_dim) with the latent sentence
              representations, and encoded.input_len is a LongTensor of
              size (bs,) with the source sentence lengths
            - y: LongTensor of size (slen, bs), target word indices,
              or FloatTensor of size (slen, bs, n_words), one-hot / soft
              word vectors (when one_hot is True)
            - attr: tensor of size (bs, n_attributes), attribute labels
        Output:
            - FloatTensor of size (slen, bs, n_words), the score of each
              vocabulary word at each position of each sentence
        """
        latent = encoded.dec_input
        x_len = encoded.input_len
        is_cuda = latent.is_cuda

        # check inputs
        assert x_len.size(0) == y.size(1)
        assert latent.size() == (x_len.max(), x_len.size(0), self.emb_dim)
        assert attr.size() == (x_len.size(0), len(self.attributes))

        # embeddings
        if one_hot:
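            # one-hot (or soft) word vectors: multiply by the embedding matrix
            # instead of doing an index lookup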
            y_len, bs, _ = y.size()
            embeddings = y.view(y_len * bs, self.n_words).mm(self.embeddings.weight)
            embeddings = embeddings.view(y_len, bs, self.emb_dim)
        else:
            y_len, bs = y.size()
            embeddings = self.embeddings(y)
        embeddings = embeddings.detach() if self.freeze_dec_emb else embeddings
        if self.bos_attr != '':
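            # overwrite the first (<BOS>) time step with an attribute-specific start embedding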
            embeddings[0] = self.get_bos_attr(attr)
        embeddings = F.dropout(embeddings, p=self.dropout, training=self.training)

        if self.input_feeding:
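            # input feeding: decode one step at a time so each LSTM step sees the
            # attention context computed from the previous hidden state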
            mask = get_mask(x_len, True, cuda=is_cuda) == 0  # attention mask
            h_c = None
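            # the first attention query is a zero hidden state; h_c=None lets
            # lstm1 initialize its own zero states on the first step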
            hidden_states = [latent.data.new(1, bs, self.hidden_dim).zero_()]
            attention_states = []

            for i in range(y_len):
                # attention layer
                attention = self.get_attention(latent, hidden_states[-1][0], embeddings[i], mask)
                attention_states.append(attention)

                # lstm step
                lstm_input = embeddings[i:i + 1]
                lstm_input = torch.cat([lstm_input, attention], 2)
                h_t, h_c = self.lstm1(lstm_input, h_c)
                assert h_t.size() == (1, bs, self.hidden_dim)
                hidden_states.append(h_t)

            # first layer LSTM output
            lstm_output = torch.cat(hidden_states[1:], 0)
            assert lstm_output.size() == (y_len, bs, self.hidden_dim)

            # lstm (layers > 1)
            if self.n_dec_layers > 1:
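                # remaining decoder layers run on top of the first layer's outputs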
                lstm_output = F.dropout(lstm_output, p=self.dropout, training=self.training)
                lstm_output, (_, _) = self.lstm2(lstm_output)
                assert lstm_output.size() == (y_len, bs, self.hidden_dim)

        else:
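            # no input feeding: run the first LSTM layer over the whole target
            # sequence, then compute attention for every time step in one batched call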
            # first LSTM layer
            lstm_output, (_, _) = self.lstm1(embeddings)
            assert lstm_output.size() == (y_len, bs, self.hidden_dim)

            # attention layer
            mask = get_mask(x_len, True, expand=int(y_len), batch_first=True, cuda=is_cuda).transpose(1, 2) == 0
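            # shift the decoder hidden states right by one (prepend zeros) so that
            # attention at step i is queried with the hidden state from step i - 1,
            # as in the input-feeding path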
            att_input = torch.cat([latent.data.new(1, bs, self.hidden_dim).zero_(), lstm_output[:-1]], 0)
            attention = self.get_full_attention(latent, att_input, embeddings, mask)
            assert attention.size() == (y_len, bs, self.emb_dim)

            # > 1 LSTM layers
            lstm_output = F.dropout(lstm_output, p=self.dropout, training=self.training)
            lstm_output = torch.cat([lstm_output, attention], 2)
            lstm_output, (_, _) = self.lstm2(lstm_output)
            assert lstm_output.size() == (y_len, bs, self.hidden_dim)

        # word scores
        output = F.dropout(lstm_output, p=self.dropout, training=self.training).view(-1, self.hidden_dim)
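        # optional intermediate projection (with ReLU) before the final vocabulary projection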
        if self.lstm_proj_layer is not None:
            output = F.relu(self.lstm_proj_layer(output))
        scores = self.proj(output).view(y_len, bs, self.n_words)
        if self.bias_attr != '':
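            # add an attribute-dependent bias to every word score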
            scores = scores + self.get_bias_attr(attr)[None]
        return scores
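
The one_hot branch above swaps the usual embedding lookup for a matrix multiplication
against the embedding weights. The snippet below is a minimal standalone sketch (not
part of the repository) showing that the two give identical results; the sizes slen,
bs, n_words and emb_dim are arbitrary stand-ins for the decoder's real dimensions.

    import torch
    import torch.nn.functional as F
    from torch import nn

    # Toy sizes (stand-ins for the decoder's real dimensions).
    slen, bs, n_words, emb_dim = 5, 2, 11, 8
    embeddings = nn.Embedding(n_words, emb_dim)

    # Regular path: index lookup on a (slen, bs) LongTensor of word indices.
    y = torch.randint(0, n_words, (slen, bs))
    looked_up = embeddings(y)                                # (slen, bs, emb_dim)

    # One-hot path: flatten to (slen * bs, n_words) and multiply by the
    # embedding weight matrix, as in the one_hot branch of forward().
    y_one_hot = F.one_hot(y, num_classes=n_words).float()    # (slen, bs, n_words)
    via_matmul = y_one_hot.view(slen * bs, n_words).mm(embeddings.weight)
    via_matmul = via_matmul.view(slen, bs, emb_dim)

    assert torch.allclose(looked_up, via_matmul)

Unlike an index lookup, the matrix-multiplication form also accepts soft distributions
over the vocabulary, which is presumably why the one_hot path exists.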