in code/src/model/attention.py [0:0]
def forward(self, encoded, y, attr, one_hot=False):
"""
Input:
- LongTensor of size (slen, bs), word indices
or
LongTensor of size (slen, bs, n_words), one-hot word embeddings
- LongTensor of size (bs,), sentence lengths
- FloatTensor of size (bs, hidden_dim), latent
state representing sentences
Output:
- FloatTensor of size (slen, bs, n_words),
representing the score of each word in each sentence
"""
    latent = encoded.dec_input
    x_len = encoded.input_len
    is_cuda = latent.is_cuda
    # check inputs
    assert x_len.size(0) == y.size(1)
    assert latent.size() == (x_len.max(), x_len.size(0), self.emb_dim)
    assert attr.size() == (x_len.size(0), len(self.attributes))
    # embeddings
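    # NOTE: with one_hot=True, y holds (possibly soft) one-hot word distributions;
    # multiplying them by the embedding matrix is equivalent to an embedding lookup
    # but stays differentiable with respect to y (presumably useful when feeding
    # generated or soft outputs back into the decoder)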
    if one_hot:
        y_len, bs, _ = y.size()
        embeddings = y.view(y_len * bs, self.n_words).mm(self.embeddings.weight)
        embeddings = embeddings.view(y_len, bs, self.emb_dim)
    else:
        y_len, bs = y.size()
        embeddings = self.embeddings(y)
    embeddings = embeddings.detach() if self.freeze_dec_emb else embeddings
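    # if attribute-specific start tokens are enabled, the embedding at the first
    # (BOS) position is replaced by an attribute-dependent embedding (get_bos_attr
    # presumably returns a tensor of size (bs, emb_dim) built from attr)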
    if self.bos_attr != '':
        embeddings[0] = self.get_bos_attr(attr)
    embeddings = F.dropout(embeddings, p=self.dropout, training=self.training)
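    # two decoding paths: with input feeding, attention is recomputed at every
    # target step from the previous decoder state; otherwise the first LSTM runs
    # over the whole target sequence and attention is computed for all steps at once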
    if self.input_feeding:
        mask = get_mask(x_len, True, cuda=is_cuda) == 0  # attention mask
        h_c = None
        hidden_states = [latent.data.new(1, bs, self.hidden_dim).zero_()]
        attention_states = []
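        # at each target step, attend over the encoder states using the previous
        # decoder hidden state and the current input embedding, then feed the
        # concatenation [embedding; context] into the first LSTM layer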
        for i in range(y_len):
            # attention layer
            attention = self.get_attention(latent, hidden_states[-1][0], embeddings[i], mask)
            attention_states.append(attention)
            # lstm step
            lstm_input = embeddings[i:i + 1]
            lstm_input = torch.cat([lstm_input, attention], 2)
            h_t, h_c = self.lstm1(lstm_input, h_c)
            assert h_t.size() == (1, bs, self.hidden_dim)
            hidden_states.append(h_t)
        # first layer LSTM output
        lstm_output = torch.cat(hidden_states[1:], 0)
        assert lstm_output.size() == (y_len, bs, self.hidden_dim)
        # lstm (layers > 1)
        if self.n_dec_layers > 1:
            lstm_output = F.dropout(lstm_output, p=self.dropout, training=self.training)
            lstm_output, (_, _) = self.lstm2(lstm_output)
            assert lstm_output.size() == (y_len, bs, self.hidden_dim)
    else:
        # first LSTM layer
        lstm_output, (_, _) = self.lstm1(embeddings)
        assert lstm_output.size() == (y_len, bs, self.hidden_dim)
        # attention layer
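        # attention for all target steps at once: the query at step i is the
        # layer-1 hidden state from step i - 1 (a zero state for the first step);
        # the mask presumably hides source positions beyond each sentence's length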
        mask = get_mask(x_len, True, expand=int(y_len), batch_first=True, cuda=is_cuda).transpose(1, 2) == 0
        att_input = torch.cat([latent.data.new(1, bs, self.hidden_dim).zero_(), lstm_output[:-1]], 0)
        attention = self.get_full_attention(latent, att_input, embeddings, mask)
        assert attention.size() == (y_len, bs, self.emb_dim)
        # > 1 LSTM layers
        lstm_output = F.dropout(lstm_output, p=self.dropout, training=self.training)
        lstm_output = torch.cat([lstm_output, attention], 2)
        lstm_output, (_, _) = self.lstm2(lstm_output)
        assert lstm_output.size() == (y_len, bs, self.hidden_dim)
    # word scores
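    # project the decoder states to vocabulary-sized scores (logits), optionally
    # going through an intermediate projection layer first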
    output = F.dropout(lstm_output, p=self.dropout, training=self.training).view(-1, self.hidden_dim)
    if self.lstm_proj_layer is not None:
        output = F.relu(self.lstm_proj_layer(output))
    scores = self.proj(output).view(y_len, bs, self.n_words)
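    # if attribute-specific biases are enabled, add a per-attribute bias over the
    # vocabulary (get_bias_attr presumably returns a tensor of size (bs, n_words),
    # broadcast over the time dimension by the [None] indexing)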
    if self.bias_attr != '':
        scores = scores + self.get_bias_attr(attr)[None]
    return scores
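
# Usage sketch: a minimal illustration of how this forward pass might be called,
# assuming `decoder` is an instance of this class and the encoder returns an object
# exposing `dec_input` and `input_len` as used above. All names and sizes below are
# hypothetical.
#
#   x_len_max, y_len, bs = 7, 5, 2
#   latent = torch.randn(x_len_max, bs, decoder.emb_dim)          # encoder states
#   x_len = torch.LongTensor([x_len_max, 4])                      # source lengths
#   encoded = EncodedHolder(dec_input=latent, input_len=x_len)    # hypothetical wrapper
#   y = torch.randint(decoder.n_words, (y_len, bs))               # target word indices
#   attr = torch.zeros(bs, len(decoder.attributes)).long()        # attribute values
#   scores = decoder(encoded, y, attr)                            # (y_len, bs, n_words)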