in pytorch_translate/rnn.py [0:0]
def forward_unprojected(self, input_tokens, encoder_out, incremental_state=None):
    if incremental_state is not None:
        input_tokens = input_tokens[:, -1:]
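    # input_tokens is batch-first: bsz x seqlen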
    bsz, seqlen = input_tokens.size()
    # get outputs from encoder
    (
        encoder_outs,
        final_hidden,
        final_cell,
        src_lengths,
        src_tokens,
        _,
    ) = encoder_out
    # embed tokens
    x = self.embed_tokens(input_tokens)
    x = F.dropout(x, p=self.dropout_in, training=self.training)
    # B x T x C -> T x B x C
    x = x.transpose(0, 1)
    # initialize previous states (or get from cache during incremental generation)
    cached_state = utils.get_incremental_state(
        self, incremental_state, "cached_state"
    )
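    # input_feed is the attention context vector fed back into the next time step (input feeding)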
    input_feed = None
    if cached_state is not None:
        prev_hiddens, prev_cells, input_feed = cached_state
    else:
        # first time step, initialize previous states
        init_prev_states = self._init_prev_states(encoder_out)
        prev_hiddens = []
        prev_cells = []
        # init_prev_states may or may not include initial attention context
        for (h, c) in zip(init_prev_states[0::2], init_prev_states[1::2]):
            prev_hiddens.append(h)
            prev_cells.append(c)
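        # when attention is used, start input feeding from the initial attention context expanded to the batch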
        if self.attention.context_dim:
            input_feed = self.initial_attn_context.expand(
                bsz, self.attention.context_dim
            )
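    # run the stacked recurrent layers over the target sequence, collecting per-step attention scores and outputs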
    attn_scores_per_step = []
    outs = []
    step_attn_scores = None
    for j in range(seqlen):
        # input feeding: concatenate context vector from previous time step
        step_input = maybe_cat((x[j, :, :], input_feed), dim=1)
        previous_layer_input = step_input
        for i, rnn in enumerate(self.layers):
            # recurrent cell
            hidden, cell = rnn(step_input, (prev_hiddens[i], prev_cells[i]))
            if self.first_layer_attention and i == 0:
                # tgt_len is 1 in decoder and squeezed for both matrices
                # input_feed.shape = tgt_len X bsz X embed_dim
                # step_attn_scores.shape = src_len X tgt_len X bsz
                input_feed, step_attn_scores = self.attention(
                    hidden, encoder_outs, src_lengths
                )
            # hidden state becomes the input to the next layer
            layer_output = F.dropout(
                hidden, p=self.dropout_out, training=self.training
            )
            if self.residual_level is not None and i >= self.residual_level:
                # TODO add an assert related to sizes here
                step_input = layer_output + previous_layer_input
            else:
                step_input = layer_output
            if self.first_layer_attention:
                step_input = maybe_cat((step_input, input_feed), dim=1)
            previous_layer_input = step_input
            # save state for next time step
            prev_hiddens[i] = hidden
            prev_cells[i] = cell
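        # standard attention: compute the context from the top layer's hidden state after the stack has run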
        if not self.first_layer_attention:
            input_feed, step_attn_scores = self.attention(
                hidden, encoder_outs, src_lengths
            )
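        # append this step's scores with a singleton tgt_len dimension: src_len x 1 x bsz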
        attn_scores_per_step.append(step_attn_scores.unsqueeze(1))
        attn_scores = torch.cat(attn_scores_per_step, dim=1)
        # srclen x tgtlen x bsz -> bsz x tgtlen x srclen
        attn_scores = attn_scores.transpose(0, 2)
        combined_output_and_context = maybe_cat((hidden, input_feed), dim=1)
        # save final output
        outs.append(combined_output_and_context)
    # cache previous states (no-op except during incremental generation)
    utils.set_incremental_state(
        self,
        incremental_state,
        "cached_state",
        (prev_hiddens, prev_cells, input_feed),
    )
    # collect outputs across time steps
    x = torch.cat(outs, dim=0).view(
        seqlen, bsz, self.combined_output_and_context_dim
    )
    # T x B x C -> B x T x C
    x = x.transpose(1, 0)
    # bottleneck layer
    if hasattr(self, "additional_fc"):
        x = self.additional_fc(x)
        x = F.dropout(x, p=self.dropout_out, training=self.training)
    return x, attn_scores