def forward_unprojected()

in pytorch_translate/rnn.py
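
This is the RNN decoder's forward pass minus the final vocabulary projection
(hence "unprojected"): it embeds the target-side tokens, runs them through the
stacked recurrent layers with attention and input feeding, and returns the
per-step hidden states (concatenated with the attention context) together with
the attention scores. During generation it is called one token at a time via
incremental_state.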


    def forward_unprojected(self, input_tokens, encoder_out, incremental_state=None):
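        # during incremental (step-by-step) generation, earlier positions are
        # cached, so only the newest token needs to be processed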
        if incremental_state is not None:
            input_tokens = input_tokens[:, -1:]
        bsz, seqlen = input_tokens.size()

        # get outputs from encoder
        (
            encoder_outs,
            final_hidden,
            final_cell,
            src_lengths,
            src_tokens,
            _,
        ) = encoder_out
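        # only encoder_outs and src_lengths are consumed directly below; the
        # full tuple is handed to _init_prev_states() for state initialization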

        # embed tokens
        x = self.embed_tokens(input_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)
        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # initialize previous states (or get from cache during incremental generation)
        cached_state = utils.get_incremental_state(
            self, incremental_state, "cached_state"
        )
        input_feed = None
        if cached_state is not None:
            prev_hiddens, prev_cells, input_feed = cached_state
        else:
            # first time step, initialize previous states
            init_prev_states = self._init_prev_states(encoder_out)
            prev_hiddens = []
            prev_cells = []

            # init_prev_states interleaves per-layer (hidden, cell) pairs and
            # may append an initial attention context; the even/odd slices
            # recover the pairs, and zip() drops any trailing context entry
            for (h, c) in zip(init_prev_states[0::2], init_prev_states[1::2]):
                prev_hiddens.append(h)
                prev_cells.append(c)
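            # seed input feeding with the initial attention context; input_feed
            # stays None when the attention module has no context dimension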
            if self.attention.context_dim:
                input_feed = self.initial_attn_context.expand(
                    bsz, self.attention.context_dim
                )

        attn_scores_per_step = []
        outs = []
        step_attn_scores = None
        for j in range(seqlen):
            # input feeding: concatenate context vector from previous time step
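            # (maybe_cat tolerates a None input_feed, in which case only the
            # bare token embedding is passed on)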
            step_input = maybe_cat((x[j, :, :], input_feed), dim=1)
            previous_layer_input = step_input
            for i, rnn in enumerate(self.layers):
                # recurrent cell: layer 0 consumes the step input built above;
                # deeper layers consume the output of the layer below
                hidden, cell = rnn(step_input, (prev_hiddens[i], prev_cells[i]))

                if self.first_layer_attention and i == 0:
                    # shapes before squeezing (tgt_len is 1 per decoding step
                    # and is squeezed out of both tensors):
                    #   input_feed: tgt_len x bsz x embed_dim
                    #   step_attn_scores: src_len x tgt_len x bsz
                    input_feed, step_attn_scores = self.attention(
                        hidden, encoder_outs, src_lengths
                    )
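                    # this fresh context is concatenated into every subsequent
                    # layer's input further down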

                # hidden state becomes the input to the next layer
                layer_output = F.dropout(
                    hidden, p=self.dropout_out, training=self.training
                )

                if self.residual_level is not None and i >= self.residual_level:
                    # TODO add an assert related to sizes here
                    step_input = layer_output + previous_layer_input
                else:
                    step_input = layer_output

                if self.first_layer_attention:
                    step_input = maybe_cat((step_input, input_feed), dim=1)
                previous_layer_input = step_input

                # save state for next time step
                prev_hiddens[i] = hidden
                prev_cells[i] = cell

            if not self.first_layer_attention:
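                # standard setup: attend from the top layer's hidden state once
                # per time step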
                input_feed, step_attn_scores = self.attention(
                    hidden, encoder_outs, src_lengths
                )

            attn_scores_per_step.append(step_attn_scores.unsqueeze(1))
            combined_output_and_context = maybe_cat((hidden, input_feed), dim=1)
            # save final output
            outs.append(combined_output_and_context)

        # stack attention scores across time steps
        attn_scores = torch.cat(attn_scores_per_step, dim=1)
        # srclen x tgtlen x bsz -> bsz x tgtlen x srclen
        attn_scores = attn_scores.transpose(0, 2)

        # cache previous states (no-op except during incremental generation)
        utils.set_incremental_state(
            self,
            incremental_state,
            "cached_state",
            (prev_hiddens, prev_cells, input_feed),
        )

        # collect outputs across time steps
        x = torch.cat(outs, dim=0).view(
            seqlen, bsz, self.combined_output_and_context_dim
        )

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # bottleneck layer
        if hasattr(self, "additional_fc"):
            x = self.additional_fc(x)
            x = F.dropout(x, p=self.dropout_out, training=self.training)
        return x, attn_scores
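
The helper maybe_cat is defined elsewhere in pytorch_translate and its
implementation is not shown here, but the call sites above (where input_feed
may be None) imply that it tolerates missing inputs. A minimal sketch under
that assumption, not the repo's actual code:

    import torch

    def maybe_cat(tensors, dim):
        # keep only the inputs that are actually present
        present = [t for t in tensors if t is not None]
        # a single surviving tensor is passed through unchanged
        if len(present) == 1:
            return present[0]
        return torch.cat(present, dim=dim)

For orientation, the two calling modes the method supports look roughly like
this (decoder, tgt_tokens, prev_tokens, and max_len are illustrative names,
not from the repo):

    # teacher-forced training/scoring: the whole target prefix at once
    out, attn_scores = decoder.forward_unprojected(tgt_tokens, encoder_out)

    # incremental generation: one token per call, sharing a cache dict so
    # hidden states and the input feed survive between steps
    incremental_state = {}
    for _ in range(max_len):
        out, attn_scores = decoder.forward_unprojected(
            prev_tokens, encoder_out, incremental_state=incremental_state
        )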