def forward()

in optimum/bettertransformer/models/encoder_models.py


    def forward(self, hidden_states, attention_mask, output_attentions: bool, position_bias=None, *_, **__):
        if output_attentions:
            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")

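        # The fused fastpath below only supports inference without autocast;
        # training or autocast falls through to the NotImplementedError branch below.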
        if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
            if not hasattr(hidden_states, "original_shape"):
                original_shape = hidden_states.shape
            else:
                original_shape = hidden_states.original_shape

            if hidden_states.is_nested:
                attention_mask = None

            if attention_mask is not None:
                # The attention mask comes in with values 0 and -inf. Convert it to a
                # torch.nn.TransformerEncoder-style bool mask:
                # 0 -> False -> keep this token; -inf -> True -> mask this token
                # (a standalone sketch of this conversion follows the function).
                attention_mask = attention_mask.bool()
                attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1]))

                # FSMT swaps the first two axes before calling the encoder stack
                # Reference: https://github.com/huggingface/transformers/blob/699e90437f984d69ad3c9b891dd2e9d0fc2cffe4/src/transformers/models/fsmt/modeling_fsmt.py#L508
                if hidden_states.shape[0] != attention_mask.shape[0]:
                    hidden_states = hidden_states.transpose(1, 0)
                    original_shape = hidden_states.shape

                hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask)
                attention_mask = None

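            # Run the whole encoder layer (self-attention, feed-forward and both layer
            # norms) in a single call to PyTorch's fused fastpath kernel.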
            hidden_states = torch._transformer_encoder_layer_fwd(
                hidden_states,
                self.embed_dim,
                self.num_heads,
                self.in_proj_weight,
                self.in_proj_bias,
                self.out_proj_weight,
                self.out_proj_bias,
                self.use_gelu,
                self.norm_first,
                self.norm1_eps,
                self.norm1_weight,
                self.norm1_bias,
                self.norm2_weight,
                self.norm2_bias,
                self.linear1_weight,
                self.linear1_bias,
                self.linear2_weight,
                self.linear2_bias,
                attention_mask,
            )

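            # Intermediate layers stash the padded shape on the (possibly nested) output
            # for the next layer; the last layer converts back to a padded dense tensor.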
            if not self.is_last_layer:
                hidden_states.original_shape = original_shape
            elif hidden_states.is_nested and self.is_last_layer:
                hidden_states = hidden_states.to_padded_tensor(0.0, original_shape)
        else:
            raise NotImplementedError(
                "Training and Autocast are not implemented for BetterTransformer + FSMT. Please open an issue."
            )

        return (hidden_states, attention_mask)
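
The mask handling above relies on PyTorch's private nested-tensor helpers. Below is a minimal, standalone sketch of that conversion, assuming PyTorch >= 1.13; torch._nested_tensor_from_mask and the nested-tensor layout are private/prototype APIs that may change between releases, and the tensor sizes here are made up for illustration.

import torch

batch, seq_len, hidden = 2, 4, 8
hidden_states = torch.randn(batch, seq_len, hidden)

# Additive mask as FSMT produces it: 0.0 for real tokens, -inf for padding
# (here the second sequence has two padded positions at the end).
additive_mask = torch.zeros(batch, seq_len)
additive_mask[1, 2:] = float("-inf")

# 0 -> False (keep), -inf -> True (masked), exactly as in the forward() above.
bool_mask = additive_mask.bool()

# The private helper expects True for *valid* positions, hence the inversion.
nested = torch._nested_tensor_from_mask(hidden_states, ~bool_mask)
print(nested.is_nested)  # True: each sequence keeps only its real tokens

# Back to a padded dense tensor, as done for the last encoder layer.
padded = nested.to_padded_tensor(0.0, hidden_states.shape)
print(padded.shape)  # torch.Size([2, 4, 8])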
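
For context, a hedged end-to-end usage sketch: the forward() above is reached after converting an FSMT model with optimum's BetterTransformer.transform and running inference in eval mode. This assumes an optimum version that still ships BetterTransformer support for FSMT; the checkpoint name and inputs are illustrative, not taken from this file.

import torch
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from optimum.bettertransformer import BetterTransformer

model_name = "facebook/wmt19-en-de"  # example FSMT checkpoint
tokenizer = FSMTTokenizer.from_pretrained(model_name)
model = FSMTForConditionalGeneration.from_pretrained(model_name)

# transform() swaps the FSMT encoder layers for their BetterTransformer
# counterparts, whose forward() is shown above.
model = BetterTransformer.transform(model)
model.eval()  # the fastpath only runs outside of training/autocast

inputs = tokenizer(["Machine learning is great.", "Hello!"], return_tensors="pt", padding=True)
with torch.inference_mode():
    generated = model.generate(**inputs)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))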