optimum/bettertransformer/models/encoder_models.py [1040:1070]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if output_attentions:
            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")

        if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
            attention_mask = None

            hidden_states = torch._transformer_encoder_layer_fwd(
                hidden_states,
                self.embed_dim,
                self.num_heads,
                self.in_proj_weight,
                self.in_proj_bias,
                self.out_proj_weight,
                self.out_proj_bias,
                self.use_gelu,
                self.norm_first,
                self.norm1_eps,
                self.norm1_weight,
                self.norm1_bias,
                self.norm2_weight,
                self.norm2_bias,
                self.linear1_weight,
                self.linear1_bias,
                self.linear2_weight,
                self.linear2_bias,
                attention_mask,
            )
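            # If the sequence was packed into a nested tensor (padding stripped
            # earlier in the stack), the last layer restores a dense, zero-padded
            # tensor for the modules that follow the encoder.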
            if hidden_states.is_nested and self.is_last_layer:
                hidden_states = hidden_states.to_padded_tensor(0.0)
        else:
            raise NotImplementedError(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
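The fast path above hands the whole encoder layer to PyTorch's fused torch._transformer_encoder_layer_fwd kernel, passing the packed QKV projection, output projection, layer-norm and feed-forward parameters that BetterTransformer stitched together from the original Hugging Face layer at conversion time. The following is a minimal sketch of what that call amounts to, not optimum's own code: it assumes a PyTorch build that still exposes this private op with the argument order shown above, and it borrows the weights of a stock nn.TransformerEncoderLayer (attribute names such as self_attn.in_proj_weight belong to that module, not to the classes in this file).

import torch
import torch.nn as nn

# Build a plain encoder layer whose weights we can feed to the fused op.
embed_dim, num_heads = 64, 4
layer = nn.TransformerEncoderLayer(
    d_model=embed_dim,
    nhead=num_heads,
    dim_feedforward=128,
    activation="gelu",
    batch_first=True,
    norm_first=False,
).eval()

x = torch.randn(2, 16, embed_dim)

with torch.no_grad():
    fused = torch._transformer_encoder_layer_fwd(
        x,
        embed_dim,
        num_heads,
        layer.self_attn.in_proj_weight,   # packed Q/K/V projection
        layer.self_attn.in_proj_bias,
        layer.self_attn.out_proj.weight,
        layer.self_attn.out_proj.bias,
        True,              # use_gelu
        False,             # norm_first
        layer.norm1.eps,
        layer.norm1.weight,
        layer.norm1.bias,
        layer.norm2.weight,
        layer.norm2.bias,
        layer.linear1.weight,
        layer.linear1.bias,
        layer.linear2.weight,
        layer.linear2.bias,
        None,              # attention mask, mirroring attention_mask = None above
    )
    eager = layer(x)

print(torch.allclose(fused, eager, atol=1e-4))

Passing None as the mask mirrors the attention_mask = None assignment in the excerpt; when a model does need padding, BetterTransformer handles it through nested tensors rather than an additive mask, which is why the last layer converts back with to_padded_tensor.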



optimum/bettertransformer/models/encoder_models.py [1163:1193]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if output_attentions:
            raise ValueError("output_attentions=True can not be supported with BetterTransformer.")

        if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled():
            attention_mask = None

            hidden_states = torch._transformer_encoder_layer_fwd(
                hidden_states,
                self.embed_dim,
                self.num_heads,
                self.in_proj_weight,
                self.in_proj_bias,
                self.out_proj_weight,
                self.out_proj_bias,
                self.use_gelu,
                self.norm_first,
                self.norm1_eps,
                self.norm1_weight,
                self.norm1_bias,
                self.norm2_weight,
                self.norm2_bias,
                self.linear1_weight,
                self.linear1_bias,
                self.linear2_weight,
                self.linear2_bias,
                attention_mask,
            )
            if hidden_states.is_nested and self.is_last_layer:
                hidden_states = hidden_states.to_padded_tensor(0.0)
        else:
            raise NotImplementedError(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
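This second excerpt is the same inference-only fast path as the first, duplicated in another layer class of the file; the condition shows the fused call only runs outside training and outside autocast, and the else branch raises NotImplementedError otherwise. From the user's side, these layers are reached by converting a supported transformers model. A minimal usage sketch, assuming optimum and transformers are installed; the bert-base-uncased checkpoint is purely illustrative and may not correspond to the exact classes shown here:

import torch
from transformers import AutoModel
from optimum.bettertransformer import BetterTransformer

# Illustrative checkpoint only; any encoder architecture supported by
# BetterTransformer is converted the same way.
model = AutoModel.from_pretrained("bert-base-uncased")
model = BetterTransformer.transform(model)
model.eval()  # the fused forward above only runs outside training/autocast

input_ids = torch.randint(0, model.config.vocab_size, (1, 16))
with torch.inference_mode():
    outputs = model(input_ids)
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 16, 768]) for bert-base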



