in optimum/bettertransformer/models/encoder_models.py [0:0]
def __init__(self, layer, config):
    r"""
    Build the `BetterTransformer` counterpart of a `CLIPEncoderLayer`.

    **Only valid for the vision model, which does not use `causal_attention_mask`.**

    Args:
        layer (`torch.nn.Module`):
            The original `CLIPEncoderLayer` from which the weights are retrieved.
        config:
            Passed unchanged to the parent initializer (`super().__init__(config)`).
    """
    super().__init__(config)
    super(BetterTransformerBaseLayer, self).__init__()

    attention = layer.self_attn

    # Fused in-projection: query, key and value are concatenated (in that order)
    # along the first dimension into a single weight and a single bias.
    qkv_projections = (attention.q_proj, attention.k_proj, attention.v_proj)
    self.in_proj_weight = nn.Parameter(torch.cat([proj.weight for proj in qkv_projections]))
    self.in_proj_bias = nn.Parameter(torch.cat([proj.bias for proj in qkv_projections]))

    # Attention output projection (the original tensors are re-bound, not copied)
    attention_output = attention.out_proj
    self.out_proj_weight = attention_output.weight
    self.out_proj_bias = attention_output.bias

    # Feed-forward block: first linear layer, then second linear layer
    feed_forward = layer.mlp
    self.linear1_weight = feed_forward.fc1.weight
    self.linear1_bias = feed_forward.fc1.bias
    self.linear2_weight = feed_forward.fc2.weight
    self.linear2_bias = feed_forward.fc2.bias

    # First layer norm
    first_norm = layer.layer_norm1
    self.norm1_eps = first_norm.eps
    self.norm1_weight = first_norm.weight
    self.norm1_bias = first_norm.bias

    # Second layer norm
    second_norm = layer.layer_norm2
    self.norm2_eps = second_norm.eps
    self.norm2_weight = second_norm.weight
    self.norm2_bias = second_norm.bias

    # Hyper-parameters taken from the attention module
    self.num_heads = attention.num_heads
    self.embed_dim = attention.embed_dim

    # Starts as `False`; switched to `True` on the last layer when the model is converted
    self.is_last_layer = False
    self.norm_first = True

    # Attribute name on this layer -> attribute path(s) on the original layer.
    # A list value means the attribute is built from several original tensors.
    self.original_layers_mapping = {
        "in_proj_weight": [
            "self_attn.q_proj.weight",
            "self_attn.k_proj.weight",
            "self_attn.v_proj.weight",
        ],
        "in_proj_bias": [
            "self_attn.q_proj.bias",
            "self_attn.k_proj.bias",
            "self_attn.v_proj.bias",
        ],
        "out_proj_weight": "self_attn.out_proj.weight",
        "out_proj_bias": "self_attn.out_proj.bias",
        "linear1_weight": "mlp.fc1.weight",
        "linear1_bias": "mlp.fc1.bias",
        "linear2_weight": "mlp.fc2.weight",
        "linear2_bias": "mlp.fc2.bias",
        "norm1_eps": "layer_norm1.eps",
        "norm1_weight": "layer_norm1.weight",
        "norm1_bias": "layer_norm1.bias",
        "norm2_eps": "layer_norm2.eps",
        "norm2_weight": "layer_norm2.weight",
        "norm2_bias": "layer_norm2.bias",
    }

    self.validate_bettertransformer()