in optimum/bettertransformer/models/encoder_models.py [0:0]
def __init__(self, vit_layer, config):
    r"""
    Build the `BetterTransformer` counterpart of a `ViTLayer` from the layer's existing weights.

    Args:
        vit_layer (`torch.nn.Module`):
            The original `ViTLayer` whose weights are retrieved.
        config:
            Forwarded unchanged to the base-class initializer.
    """
    super().__init__(config)
    # Also run the initializer of the class that follows `BetterTransformerBaseLayer` in the MRO
    # (presumably `nn.Module` -- confirm against the class definition) before any parameter is set.
    super(BetterTransformerBaseLayer, self).__init__()

    # Fused input projection: query, key and value are concatenated (in that order) into
    # freshly created parameters, weights first and biases second.
    self_attention = vit_layer.attention.attention
    qkv_projections = (self_attention.query, self_attention.key, self_attention.value)
    self.in_proj_weight = nn.Parameter(torch.cat([proj.weight for proj in qkv_projections]))
    self.in_proj_bias = nn.Parameter(torch.cat([proj.bias for proj in qkv_projections]))

    # Attention output projection (the original tensors are reused, not copied)
    attention_out = vit_layer.attention.output.dense
    self.out_proj_weight = attention_out.weight
    self.out_proj_bias = attention_out.bias

    # First feed-forward linear layer
    feed_forward_in = vit_layer.intermediate.dense
    self.linear1_weight = feed_forward_in.weight
    self.linear1_bias = feed_forward_in.bias

    # Second feed-forward linear layer
    feed_forward_out = vit_layer.output.dense
    self.linear2_weight = feed_forward_out.weight
    self.linear2_bias = feed_forward_out.bias

    # First layer norm <- `layernorm_before`
    pre_norm = vit_layer.layernorm_before
    self.norm1_eps = pre_norm.eps
    self.norm1_weight = pre_norm.weight
    self.norm1_bias = pre_norm.bias

    # Second layer norm <- `layernorm_after`
    post_norm = vit_layer.layernorm_after
    self.norm2_eps = post_norm.eps
    self.norm2_weight = post_norm.weight
    self.norm2_bias = post_norm.bias

    # Hyper parameters: the embedding size is derived as head size * number of heads
    head_count = self_attention.num_attention_heads
    self.num_heads = head_count
    self.embed_dim = int(self_attention.attention_head_size * head_count)

    # Starts as `False`; this will be set to `True` on the last layer when converting the model
    self.is_last_layer = False
    self.norm_first = True

    # For every attribute populated above, the dotted path(s) inside the original layer
    # that the tensor(s) came from. List entries follow the concatenation order used above.
    self.original_layers_mapping = {
        "in_proj_weight": [
            "attention.attention.query.weight",
            "attention.attention.key.weight",
            "attention.attention.value.weight",
        ],
        "in_proj_bias": [
            "attention.attention.query.bias",
            "attention.attention.key.bias",
            "attention.attention.value.bias",
        ],
        "out_proj_weight": "attention.output.dense.weight",
        "out_proj_bias": "attention.output.dense.bias",
        "linear1_weight": "intermediate.dense.weight",
        "linear1_bias": "intermediate.dense.bias",
        "linear2_weight": "output.dense.weight",
        "linear2_bias": "output.dense.bias",
        "norm1_weight": "layernorm_before.weight",
        "norm1_bias": "layernorm_before.bias",
        "norm2_weight": "layernorm_after.weight",
        "norm2_bias": "layernorm_after.bias",
    }

    self.validate_bettertransformer()