in optimum/bettertransformer/models/encoder_models.py [0:0]
def __init__(self, bert_layer, config):
    r"""
    Build the `BetterTransformer` counterpart of a Distil-BERT layer from an existing layer.

    The query, key and value projections found under `bert_layer.attention` are concatenated into a
    single `in_proj_weight` / `in_proj_bias` pair wrapped in new `nn.Parameter` objects. All other
    weights, biases and layer-norm epsilons are assigned directly from `bert_layer` (no copy is made
    in this method).

    Args:
        bert_layer (`torch.nn.Module`):
            The original Distil-BERT layer whose weights are retrieved.
        config:
            Model configuration. It is forwarded to the parent initializer, and this method reads
            `attention_dropout`, `dropout`, `dim` and `n_heads` from it.
    """
    super().__init__(config)
    # Second initializer call: runs `__init__` of the class that follows
    # `BetterTransformerBaseLayer` in the MRO (presumably `torch.nn.Module` -- confirm
    # against the class header, which is outside this view).
    super(BetterTransformerBaseLayer, self).__init__()

    # Fused input projection: q, k, v stacked in that order (`torch.cat` default dim 0).
    attention = bert_layer.attention
    qkv_projections = (attention.q_lin, attention.k_lin, attention.v_lin)
    self.in_proj_weight = nn.Parameter(torch.cat([proj.weight for proj in qkv_projections]))
    self.in_proj_bias = nn.Parameter(torch.cat([proj.bias for proj in qkv_projections]))

    # Attention output projection
    out_projection = attention.out_lin
    self.out_proj_weight = out_projection.weight
    self.out_proj_bias = out_projection.bias

    # Feed-forward network: first linear layer, then second linear layer
    feed_forward = bert_layer.ffn
    self.linear1_weight = feed_forward.lin1.weight
    self.linear1_bias = feed_forward.lin1.bias
    self.linear2_weight = feed_forward.lin2.weight
    self.linear2_bias = feed_forward.lin2.bias

    # Layer norm 1 comes from `sa_layer_norm`
    first_norm = bert_layer.sa_layer_norm
    self.norm1_eps = first_norm.eps
    self.norm1_weight = first_norm.weight
    self.norm1_bias = first_norm.bias

    # Layer norm 2 comes from `output_layer_norm`
    second_norm = bert_layer.output_layer_norm
    self.norm2_eps = second_norm.eps
    self.norm2_weight = second_norm.weight
    self.norm2_bias = second_norm.bias

    # Attention hyper-parameters, read from the attention sub-module itself
    self.num_heads = attention.n_heads
    self.embed_dim = attention.dim

    # Starts as `False`; expected to be switched to `True` by the model conversion code
    # for the final layer.
    self.is_last_layer = False

    # For every weight attribute set above: the dotted path (or list of paths, for the
    # fused q/k/v tensors) of its source inside `bert_layer`.
    self.original_layers_mapping = {
        "in_proj_weight": [
            "attention.q_lin.weight",
            "attention.k_lin.weight",
            "attention.v_lin.weight",
        ],
        "in_proj_bias": [
            "attention.q_lin.bias",
            "attention.k_lin.bias",
            "attention.v_lin.bias",
        ],
        "out_proj_weight": "attention.out_lin.weight",
        "out_proj_bias": "attention.out_lin.bias",
        "linear1_weight": "ffn.lin1.weight",
        "linear1_bias": "ffn.lin1.bias",
        "linear2_weight": "ffn.lin2.weight",
        "linear2_bias": "ffn.lin2.bias",
        "norm1_weight": "sa_layer_norm.weight",
        "norm1_bias": "sa_layer_norm.bias",
        "norm2_weight": "output_layer_norm.weight",
        "norm2_bias": "output_layer_norm.bias",
    }

    # Values taken from the configuration rather than from the layer
    self.attention_dropout = config.attention_dropout
    self.dropout = config.dropout
    self.attention_head_size = config.dim // config.n_heads

    # NOTE(review): `self.act_fn` is not assigned in this method; presumably the parent
    # initializer sets it from `config` -- confirm.
    self.act_fn_callable = ACT2FN[self.act_fn]

    self.validate_bettertransformer()