optimum/bettertransformer/models/decoder_models.py
def __init__(self, layer: "nn.Module", config: "PretrainedConfig"):
    super().__init__(config)

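    # Instantiate the Transformers parent class on the meta device so no real weight
    # tensors are allocated; the actual parameters are borrowed from `layer` below.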
    with torch.device("meta"):
        super(BetterTransformerBaseLayer, self).__init__(config)

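    # Parameters and buffers taken over from the original attention layer.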
    submodules = [
        "k_proj",
        "v_proj",
        "q_proj",
        "out_proj",
        "attn_dropout",
        "resid_dropout",
        "scale_attn",
    ]

    # Attribute only for transformers>=4.28
    if hasattr(layer, "embed_positions"):
        submodules.append("embed_positions")

    # Attribute only for transformers<4.45
    if hasattr(layer, "bias"):
        submodules.append("bias")
    if hasattr(layer, "masked_bias"):
        submodules.append("masked_bias")

    # Attribute only for transformers>=4.45
    if hasattr(layer, "layer_idx"):
        submodules.append("layer_idx")

    for attr in submodules:
        setattr(self, attr, getattr(layer, attr))

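    # `original_layers_mapping` records which attributes were taken from the original
    # layer so the conversion can later be undone (BetterTransformer.reverse).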
    self.module_mapping = None
    self.original_layers_mapping = {submodule: submodule for submodule in submodules}

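    # `downcast_qk` tells the wrapped SDPA call to cast query/key back to the value
    # dtype (GPT-J keeps them in float32); the attention dropout probability is
    # forwarded to torch.nn.functional.scaled_dot_product_attention.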
    self.downcast_qk = True
    self.dropout_prob_attn = config.attn_pdrop
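
For context, a minimal usage sketch of how this constructor gets exercised: BetterTransformer.transform replaces each attention module of a loaded model with the wrapper above, and BetterTransformer.reverse restores the originals. The checkpoint name below is an illustrative assumption, not taken from this file.

import torch
from transformers import AutoModelForCausalLM
from optimum.bettertransformer import BetterTransformer

# Load a GPT-J-style model (illustrative checkpoint name).
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)

# Each attention layer is wrapped via the __init__ shown above.
bt_model = BetterTransformer.transform(model)

# ... run inference with bt_model ...

# Restore the original Transformers modules.
model = BetterTransformer.reverse(bt_model)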