in opacus_lab/models/GPT2/model/attention.py [0:0]
def __init__(self, dropout: float = 0.1, max_position_embeddings: int = 1024):
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    # Store max_position_embeddings and register the causal-mask ("bias") and
    # "masked_bias" buffers, copied from Huggingface's implementation
    # (see the causal_masking routine).
    self.mpe = max_position_embeddings
    self.register_buffer(
        "bias",
        # Lower-triangular matrix of ones: position i may attend to position j
        # only if j <= i.
        torch.tril(torch.ones((self.mpe, self.mpe), dtype=torch.uint8)).view(
            1, 1, self.mpe, self.mpe
        ),
    )
    # Large negative value substituted at masked positions before the softmax.
    self.register_buffer("masked_bias", torch.tensor(-1e4))