in src/mlm/models/gpt2.py [0:0]
def __init__(self, units, vocab_size, max_length, num_layers, num_heads, dropout=0.0,
             prefix=None, params=None):
    super(GPT2Model, self).__init__(prefix=prefix, params=params)
    self._units = units
    self._max_length = max_length
    self._num_layers = num_layers
    self._num_heads = num_heads
    with self.name_scope():
        # Learned positional and token embeddings.
        self._pos_embed = nn.Embedding(input_dim=max_length, output_dim=units,
                                       weight_initializer=mx.init.Normal(0.01),
                                       prefix='pos_embed_')
        self._embed = nn.Embedding(input_dim=vocab_size, output_dim=units, prefix='embed_',
                                   weight_initializer=mx.init.Normal(0.02))
        # Output projection shares its weight with the token embedding (weight tying).
        self._logits_proj = nn.Dense(units=vocab_size, in_units=units, use_bias=False,
                                     flatten=False, params=self._embed.params)
        # Per-layer containers: self-attention, feed-forward, and their LayerNorms.
        self._self_attention_layers = nn.Sequential()
        self._ffn_layers = nn.HybridSequential()
        self._attn_ln = nn.HybridSequential()
        self._ffn_ln = nn.HybridSequential()
        for i in range(num_layers):
            self._self_attention_layers.add(GPT2SelfAttentionLayer(
                units=units, num_heads=num_heads, dropout=dropout,
                prefix='self_attn{}_'.format(i)))
            self._ffn_layers.add(GPT2FFNLayer(
                units=units, hidden_size=units * 4, prefix='ffn{}_'.format(i)))
            self._attn_ln.add(nn.LayerNorm(prefix='attn_ln{}_'.format(i)))
            self._ffn_ln.add(nn.LayerNorm(prefix='ffn_ln{}_'.format(i)))
        # Single final LayerNorm applied after the last block; its prefix reuses
        # the loop variable i (== num_layers - 1) left over from the loop above.
        self._final_ln = nn.LayerNorm(prefix='final_ln{}_'.format(i))
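
A minimal usage sketch for the constructor above. The import path, the device context, and the hyperparameter values (the standard GPT-2 "small" configuration) are illustrative assumptions, not taken from this repository's configs or entry points.

import mxnet as mx
from mlm.models.gpt2 import GPT2Model  # hypothetical import path for this module

# GPT-2 "small"-sized configuration, shown for illustration only.
model = GPT2Model(units=768, vocab_size=50257, max_length=1024,
                  num_layers=12, num_heads=12, dropout=0.1)

# Parameters constructed with an explicit weight_initializer (the two embeddings)
# keep it; all remaining parameters fall back to the global initializer given here.
model.initialize(init=mx.init.Xavier(), ctx=mx.cpu())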