in ocr/utils/encoder_decoder.py
def __init__(self, attention_cell='multi_head', num_layers=2,
             units=512, hidden_size=2048, max_length=50,
             num_heads=4, scaled=True, dropout=0.0,
             use_residual=True, output_attention=False,
             weight_initializer=None, bias_initializer='zeros',
             prefix=None, params=None):
    super(TransformerEncoder, self).__init__(prefix=prefix, params=params)
    assert units % num_heads == 0, \
        'In TransformerEncoder, units must be divisible by the number of ' \
        'heads. Received units={}, num_heads={}'.format(units, num_heads)
    # Store hyper-parameters for use in the forward pass.
    self._num_layers = num_layers
    self._max_length = max_length
    self._num_heads = num_heads
    self._units = units
    self._hidden_size = hidden_size
    self._output_attention = output_attention
    self._dropout = dropout
    self._use_residual = use_residual
    self._scaled = scaled
    with self.name_scope():
        self.dropout_layer = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm()
        # Position-encoding table kept as a non-trainable constant parameter.
        self.position_weight = self.params.get_constant(
            'const', _position_encoding_init(max_length, units))
        # Stack of identical encoder cells (self-attention + feed-forward).
        self.transformer_cells = nn.HybridSequential()
        for i in range(num_layers):
            self.transformer_cells.add(
                TransformerEncoderCell(
                    units=units,
                    hidden_size=hidden_size,
                    num_heads=num_heads,
                    attention_cell=attention_cell,
                    weight_initializer=weight_initializer,
                    bias_initializer=bias_initializer,
                    dropout=dropout,
                    use_residual=use_residual,
                    scaled=scaled,
                    output_attention=output_attention,
                    prefix='transformer%d_' % i))
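
For context, a minimal usage sketch follows. Since the excerpt above only shows the constructor, the call convention is an assumption: it mirrors the GluonNLP transformer encoder this code is based on, where the encoder is a Gluon HybridBlock called with a (batch, length, units) tensor and an optional per-sample valid_length, returning (outputs, additional_outputs). Shapes and hyper-parameter values below are purely illustrative.

import mxnet as mx

# Assumed usage: instantiate, initialize parameters, and encode a batch of
# feature sequences. The valid_length argument and the (outputs, extras)
# return value follow the GluonNLP convention and are not shown in the
# excerpt above.
encoder = TransformerEncoder(num_layers=2, units=512, num_heads=4,
                             hidden_size=2048, max_length=50, dropout=0.1)
encoder.initialize(mx.init.Xavier())

features = mx.nd.random.uniform(shape=(8, 32, 512))   # batch=8, seq_len=32
valid_length = mx.nd.full((8,), 32)                    # all 32 steps are real
outputs, _ = encoder(features, valid_length=valid_length)  # -> (8, 32, 512)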