in ocr/utils/encoder_decoder.py [0:0]
    def __init__(self, attention_cell='multi_head', num_layers=2,
                 units=128, hidden_size=2048, max_length=50,
                 num_heads=4, scaled=True, dropout=0.0,
                 use_residual=True, output_attention=False,
                 weight_initializer=None, bias_initializer='zeros',
                 prefix=None, params=None):
        super(TransformerDecoder, self).__init__(prefix=prefix, params=params)
        assert units % num_heads == 0, 'In TransformerDecoder, the units should be divided ' \
                                       'exactly by the number of heads. Received units={}, ' \
                                       'num_heads={}'.format(units, num_heads)
        self._num_layers = num_layers
        self._units = units
        self._hidden_size = hidden_size
        self._num_states = num_heads  # stores the number of attention heads
        self._max_length = max_length
        self._dropout = dropout
        self._use_residual = use_residual
        self._output_attention = output_attention
        self._scaled = scaled
        with self.name_scope():
            self.dropout_layer = nn.Dropout(dropout)
            self.layer_norm = nn.LayerNorm()
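            # Constant (non-trainable) position-encoding table covering positions 0..max_length-1.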
            self.position_weight = self.params.get_constant('const',
                                                            _position_encoding_init(max_length,
                                                                                    units))
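            # Stack of num_layers decoder cells; each cell applies attention followed by a feed-forward block.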
            self.transformer_cells = nn.HybridSequential()
            for i in range(num_layers):
                self.transformer_cells.add(
                    TransformerDecoderCell(
                        units=units,
                        hidden_size=hidden_size,
                        num_heads=num_heads,
                        attention_cell=attention_cell,
                        weight_initializer=weight_initializer,
                        bias_initializer=bias_initializer,
                        dropout=dropout,
                        scaled=scaled,
                        use_residual=use_residual,
                        output_attention=output_attention,
                        prefix='transformer%d_' % i))
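
# --- Usage sketch (illustrative only, not part of the original file) ---
# Assumes MXNet/Gluon is installed and that TransformerDecoderCell and
# _position_encoding_init are defined earlier in encoder_decoder.py; the
# hyperparameter values below are arbitrary examples, not project defaults.
import mxnet as mx

decoder = TransformerDecoder(num_layers=2, units=128, hidden_size=2048,
                             max_length=50, num_heads=4, dropout=0.1,
                             prefix='transformer_decoder_')
decoder.initialize(init=mx.init.Xavier(), ctx=mx.cpu())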