TransformerDecoderCell.__init__ in ocr/utils/encoder_decoder.py
def __init__(self, attention_cell='multi_head', units=128,
             hidden_size=512, num_heads=4, scaled=True,
             dropout=0.0, use_residual=True, output_attention=False,
             weight_initializer=None, bias_initializer='zeros',
             prefix=None, params=None):
    super(TransformerDecoderCell, self).__init__(prefix=prefix, params=params)
    self._units = units
    self._num_heads = num_heads
    self._dropout = dropout
    self._use_residual = use_residual
    self._output_attention = output_attention
    self._scaled = scaled
    with self.name_scope():
        self.dropout_layer = nn.Dropout(dropout)
        # Multi-head self-attention over the decoder inputs.
        self.attention_cell_in = _get_attention_cell(attention_cell,
                                                     units=units,
                                                     num_heads=num_heads,
                                                     scaled=scaled,
                                                     dropout=dropout)
        # Encoder-decoder (inter) attention over the encoder memory.
        self.attention_cell_inter = _get_attention_cell(attention_cell,
                                                        units=units,
                                                        num_heads=num_heads,
                                                        scaled=scaled,
                                                        dropout=dropout)
        # Linear projections applied to each attention output before the
        # residual connection and layer normalization.
        self.proj_in = nn.Dense(units=units, flatten=False,
                                use_bias=False,
                                weight_initializer=weight_initializer,
                                bias_initializer=bias_initializer,
                                prefix='proj_in_')
        self.proj_inter = nn.Dense(units=units, flatten=False,
                                   use_bias=False,
                                   weight_initializer=weight_initializer,
                                   bias_initializer=bias_initializer,
                                   prefix='proj_inter_')
        # Position-wise feed-forward sub-layer.
        self.ffn = PositionwiseFFN(hidden_size=hidden_size,
                                   units=units,
                                   use_residual=use_residual,
                                   dropout=dropout,
                                   weight_initializer=weight_initializer,
                                   bias_initializer=bias_initializer)
        # Layer normalization after the self- and inter-attention sub-layers.
        self.layer_norm_in = nn.LayerNorm()
        self.layer_norm_inter = nn.LayerNorm()
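
This constructor wires up the three standard transformer decoder sub-layers: self-attention over the decoder inputs (attention_cell_in), encoder-decoder attention over the memory (attention_cell_inter), and a position-wise feed-forward network (ffn), with dropout, optional residual connections, and layer normalization around each. A minimal usage sketch follows; the import path and the (inputs, mem_value) -> (outputs, additional_outputs) call signature are assumptions based on the GluonNLP TransformerDecoderCell this code mirrors, and the shapes are illustrative only.

import mxnet as mx
from ocr.utils.encoder_decoder import TransformerDecoderCell  # assumed import path

# Build and initialize the cell with the defaults shown above.
cell = TransformerDecoderCell(units=128, hidden_size=512, num_heads=4,
                              dropout=0.1)
cell.initialize(ctx=mx.cpu())

# Illustrative shapes: (batch, tgt_len, units) decoder inputs and
# (batch, src_len, units) encoder memory.
tgt = mx.nd.random.uniform(shape=(2, 10, 128))
mem = mx.nd.random.uniform(shape=(2, 30, 128))

# Assumed GluonNLP-style call: returns the transformed inputs plus any
# additional outputs (e.g. attention weights when output_attention=True).
outputs, additional = cell(tgt, mem)
print(outputs.shape)  # (2, 10, 128)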