in pytorch_translate/transformer.py
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout
    self.decoder_layerdrop = 0
    if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
        self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed
    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
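    # Positional embeddings for up to 1024 positions; learned if
    # --decoder-learned-pos is set, sinusoidal otherwise.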
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx, learned=args.decoder_learned_pos
    )
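    # Decoder layer type: Average Attention Network (AAN) layers when --aan is
    # set, otherwise standard transformer decoder layers.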
    self.aan = args.aan
    decoder_layer_class = (
        AANDecoderLayer if self.aan else fairseq_transformer.TransformerDecoderLayer
    )
    self.layers = nn.ModuleList([])
    self.layers.extend(
        [decoder_layer_class(args) for i in range(args.decoder_layers)]
    )
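    # If only a subset of decoder layers should be kept (e.g. for
    # LayerDrop-style pruning), map each kept layer id to its new index.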
if hasattr(args, "decoder_layers_to_keep") and args.decoder_layers_to_keep:
layers_to_keep = sorted(
int(x) for x in args.decoder_layers_to_keep.split(",")
)
self.decoder_layers_to_keep = {
layer_id: layer_idx for layer_idx, layer_id in enumerate(layers_to_keep)
}
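    # Optional bottleneck that projects decoder states from embed_dim down to
    # --decoder-out-embed-dim before the output layer; incompatible with any
    # form of embedding sharing.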
    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim
        )
        out_embed_dim = args.decoder_out_embed_dim
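    # Output layer: adaptive softmax for large vocabularies, otherwise a
    # dedicated output embedding matrix unless input/output embeddings are shared.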
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim ** -0.5)
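    # Optional vocabulary reduction: restrict the set of candidate output
    # tokens for each batch (derived from the source words); incompatible with
    # adaptive softmax, as asserted below.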
    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16
        )
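    # Flag flipped when the decoder is traced/exported to ONNX.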
    self.onnx_trace = False
    # Use quantizable nn.Linear for output projection instead of F.linear
    self.output_projection = None
    if self.vocab_reduction_module is None:
        if self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1], self.embed_tokens.weight.shape[0]
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.embed_out.shape[1], self.embed_out.shape[0]
            )
            self.output_projection.weight = self.embed_out
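
# Minimal standalone sketch (not part of the module above) of the weight-tying
# pattern used for output_projection when share_input_output_embed is set: the
# output nn.Linear reuses the token-embedding matrix as its weight, so logits
# are computed against the same parameters as the input embedding. The toy
# sizes and variable names below are assumptions for illustration only.
import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 16                      # toy sizes for illustration
embed_tokens = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
output_projection = nn.Linear(embed_dim, vocab_size)  # weight shape: (vocab_size, embed_dim)
output_projection.weight = embed_tokens.weight        # tie: both modules share one Parameter

decoder_states = torch.randn(2, 5, embed_dim)         # (batch, tgt_len, embed_dim)
logits = output_projection(decoder_states)            # (batch, tgt_len, vocab_size)
assert logits.shape == (2, 5, vocab_size)
assert output_projection.weight.data_ptr() == embed_tokens.weight.data_ptr()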