def __init__()

in pytorch_translate/transformer.py


    def __init__(self, args, src_dict, dst_dict, embed_tokens):
        super().__init__(dst_dict)
        self.dropout = args.dropout
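        # LayerDrop rate: probability of dropping a whole decoder layer during
        # training; defaults to 0 (disabled) when the flag is absent from args.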
        self.decoder_layerdrop = 0
        if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
            self.decoder_layerdrop = args.decoder_layerdrop

        self.share_input_output_embed = args.share_decoder_input_output_embed

        embed_dim = embed_tokens.embedding_dim
        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
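        # Token embeddings are scaled by sqrt(embed_dim); positional embeddings
        # cover up to 1024 positions and are learned when
        # --decoder-learned-pos is set, sinusoidal otherwise.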
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = fairseq_transformer.PositionalEmbedding(
            1024, embed_dim, padding_idx, learned=args.decoder_learned_pos
        )

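        # Build decoder layers: Average Attention Network (AAN) layers when
        # args.aan is set, standard Transformer decoder layers otherwise.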
        self.aan = args.aan
        decoder_layer_class = (
            AANDecoderLayer if self.aan else fairseq_transformer.TransformerDecoderLayer
        )

        self.layers = nn.ModuleList(
            [decoder_layer_class(args) for _ in range(args.decoder_layers)]
        )
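        # Optionally keep only a comma-separated subset of decoder layers,
        # mapping each kept layer id to its compacted index.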
        if hasattr(args, "decoder_layers_to_keep") and args.decoder_layers_to_keep:
            layers_to_keep = sorted(
                int(x) for x in args.decoder_layers_to_keep.split(",")
            )
            self.decoder_layers_to_keep = {
                layer_id: layer_idx for layer_idx, layer_id in enumerate(layers_to_keep)
            }

        self.adaptive_softmax = None

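        # Optional bottleneck projecting the decoder output from embed_dim to
        # --decoder-out-embed-dim; incompatible with shared output embeddings.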
        self.bottleneck_layer = None
        out_embed_dim = embed_dim
        if args.decoder_out_embed_dim is not None:
            assert (
                not args.share_all_embeddings
                and not args.share_decoder_input_output_embed
            ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
            self.bottleneck_layer = fairseq_transformer.Linear(
                embed_dim, args.decoder_out_embed_dim
            )
            out_embed_dim = args.decoder_out_embed_dim

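        # Output layer: adaptive softmax if a cutoff is given; otherwise either
        # reuse the input embedding weights (when sharing is enabled) or create
        # a separate output embedding matrix.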
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dst_dict),
                out_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dst_dict), out_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim ** -0.5)

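        # Optional vocabulary reduction restricts the output vocabulary based on
        # the source/target dictionaries; incompatible with adaptive softmax.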
        self.vocab_reduction_module = None
        if args.vocab_reduction_params:
            assert (
                self.adaptive_softmax is None
            ), "vocabulary reduction not compatible with adaptive softmax!"
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16
            )

        self.onnx_trace = False

        # Use quantizable nn.Linear for output projection instead of F.linear;
        # bias=False keeps it equivalent to the bias-free F.linear it replaces.
        self.output_projection = None
        if self.vocab_reduction_module is None:
            if self.share_input_output_embed:
                self.output_projection = nn.Linear(
                    self.embed_tokens.weight.shape[1],
                    self.embed_tokens.weight.shape[0],
                    bias=False,
                )
                self.output_projection.weight = self.embed_tokens.weight
            else:
                # Assumes self.embed_out was created above (i.e., adaptive
                # softmax is not in use).
                self.output_projection = nn.Linear(
                    self.embed_out.shape[1], self.embed_out.shape[0], bias=False
                )
                self.output_projection.weight = self.embed_out
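
The output projection above swaps F.linear for a module so that quantization
tooling can pick it up. Below is a minimal, self-contained sketch of the
weight-tying equivalence that branch relies on; the sizes and variable names
are illustrative and not part of the class.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    vocab_size, embed_dim = 100, 16
    embed_tokens = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

    # Bias-free linear layer whose weight is tied to the embedding matrix,
    # mirroring the shared-embedding branch of __init__ above.
    output_projection = nn.Linear(embed_dim, vocab_size, bias=False)
    output_projection.weight = embed_tokens.weight

    x = torch.randn(2, 5, embed_dim)  # (batch, target_len, embed_dim)
    assert torch.allclose(output_projection(x), F.linear(x, embed_tokens.weight))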