in fairseq/data/multilingual/multilingual_data_manager.py [0:0]
# Note: the helpers referenced below (FileContentsAction, csv_str_list,
# eval_str_dict, ArgumentError, and the LangTokStyle/EncoderLangtok/LangTokSpec
# enums) come from this file's module-level imports, e.g.:
#   from argparse import ArgumentError
#   from fairseq.data.multilingual.multilingual_utils import (
#       EncoderLangtok, LangTokSpec, LangTokStyle,
#   )
#   from fairseq.utils import FileContentsAction, csv_str_list, eval_str_dict
def add_args(parser):
parser.add_argument(
"data",
help="colon separated path to data directories list, \
will be iterated upon during epochs in round-robin manner",
action=FileContentsAction,
)
parser.add_argument(
"--langs",
default=None,
type=csv_str_list,
help="a list of languages comma sperated languages which can appear in lang-pairs; "
"note that the ordering determines language token IDs",
)
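# Example (hypothetical codes): --langs en_XX,fr_XX,de_DE; csv_str_list simply
# splits on commas, so language token IDs follow the order given here.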
parser.add_argument(
"--lang-dict",
default=None,
type=str,
help="an external file which contains a list of "
"languages which can appear in lang-pairs; "
"note that the ordering determines language token IDs; "
"--langs and --lang-dict are two exclusive options",
)
parser.add_argument(
"--source-dict",
default=None,
type=str,
help="path to source dictionary; if specified it will override per language dictionary loading",
)
parser.add_argument(
"--target-dict",
default=None,
type=str,
help="path to target dictionary; if specified it will override per language dictionary loading",
)
parser.add_argument(
"--lang-tok-style",
default=LangTokStyle.multilingual.value,
type=str,
choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value],
help="language token styles",
)
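# Assumption for illustration: the "multilingual" style renders language
# tokens like __en_XX__, while the "mbart" style renders them like [en_XX].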
parser.add_argument(
"--load-alignments",
action="store_true",
help="load the binarized alignments",
)
parser.add_argument(
"--left-pad-source",
default="True",
type=str,
metavar="BOOL",
help="pad the source on the left",
)
parser.add_argument(
"--left-pad-target",
default="False",
type=str,
metavar="BOOL",
help="pad the target on the left",
)
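# Note: both --left-pad-* options take the strings "True"/"False" (metavar
# BOOL) and are converted to real booleans downstream.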
try:
parser.add_argument(
"--max-source-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the source sequence",
)
parser.add_argument(
"--max-target-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the target sequence",
)
except ArgumentError:
# this might have already been defined. Once we transition this to hydra it should be fine to add it here.
pass
parser.add_argument(
"--upsample-primary",
default=1,
type=int,
help="amount to upsample primary dataset",
)
parser.add_argument(
"--truncate-source",
action="store_true",
default=False,
help="truncate source to max-source-positions",
)
parser.add_argument(
"--encoder-langtok",
default=None,
type=str,
choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value],
metavar="SRCTGT",
help="prepend to the beginning of source sentence the source or target "
"language token. (src/tgt)",
)
parser.add_argument(
"--decoder-langtok",
action="store_true",
help="prepend to the beginning of target sentence the target language token",
)
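# Illustration (hypothetical en_XX->fr_XX pair, default "multilingual" style):
# --encoder-langtok src prefixes the source with __en_XX__ (tgt would use
# __fr_XX__ instead), and --decoder-langtok prefixes the target with __fr_XX__.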
parser.add_argument(
"--lang-tok-replacing-bos-eos",
action="store_true",
default=False,
help="replace BOS/EOS markers with language tokens instead of prepending them",
)
parser.add_argument(
"--enable-lang-ids",
default=False,
action="store_true",
help="whether to include language IDs in samples",
)
parser.add_argument(
"--enable-reservsed-directions-shared-datasets",
default=False,
action="store_true",
help="whether to allow datasets be used in reversed directions",
)
parser.add_argument(
"--extra-data",
help='a dictionary mapping each data name to its path, \
e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
type=lambda uf: eval_str_dict(uf, type=str),
default=None,
)
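# Hypothetical CLI usage: --extra-data '{"mined": "/data/mined", "denoised": "/data/denoised"}'
# (the quoted string is evaluated into a Python dict by eval_str_dict)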
parser.add_argument(
"--extra-lang-pairs",
help='a dictionary of data name to the language pairs they serve, \
e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}',
type=lambda uf: eval_str_dict(uf, type=str),
default=None,
)
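# Hypothetical CLI usage: --extra-lang-pairs '{"mined": "en_XX-pt_XX,en_XX-es_XX"}'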
parser.add_argument(
"--fixed-dictionary",
help="Fixed dictionary to use with model path",
default=None,
type=str,
)
parser.add_argument(
"--langtoks-specs",
help='a comma-separated list of data types for which a set of language tokens \
will be specialized, e.g. "main,dae,mined". A set of language tokens will be \
added to the vocab to distinguish languages across different training data \
types. If not specified, default per-language tokens will be added',
default=LangTokSpec.main.value,
type=csv_str_list,
)
parser.add_argument(
"--langtoks",
help='a dictionary of how to add language tokens, \
e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": \
("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
default=None,
type=lambda uf: eval_str_dict(uf, type=str),
)
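# Hypothetical CLI usage: --langtoks '{"main": ("src", "tgt"), "mined": (None, "tgt")}'
# where each value is a (source-side, target-side) language token spec per data type.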
parser.add_argument(
"--sampling-weights-from-file",
help='a file containing a Python dictionary of how to sample datasets, \
e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
default=None,
type=str,
)
parser.add_argument(
"--sampling-weights",
help='a dictionary of how to sample datasets, \
e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
default=None,
type=lambda uf: eval_str_dict(uf, type=str),
)
parser.add_argument(
"--virtual-epoch-size",
default=None,
type=int,
help="virtual epoch size to speed up data loading",
)
parser.add_argument(
"--virtual-data-size",
default=None,
type=int,
help="virtual data size of the whole joint dataset to speed"
"up data loading and have specific dynamic sampling strategy interval",
)
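
# Hedged usage sketch (illustration only, not part of the original file): wires
# the arguments above into a bare ArgumentParser and parses sample values. All
# paths and language codes are hypothetical; real runs go through fairseq's
# task/trainer machinery rather than a hand-built parser.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    add_args(parser)
    args = parser.parse_args(
        [
            "/data/multilingual-bin",  # positional `data` (colon-separated dirs)
            "--langs", "en_XX,fr_XX",
            "--encoder-langtok", "src",
            "--decoder-langtok",
            "--sampling-weights", '{"main:en_XX-fr_XX": 1.0}',
        ]
    )
    print(args.langs, args.encoder_langtok, args.sampling_weights)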