in fairseq/data/multilingual/multilingual_data_manager.py [0:0]
# Note: the helpers referenced below (FileContentsAction, csv_str_list,
# eval_str_dict, ArgumentError, and the LangTokStyle/EncoderLangtok/LangTokSpec
# enums) come from this file's module-level imports, e.g.:
#   from argparse import ArgumentError
#   from fairseq.data.multilingual.multilingual_utils import (
#       EncoderLangtok, LangTokSpec, LangTokStyle,
#   )
#   from fairseq.utils import FileContentsAction, csv_str_list, eval_str_dict
def add_args(parser):
parser.add_argument(
"data",
help="colon separated path to data directories list, \
will be iterated upon during epochs in round-robin manner",
action=FileContentsAction,
)
parser.add_argument(
"--langs",
default=None,
type=csv_str_list,
help="a list of languages comma sperated languages which can appear in lang-pairs; "
"note that the ordering determines language token IDs",
)
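# Example (hypothetical codes): --langs en_XX,fr_XX,de_DE; csv_str_list simply
# splits on commas, so language token IDs follow the order given here.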
parser.add_argument(
"--lang-dict",
default=None,
type=str,
help="an external file which contains a list of "
"languages which can appear in lang-pairs; "
"note that the ordering determines language token IDs; "
"--langs and --lang-dict are two exclusive options",
)
parser.add_argument(
"--source-dict",
default=None,
type=str,
help="path to source dictionary; if specified it will override per language dictionary loading",
)
parser.add_argument(
"--target-dict",
default=None,
type=str,
help="path to target dictionary; if specified it will override per language dictionary loading",
)
parser.add_argument(
"--lang-tok-style",
default=LangTokStyle.multilingual.value,
type=str,
choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value],
help="language token styles",
)
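# Assumption for illustration: the "multilingual" style renders language
# tokens like __en_XX__, while the "mbart" style renders them like [en_XX].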
parser.add_argument(
"--load-alignments",
action="store_true",
help="load the binarized alignments",
)
parser.add_argument(
"--left-pad-source",
default="True",
type=str,
metavar="BOOL",
help="pad the source on the left",
)
parser.add_argument(
"--left-pad-target",
default="False",
type=str,
metavar="BOOL",
help="pad the target on the left",
)
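# Note: both --left-pad-* options take the strings "True"/"False" (metavar
# BOOL) and are converted to real booleans downstream.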
try:
parser.add_argument(
"--max-source-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the source sequence",
)
parser.add_argument(
"--max-target-positions",
default=1024,
type=int,
metavar="N",
help="max number of tokens in the target sequence",
)
except ArgumentError:
# this might have already been defined. Once we transition this to hydra it should be fine to add it here.
pass
parser.add_argument(
"--upsample-primary",
default=1,
type=int,
help="amount to upsample primary dataset",
)
parser.add_argument(
"--truncate-source",
action="store_true",
default=False,
help="truncate source to max-source-positions",
)
parser.add_argument(
"--encoder-langtok",
default=None,
type=str,
choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value],
metavar="SRCTGT",
help="prepend to the beginning of source sentence the source or target "
"language token. (src/tgt)",
)
parser.add_argument(
"--decoder-langtok",
action="store_true",
help="prepend to the beginning of target sentence the target language token",
)
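# Illustration (hypothetical en_XX->fr_XX pair, default "multilingual" style):
# --encoder-langtok src prefixes the source with __en_XX__ (tgt would use
# __fr_XX__ instead), and --decoder-langtok prefixes the target with __fr_XX__.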
parser.add_argument(
"--lang-tok-replacing-bos-eos",
action="store_true",
default=False,
help="replace BOS/EOS markers with language tokens instead of prepending them",
)
parser.add_argument(
"--enable-lang-ids",
default=False,
action="store_true",
help="whether to include language IDs in samples",
)
parser.add_argument(
"--enable-reservsed-directions-shared-datasets",
default=False,
action="store_true",
help="whether to allow datasets be used in reversed directions",
)
parser.add_argument(
"--extra-data",
help='a dictionary mapping each data name to its path, \
e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
type=lambda uf: eval_str_dict(uf, type=str),
default=None,
)
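# Hypothetical CLI usage: --extra-data '{"mined": "/data/mined", "denoised": "/data/denoised"}'
# (the quoted string is evaluated into a Python dict by eval_str_dict)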
parser.add_argument(
"--extra-lang-pairs",
help='a dictionary of data name to the language pairs they serve, \
e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}',
type=lambda uf: eval_str_dict(uf, type=str),
default=None,
)
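# Hypothetical CLI usage: --extra-lang-pairs '{"mined": "en_XX-pt_XX,en_XX-es_XX"}'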
parser.add_argument(
"--fixed-dictionary",
help="Fixed dictionary to use with model path",
default=None,
type=str,
)
parser.add_argument(
"--langtoks-specs",
help='a comma-separated list of data types for which a set of language tokens \
will be specialized, e.g. "main,dae,mined". A set of language tokens will be \
added to the vocab to distinguish languages across different training data \
types. If not specified, default per-language tokens will be added',
default=LangTokSpec.main.value,
type=csv_str_list,
)
parser.add_argument(
"--langtoks",
help='a dictionary of how to add language tokens, \
e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": \
("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
default=None,
type=lambda uf: eval_str_dict(uf, type=str),
)
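# Hypothetical CLI usage: --langtoks '{"main": ("src", "tgt"), "mined": (None, "tgt")}'
# where each value is a (source-side, target-side) language token spec per data type.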
parser.add_argument(
"--sampling-weights-from-file",
help='a file containing a Python dictionary of how to sample datasets, \
e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
default=None,
type=str,
)
parser.add_argument(
"--sampling-weights",
help='a dictionary of how to sample datasets, \
e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
default=None,
type=lambda uf: eval_str_dict(uf, type=str),
)
parser.add_argument(
"--virtual-epoch-size",
default=None,
type=int,
help="virtual epoch size to speed up data loading",
)
parser.add_argument(
"--virtual-data-size",
default=None,
type=int,
help="virtual data size of the whole joint dataset to speed"
"up data loading and have specific dynamic sampling strategy interval",
)
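
# Hedged usage sketch (illustration only, not part of the original file): wires
# the arguments above into a bare ArgumentParser and parses sample values. All
# paths and language codes are hypothetical; real runs go through fairseq's
# task/trainer machinery rather than a hand-built parser.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    add_args(parser)
    args = parser.parse_args(
        [
            "/data/multilingual-bin",  # positional `data` (colon-separated dirs)
            "--langs", "en_XX,fr_XX",
            "--encoder-langtok", "src",
            "--decoder-langtok",
            "--sampling-weights", '{"main:en_XX-fr_XX": 1.0}',
        ]
    )
    print(args.langs, args.encoder_langtok, args.sampling_weights)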