in pytorch_translate/options.py [0:0]
def add_preprocessing_args(parser):
    """Register dataset/preprocessing command-line arguments on *parser*.

    Adds an argument group "Preprocess data" covering vocab files and sizes,
    BPE markers, train/eval text and binary dataset paths, example weights,
    multilingual corpus configuration, and a few boolean preprocessing flags.

    Args:
        parser: an argparse.ArgumentParser (or compatible) to extend in place.

    Returns:
        None. The parser is mutated via parser.add_argument_group().
    """
    # Args related to dataset.
    group = parser.add_argument_group("Preprocess data")
    group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the dictionary of tokens to use. "
        "If the file does not exist, the dict is auto-generated from source "
        "training data and saved as that file.",
    )
    group.add_argument(
        "--source-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="If a new vocab file needs to be generated, restrict it to the "
        "top N most common words. If we re-use an existing vocab file, this "
        "flag will have no effect. A value of < 0 means no max size.",
    )
    group.add_argument(
        "--source-bpe-cont-marker",
        default=None,
        type=str,
        metavar="CONT",
        help="Source BPE continuation marker. You should only specify this if "
        "you are using a BPE source vocab that has a continuation marker "
        "suffix. Note that this is the default BPE format in fairseq. Ex: '@@'",
    )
    group.add_argument(
        "--source-bpe-end-marker",
        default=None,
        type=str,
        metavar="END",
        help="Source BPE end marker. You should only specify this if you are "
        "using a BPE source vocab that has an end marker suffix. Ex: '_EOW'",
    )
    group.add_argument(
        "--char-source-vocab-file",
        default="",
        metavar="FILE",
        help="Same as --source-vocab-file except using characters.",
    )
    group.add_argument(
        "--char-target-vocab-file",
        default="",
        metavar="FILE",
        help="Same as --target-vocab-file except using characters.",
    )
    group.add_argument(
        "--embed-bytes",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="If specified along with a character model and set to True, "
        "then we embed bytes instead of characters.",
    )
    group.add_argument(
        "--char-source-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="Same as --source-max-vocab-size except using characters.",
    )
    group.add_argument(
        "--char-target-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="Same as --target-max-vocab-size except using characters.",
    )
    group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the fairseq Dictionary to use. "
        "If the file does not exist, the dict is auto-generated from target "
        "training data and saved as that file.",
    )
    group.add_argument(
        "--target-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="If a new vocab file needs to be generated, restrict it to the "
        "top N most common words. If we re-use an existing vocab file, this "
        "flag will have no effect. A value of < 0 means no max size.",
    )
    group.add_argument(
        "--target-bpe-cont-marker",
        default=None,
        type=str,
        metavar="CONT",
        help="Target BPE continuation marker. You should only specify this if "
        "you are using a BPE target vocab that has a continuation marker "
        "suffix. Note that this is the default BPE format in fairseq. Ex: '@@'",
    )
    group.add_argument(
        "--target-bpe-end-marker",
        default=None,
        type=str,
        metavar="END",
        help="Target BPE end marker. You should only specify this if you are "
        "using a BPE target vocab that has an end marker suffix. Ex: '_EOW'",
    )
    group.add_argument(
        "--fairseq-data-format",
        # BUG FIX: this previously used type=bool, which is an argparse
        # pitfall — bool(s) is True for ANY non-empty string, so
        # "--fairseq-data-format False" would silently parse as True.
        # Use utils.bool_flag like every other boolean flag in this group.
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="binary paths are prefixes for .bin/.idx mmap datasets",
    )
    group.add_argument(
        "--train-source-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing source training examples.",
    )
    group.add_argument(
        "--train-target-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing target training examples.",
    )
    group.add_argument(
        "--eval-source-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing source eval examples for "
        "calculating validation loss and BLEU eval scores.",
    )
    group.add_argument(
        "--eval-target-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing target eval examples for "
        "calculating validation loss and BLEU eval scores.",
    )
    group.add_argument(
        "--train-source-binary-path",
        default="",
        help="Path for the binary file containing source training examples.",
    )
    group.add_argument(
        "--train-target-binary-path",
        default="",
        help="Path for the binary file containing target training examples.",
    )
    group.add_argument(
        "--train-weights-path",
        default="",
        metavar="FILE",
        help="Path to text file of weight (0 to 1) for each train example. "
        "If left empty, all examples will receive equal weights.",
    )
    group.add_argument(
        "--eval-source-binary-path",
        default="",
        help="Path for the binary file containing source eval examples for "
        "calculating validation loss and BLEU scores.",
    )
    group.add_argument(
        "--eval-target-binary-path",
        default="",
        help="Path for the binary file containing target eval examples for "
        "calculating validation loss and BLEU scores.",
    )
    group.add_argument(
        "--train-mono-source-binary-path",
        default="",
        help="Path for the binary file containing source side monolingual data",
    )
    group.add_argument(
        "--train-mono-target-binary-path",
        default="",
        help="Path for the binary file containing target side monolingual data",
    )
    # TODO(T43045193): Move this to multilingual_task.py eventually
    group.add_argument(
        "--multiling-encoder-lang",
        action="append",
        metavar="SRC",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of encoder languages. The multilingual model contains "
        "a separate encoder for each language in this list.",
    )
    group.add_argument(
        "--multiling-decoder-lang",
        action="append",
        metavar="TARGET",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of decoder languages. The multilingual model contains "
        "a separate decoder for each language in this list.",
    )
    group.add_argument(
        "--multiling-source-lang",
        action="append",
        metavar="SRC",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of corpus source languages, where the n-th language is "
        "the source language of the n-th training corpus. Each entry must be "
        "in --multiling-encoder-lang.",
    )
    group.add_argument(
        "--multiling-target-lang",
        action="append",
        metavar="TARGET",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of corpus target languages, where the n-th language is "
        "the target language of the n-th training corpus. Each entry must be "
        "in --multiling-decoder-lang.",
    )
    group.add_argument(
        "--multiling-source-vocab-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify the path to the dictionary for the n-th entry in "
        "--multiling-encoder-lang",
    )
    group.add_argument(
        "--multiling-target-vocab-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify the path to the dictionary for the n-th entry in "
        "--multiling-decoder-lang",
    )
    group.add_argument(
        "--multiling-train-source-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to training source samples. The n-th entry should be "
        "in the n-th language in --multiling-source-lang.",
    )
    group.add_argument(
        "--multiling-train-target-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to training target samples. The n-th entry should be "
        "in the n-th language in --multiling-target-lang.",
    )
    group.add_argument(
        "--multiling-train-oversampling",
        action="append",
        type=int,
        help="For multilingual models only. Use this argument repeatedly to "
        "oversample corpora. The n-th training corpus is oversampled by the n-"
        "the entry. No oversampling if not specified.",
    )
    group.add_argument(
        "--multiling-eval-source-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to eval source samples. The n-th entry should be "
        "in the n-th language in --multiling-source-lang.",
    )
    group.add_argument(
        "--multiling-eval-target-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to eval target samples. The n-th entry should be "
        "in the n-th language in --multiling-target-lang.",
    )
    group.add_argument(
        "--multiling-rescale-grads",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help=(
            "If true, rescale gradients based on the number of training "
            "samples a specific component has received in a training batch."
        ),
    )
    group.add_argument(
        "--penalized-target-tokens-file",
        default="",
        metavar="FILE",
        help="Path to text file of tokens to receive a penalty in decoding. "
        "If left empty, no penalty will be applied",
    )
    group.add_argument(
        "--append-eos-to-source",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help=("If true, append EOS to source sentences (instead of just target)."),
    )
    group.add_argument(
        "--reverse-source",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=True,
        help=("If true, feed source sentence to model in reverse order."),
    )
    group.add_argument(
        "--reverse-target",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help=("If true, feed target sentence to model in reverse order."),
    )