in pytorch_translate/options.py [0:0]
def add_preprocessing_args(parser):
    """Register dataset/preprocessing command-line arguments on *parser*.

    Adds an argument group "Preprocess data" covering vocab files and sizes,
    BPE markers, train/eval text and binary dataset paths, example weights,
    multilingual corpus configuration, and a few boolean preprocessing flags.

    Args:
        parser: an argparse.ArgumentParser (or compatible) to extend in place.

    Returns:
        None. The parser is mutated via parser.add_argument_group().
    """
    # Args related to dataset.
    group = parser.add_argument_group("Preprocess data")
    group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the dictionary of tokens to use. "
        "If the file does not exist, the dict is auto-generated from source "
        "training data and saved as that file.",
    )
    group.add_argument(
        "--source-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="If a new vocab file needs to be generated, restrict it to the "
        "top N most common words. If we re-use an existing vocab file, this "
        "flag will have no effect. A value of < 0 means no max size.",
    )
    group.add_argument(
        "--source-bpe-cont-marker",
        default=None,
        type=str,
        metavar="CONT",
        help="Source BPE continuation marker. You should only specify this if "
        "you are using a BPE source vocab that has a continuation marker "
        "suffix. Note that this is the default BPE format in fairseq. Ex: '@@'",
    )
    group.add_argument(
        "--source-bpe-end-marker",
        default=None,
        type=str,
        metavar="END",
        help="Source BPE end marker. You should only specify this if you are "
        "using a BPE source vocab that has an end marker suffix. Ex: '_EOW'",
    )
    group.add_argument(
        "--char-source-vocab-file",
        default="",
        metavar="FILE",
        help="Same as --source-vocab-file except using characters.",
    )
    group.add_argument(
        "--char-target-vocab-file",
        default="",
        metavar="FILE",
        help="Same as --target-vocab-file except using characters.",
    )
    group.add_argument(
        "--embed-bytes",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="If specified along with a character model and set to True, "
        "then we embed bytes instead of characters.",
    )
    group.add_argument(
        "--char-source-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="Same as --source-max-vocab-size except using characters.",
    )
    group.add_argument(
        "--char-target-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="Same as --target-max-vocab-size except using characters.",
    )
    group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the fairseq Dictionary to use. "
        "If the file does not exist, the dict is auto-generated from target "
        "training data and saved as that file.",
    )
    group.add_argument(
        "--target-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="If a new vocab file needs to be generated, restrict it to the "
        "top N most common words. If we re-use an existing vocab file, this "
        "flag will have no effect. A value of < 0 means no max size.",
    )
    group.add_argument(
        "--target-bpe-cont-marker",
        default=None,
        type=str,
        metavar="CONT",
        help="Target BPE continuation marker. You should only specify this if "
        "you are using a BPE target vocab that has a continuation marker "
        "suffix. Note that this is the default BPE format in fairseq. Ex: '@@'",
    )
    group.add_argument(
        "--target-bpe-end-marker",
        default=None,
        type=str,
        metavar="END",
        help="Target BPE end marker. You should only specify this if you are "
        "using a BPE target vocab that has an end marker suffix. Ex: '_EOW'",
    )
    group.add_argument(
        "--fairseq-data-format",
        # BUG FIX: this previously used type=bool, which is an argparse
        # pitfall — bool(s) is True for ANY non-empty string, so
        # "--fairseq-data-format False" would silently parse as True.
        # Use utils.bool_flag like every other boolean flag in this group.
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="binary paths are prefixes for .bin/.idx mmap datasets",
    )
    group.add_argument(
        "--train-source-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing source training examples.",
    )
    group.add_argument(
        "--train-target-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing target training examples.",
    )
    group.add_argument(
        "--eval-source-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing source eval examples for "
        "calculating validation loss and BLEU eval scores.",
    )
    group.add_argument(
        "--eval-target-text-file",
        default="",
        metavar="FILE",
        help="Path to text file containing target eval examples for "
        "calculating validation loss and BLEU eval scores.",
    )
    group.add_argument(
        "--train-source-binary-path",
        default="",
        help="Path for the binary file containing source training examples.",
    )
    group.add_argument(
        "--train-target-binary-path",
        default="",
        help="Path for the binary file containing target training examples.",
    )
    group.add_argument(
        "--train-weights-path",
        default="",
        metavar="FILE",
        help="Path to text file of weight (0 to 1) for each train example. "
        "If left empty, all examples will receive equal weights.",
    )
    group.add_argument(
        "--eval-source-binary-path",
        default="",
        help="Path for the binary file containing source eval examples for "
        "calculating validation loss and BLEU scores.",
    )
    group.add_argument(
        "--eval-target-binary-path",
        default="",
        help="Path for the binary file containing target eval examples for "
        "calculating validation loss and BLEU scores.",
    )
    group.add_argument(
        "--train-mono-source-binary-path",
        default="",
        help="Path for the binary file containing source side monolingual data",
    )
    group.add_argument(
        "--train-mono-target-binary-path",
        default="",
        help="Path for the binary file containing target side monolingual data",
    )
    # TODO(T43045193): Move this to multilingual_task.py eventually
    group.add_argument(
        "--multiling-encoder-lang",
        action="append",
        metavar="SRC",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of encoder languages. The multilingual model contains "
        "a separate encoder for each language in this list.",
    )
    group.add_argument(
        "--multiling-decoder-lang",
        action="append",
        metavar="TARGET",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of decoder languages. The multilingual model contains "
        "a separate decoder for each language in this list.",
    )
    group.add_argument(
        "--multiling-source-lang",
        action="append",
        metavar="SRC",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of corpus source languages, where the n-th language is "
        "the source language of the n-th training corpus. Each entry must be "
        "in --multiling-encoder-lang.",
    )
    group.add_argument(
        "--multiling-target-lang",
        action="append",
        metavar="TARGET",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify a list of corpus target languages, where the n-th language is "
        "the target language of the n-th training corpus. Each entry must be "
        "in --multiling-decoder-lang.",
    )
    group.add_argument(
        "--multiling-source-vocab-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify the path to the dictionary for the n-th entry in "
        "--multiling-encoder-lang",
    )
    group.add_argument(
        "--multiling-target-vocab-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify the path to the dictionary for the n-th entry in "
        "--multiling-decoder-lang",
    )
    group.add_argument(
        "--multiling-train-source-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to training source samples. The n-th entry should be "
        "in the n-th language in --multiling-source-lang.",
    )
    group.add_argument(
        "--multiling-train-target-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to training target samples. The n-th entry should be "
        "in the n-th language in --multiling-target-lang.",
    )
    group.add_argument(
        "--multiling-train-oversampling",
        action="append",
        type=int,
        help="For multilingual models only. Use this argument repeatedly to "
        "oversample corpora. The n-th training corpus is oversampled by the n-"
        "the entry. No oversampling if not specified.",
    )
    group.add_argument(
        "--multiling-eval-source-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to eval source samples. The n-th entry should be "
        "in the n-th language in --multiling-source-lang.",
    )
    group.add_argument(
        "--multiling-eval-target-text-file",
        action="append",
        metavar="FILE",
        help="For multilingual models only. Use this argument repeatedly to "
        "specify paths to eval target samples. The n-th entry should be "
        "in the n-th language in --multiling-target-lang.",
    )
    group.add_argument(
        "--multiling-rescale-grads",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help=(
            "If true, rescale gradients based on the number of training "
            "samples a specific component has received in a training batch."
        ),
    )
    group.add_argument(
        "--penalized-target-tokens-file",
        default="",
        metavar="FILE",
        help="Path to text file of tokens to receive a penalty in decoding. "
        "If left empty, no penalty will be applied",
    )
    group.add_argument(
        "--append-eos-to-source",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help=("If true, append EOS to source sentences (instead of just target)."),
    )
    group.add_argument(
        "--reverse-source",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=True,
        help=("If true, feed source sentence to model in reverse order."),
    )
    group.add_argument(
        "--reverse-target",
        type=utils.bool_flag,
        nargs="?",
        const=True,
        default=False,
        help=("If true, feed target sentence to model in reverse order."),
    )