in pytorch_translate/options.py [0:0]
from fairseq.data.indexed_dataset import get_available_dataset_impl


def add_dataset_args(parser, train=False, gen=False):
    """Same as fairseq.options.add_dataset_args, except that the "data"
    argument is optional and kept only for backward compatibility."""
    group = parser.add_argument_group("Dataset and data loading")
    group.add_argument(
        "data",
        metavar="DIR",
        nargs="?",
        help="path to data directory. "
        "This is not needed but kept for backward compatibility",
    )
    group.add_argument(
        "--num-workers",
        default=0,
        type=int,
        metavar="N",
        help="how many subprocesses to use for data loading",
    )
    group.add_argument(
        "--skip-invalid-size-inputs-valid-test",
        action="store_true",
        help="ignore too long or too short lines in valid and test set",
    )
    group.add_argument(
        "--max-tokens",
        default=5000,
        type=int,
        metavar="N",
        help="maximum number of tokens in a batch",
    )
    group.add_argument(
        "--batch-size",
        "--max-sentences",
        type=int,
        metavar="N",
        help="maximum number of sentences in a batch",
    )
    group.add_argument(
        "--dataset-impl",
        metavar="FORMAT",
        choices=get_available_dataset_impl(),
        help="output dataset implementation",
    )
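    # Training-only options: subset selection and validation batch sizing.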
    if train:
        group.add_argument(
            "--train-subset",
            default="train",
            metavar="SPLIT",
            choices=["train", "valid", "test"],
            help="data subset to use for training (train, valid, test)",
        )
        group.add_argument(
            "--valid-subset",
            default="valid",
            metavar="SPLIT",
            help="comma-separated list of data subsets to use"
            " for validation (train, valid, valid1, test, test1)",
        )
        group.add_argument(
            "--max-sentences-valid",
            type=int,
            metavar="N",
            help="maximum number of sentences in a validation batch"
            " (defaults to --max-sentences)",
        )
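    # Generation-only options: subset selection and sharded generation.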
    if gen:
        group.add_argument(
            "--gen-subset",
            default="test",
            metavar="SPLIT",
            help="data subset to generate (train, valid, test)",
        )
        group.add_argument(
            "--num-shards",
            default=1,
            type=int,
            metavar="N",
            help="shard generation over N shards",
        )
        group.add_argument(
            "--shard-id",
            default=0,
            type=int,
            metavar="ID",
            help="id of the shard to generate (id < num_shards)",
        )
    return group
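
A minimal usage sketch, assuming fairseq is installed; the parser description and the flag values passed to parse_args below are illustrative, not taken from the source:

import argparse

parser = argparse.ArgumentParser(description="pytorch_translate example")
add_dataset_args(parser, train=True)
args = parser.parse_args(["--max-tokens", "4000", "--valid-subset", "valid,test"])
print(args.max_tokens)    # 4000
print(args.valid_subset)  # "valid,test"
print(args.data)          # None: "data" is optional, kept for backward compatibility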