in build_and_train_models/sm-distributed_model_parallel_v2/shared-scripts/data/prep/_prepare_nemo_megatron_dataset.py [0:0]
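# CLI for the NeMo/Megatron dataset preprocessing script: converts raw JSON or text
# corpora into the binary .bin/.idx format consumed by Megatron-style training.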
import argparse


def get_args():
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title="input data")
    group.add_argument(
        "--input",
        type=str,
        required=True,
        help=(
            "Path to the input json or json.gz file. If preprocessing an entire folder, "
            "set the --preproc-folder flag and provide the folder path in this argument."
        ),
    )
    group.add_argument(
        "--json-keys",
        nargs="+",
        default=["text"],
        help="Space-separated list of keys to extract from the json.",
    )
    group.add_argument(
        "--split-sentences", action="store_true", help="Split documents into sentences."
    )
    group.add_argument(
        "--keep-newlines",
        action="store_true",
        help="Keep newlines between sentences when splitting.",
    )
    group.add_argument("--text_file", action="store_true", help="Use text file instead of json.")
    group = parser.add_argument_group(title="tokenizer")
    group.add_argument(
        "--tokenizer-library",
        type=str,
        required=True,
        choices=["yttm", "sentencepiece", "megatron", "huggingface", "tabular"],
        help="What tokenizer library to use.",
    )
    group.add_argument(
        "--tokenizer-type",
        type=str,
        default=None,
        help="What type of tokenizer to use.",
    )
    group.add_argument(
        "--tokenizer-model",
        type=str,
        default=None,
        help="Path to tokenizer model.",
    )
    group.add_argument("--vocab-file", type=str, default=None, help="Path to the vocab file")
group.add_argument("--files-filter", type=str, default="**/*.json*", help="files filter str")
group.add_argument(
"--merge-file", type=str, default=None, help="Path to the BPE merge file (if necessary)."
)
group.add_argument(
"--delimiter", type=str, default=None, help="delimiter used for tabular tokenizer"
)
group.add_argument(
"--append-eod", action="store_true", help="Append an <eod> token to the end of a document."
)
group.add_argument("--retrieval-db", action="store_true", help="Dataset used for retrieval.")
group.add_argument(
"--need-pad-id", action="store_true", help="Whether we need the pad id for the tokenizer"
)
    group = parser.add_argument_group(title="output data")
    group.add_argument(
        "--output-prefix", type=str, required=True, help="Path to binary output file without suffix"
    )
    group.add_argument(
        "--dataset-impl", type=str, default="mmap", choices=["lazy", "cached", "mmap", "retmmap"]
    )
    group = parser.add_argument_group(title="runtime")
    group.add_argument(
        "--workers", type=int, default=1, help="Number of worker processes to launch"
    )
group.add_argument("--chunk_size", type=int, default=64, help="chunk size used for retrieval")
group.add_argument(
"--chunk_stride_size",
type=int,
default=64,
help="the stride size for neighbor chunks used for retrieval",
)
group.add_argument(
"--log-interval", type=int, default=100, help="Interval between progress updates"
)
group.add_argument(
"--preproc-folder",
action="store_true",
help="If set, will preprocess all .json or .json.gz files into a single .bin and .idx file. Folder path provided via the --input arg",
)
group.add_argument(
"--apply-ftfy", action="store_true", help="If set, will apply ftfy to the input text"
)
    args = parser.parse_args()
    args.keep_empty = False

    if args.tokenizer_type is not None and args.tokenizer_type.lower().startswith("bert"):
        if not args.split_sentences:
            print("BERT tokenizer detected; are you sure you don't want to split sentences?")

    # Some default/dummy values for the tokenizer.
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1
    args.vocab_extra_ids = 0

    # TODO: There are dependencies b/w libraries and model files / tokenizer type strings to check.
    assert (
        args.tokenizer_type is not None or args.tokenizer_model is not None
    ), "Either --tokenizer-type or --tokenizer-model must be provided."

    return args
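

# Example invocation (illustrative sketch only; the paths and the tokenizer
# settings shown here are assumptions, not values shipped with this script):
#
#   python _prepare_nemo_megatron_dataset.py \
#       --input /path/to/corpus.json \
#       --json-keys text \
#       --tokenizer-library huggingface \
#       --tokenizer-type gpt2 \
#       --output-prefix /path/to/output/my_dataset \
#       --dataset-impl mmap \
#       --workers 8 \
#       --append-eod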