in assets/training/finetune_acft_hf_nlp/src/preprocess/preprocess.py [0:0]
def get_parser():
"""
Add arguments and returns the parser. Here we add all the arguments for all the tasks.
Those arguments that are not relevant for the input task should be ignored.
"""
parser = argparse.ArgumentParser(description="Model Preprocessing", allow_abbrev=False)
# NOTE that the default is present in both :param `type` and `default`. In case of change, we need to update
# in both places
parser.add_argument(
"--train_file_path",
type=partial(default_missing_path, default=None),
required=False,
default=None,
help="Train data path",
)
# NOTE that the default is present in both :param `type` and `default`. In case of change, we need to update
# in both places
parser.add_argument(
"--validation_file_path",
type=partial(default_missing_path, default=None),
required=False,
default=None,
help="Validation data path",
)
# NOTE that the default is present in both :param `type` and `default`. In case of change, we need to update
# in both places
parser.add_argument(
"--test_file_path",
type=partial(default_missing_path, default=None),
required=False,
default=None,
help="Test data path",
)
parser.add_argument(
"--train_mltable_path",
type=str,
required=False,
default=None,
help="train mltable dataset_path folder",
)
parser.add_argument(
"--validation_mltable_path",
type=str,
required=False,
default=None,
help="valid mltable dataset_path folder",
)
parser.add_argument(
"--test_mltable_path",
type=str,
required=False,
default=None,
help="valid mltable dataset_path folder",
)
# `test_data` is used as pass through and will be consumed directly by the model evaluation component
# Instead of deleting the entire code related to test processing; introducing a new flag that enables/disables
# processing test data
# Enabling this flag will skip data validation and data encoding for the test data
parser.add_argument(
"--skip_test_data_processing",
type=str2bool,
required=False,
default="true",
help="If enabled, the processing for test data will be skipped",
)
parser.add_argument(
"--output_dir",
default="preprocess_output",
type=str,
help="folder to store preprocessed input data",
)
# Task settings
parser.add_argument(
"--model_selector_output",
default=None,
type=str,
help=(
"output folder of model selector containing model configs, tokenizer, checkpoints in case of model_id."
"If huggingface_id is selected, the model download happens dynamically on the fly"
),
)
parser.add_argument(
"--task_name",
type=str,
default="SingleLabelClassification",
help="Task Name",
)
parser.add_argument("--num_train_epochs", default=1, type=int, help="training epochs")
# NLP settings
parser.add_argument(
"--enable_long_range_text",
type=bool,
default=True,
help=(
"User option to apply heuristic for finding optimal max_seq_length value, "
"reserved only for multiclass classification."
)
)
return parser