run_pipeline.py
import argparse


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_samples", type=int, default=100_000)
    parser.add_argument("--start", type=int, default=-1)
    parser.add_argument("--end", type=int, default=100_000)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--save_load_path", type=str, default="./cc_100k")
    parser.add_argument(
        "--input_dataset",
        type=str,
        default="HuggingFaceFW/FW-12-12-2023-CC-2023-06",
        help="Dataset with the samples to use for clustering.",
    )
    parser.add_argument(
        "--data_subset",
        type=str,
        default=None,
        help="Dataset subset.",
    )
    parser.add_argument("--input_content", type=str, default="content")
    parser.add_argument(
        "--topic_mode",
        type=str,
        choices=["single_topic", "multiple_topics"],
        default="multiple_topics",
        help="Specify 'single_topic' to generate one topic per cluster and score its educational value, or 'multiple_topics' to generate the 3 most relevant topics in each cluster.",
    )
    parser.add_argument(
        "--dbscan_eps",
        type=float,
        default=0.08,
        help="The maximum distance between two samples for them to be considered as in the neighborhood of each other.",
    )
    parser.add_argument(
        "--dbscan_min_samples",
        type=int,
        default=50,
        help="The number of samples in a neighborhood for a point to be considered as a core point.",
    )
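    # Note: the two DBSCAN flags above mirror scikit-learn's
    # DBSCAN(eps=..., min_samples=...) parameters (the help texts match its
    # documentation), so they are presumably passed through to that estimator.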
    parser.add_argument(
        "--mode",
        choices=["run", "load", "infer"],
        default="run",
        help="'run' executes the pipeline from scratch, 'load' loads an existing model to build the HF datasets, and 'infer' runs inference on new texts.",
    )
    parser.add_argument(
        "--inference_repo_name",
        type=str,
        default="infer_fw_on_ultrachat",
        help="HF repo name for the clusters dataset in inference mode.",
    )
    parser.add_argument(
        "--build_hf_ds",
        action="store_true",
        help="Build the HF datasets used for the Space visualization and push them to the Hub.",
    )
    parser.add_argument("--username", type=str, default="loubnabnl")
    return parser.parse_args()
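

# A minimal usage sketch (an assumption for illustration, not the repository's
# confirmed entry point): it shows how the parsed arguments could drive dataset
# loading. The load_dataset/select calls follow the Hugging Face `datasets`
# API; the slicing and sampling logic is inferred from the flag names.
from datasets import load_dataset

if __name__ == "__main__":
    args = get_args()
    ds = load_dataset(args.input_dataset, args.data_subset, split="train")
    # Restrict to the requested [start, end) range when --start is set.
    if args.start >= 0:
        ds = ds.select(range(args.start, min(args.end, len(ds))))
    # Take at most --n_samples texts from the configured content column.
    texts = ds[args.input_content][: args.n_samples]
    print(f"Loaded {len(texts)} texts for mode={args.mode!r}")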