in src/jobs/util/grouping_pipeline.py [0:0]
def sweep_params():
    """Run the pipeline over a grid of configurations and collect the scores.

    For every dataset id in ``user_test_list`` the labeled dataset is loaded
    and a single ``ModelProvider`` is created and shared by all runs on that
    dataset. The sweep then covers every combination of:

    * embedding model (``EMBEDDING_MODEL_LIST``)
    * clustering method (``CLUSTER_METHODS``)
    * cluster-count method (``NUM_CLUSTER_METHODS`` for ``"kmeans"``,
      otherwise only ``"knee"``)
    * DBSCAN eps (currently the single value ``0.4``)
    * ``remap`` option (``DIM_REDUCE_OPTIONS``)
    * TF-IDF scale (currently the single value ``0.0``)

    Each combination is written into a fresh default config and passed to
    ``run_pipeline`` together with the first dataset returned for the id.

    Returns:
        list[dict]: One dict per run, holding the config used for that run
        plus the keys ``"dataset"``, ``"rand"`` and ``"adj_rand"`` (the
        second and third values returned by ``run_pipeline``).
    """
    # Local import keeps the function self-contained regardless of the
    # module's import block.
    from itertools import product

    all_results = []
    for dataset_id in user_test_list:
        # Only the first returned dataset is evaluated below; the labeled
        # topics are not used by the sweep.
        datasets, _labeled_topics = get_labeled_dataset(dataset_id)
        model_provider = ModelProvider()

        for embedding_model, clustering_method in product(
            EMBEDDING_MODEL_LIST, CLUSTER_METHODS
        ):
            # Only k-means sweeps the cluster-count methods; every other
            # clustering method is run with "knee" alone.
            if clustering_method == "kmeans":
                num_cluster_methods = NUM_CLUSTER_METHODS
            else:
                num_cluster_methods = ["knee"]

            # Single-value axes for now. To sweep eps for "dbscan", extend
            # this list when clustering_method == "dbscan".
            dbscan_eps_params = [0.4]
            tf_idf_scales = [0.0]

            # product() varies the right-most axis fastest, i.e. the same
            # order as the equivalent nested for-loops.
            grid = product(
                num_cluster_methods,
                dbscan_eps_params,
                DIM_REDUCE_OPTIONS,
                tf_idf_scales,
            )
            for num_cluster_method, dbscan_eps, remap, tf_idf_scale in grid:
                # Start from fresh defaults on every run so that settings
                # never leak from one configuration into the next.
                config = get_default_config()
                config["embedding_model"] = embedding_model
                config["remap"] = remap
                config["dbscan_eps"] = dbscan_eps
                config["tf_idf_scale"] = tf_idf_scale
                config["clustering_method"] = clustering_method
                config["num_cluster_method"] = num_cluster_method

                _res, score, adj_rscore = run_pipeline(
                    config, datasets[0], model_provider=model_provider
                )
                result_dict = {
                    **config,
                    "dataset": dataset_id,
                    "rand": score,
                    "adj_rand": adj_rscore,
                }
                all_results.append(result_dict)
                # Experiment-tracker logging is currently disabled:
                # wandb.log(result_dict)
                print("got result")
    return all_results