in src/jobs/util/grouping_pipeline.py [0:0]
def run_pipeline(config, df, saved_set_name=None, model_provider=None):
    """
    Runs a tab grouping pipeline, grouping tabs into clusters and labeling each cluster.

    Args:
        config: Configuration options. Reads the keys "clustering_method",
            "text_for_embedding", "remap", "dbscan_eps", and "num_cluster_method".
        df: Dataset — assumed to be a pandas DataFrame with at least
            "title" and "smart_group_label" columns (TODO confirm against callers).
        saved_set_name: Name to save embeddings as a tsv for other use.
        model_provider: Language model class.

    Returns:
        Tuple of (dataset with "predicted_cluster" and "embeddings" columns added,
        rand score (based on labels), adjusted rand score).
    """
    # Shuffle rows so downstream fitting/clustering is not biased by input order.
    df = df.sample(frac=1)
    dbscan = config["clustering_method"] == "dbscan"
    pipeline = generate_pipeline(config, model_provider=model_provider)
    df = add_text_for_embedding(df, config["text_for_embedding"])
    model = pipeline.fit(df)
    pipeline_result = model.transform(df).toarray()
    if saved_set_name is not None:
        np.savetxt(f"output/{saved_set_name}.tsv", pipeline_result, delimiter="\t")
        df[["title", "smart_group_label"]].to_csv(f"output/{saved_set_name}_labels.tsv", sep="\t")
    # BUG FIX: previously this was `[pipeline_result.tolist() for _row in pipeline_result]`,
    # which stored the ENTIRE embedding matrix on every row. Store one embedding
    # vector per row instead. These are the pre-UMAP (full-dimensional) embeddings.
    embeddings_as_list = [row.tolist() for row in pipeline_result]
    if config["remap"] > 0:
        # Reduce dimensionality before clustering. Clustering operates on the
        # remapped space, while df["embeddings"] keeps the pre-UMAP vectors.
        umap_model = umap.UMAP(
            n_neighbors=config["remap"],
            n_components=5,
            min_dist=0.0,
            metric="cosine",
        )
        pipeline_result = umap_model.fit_transform(pipeline_result)
    # Heuristic upper bound on cluster count: ~2*ln(n) + 1, capped at n rows.
    max_clusters = min(math.floor(math.log(len(embeddings_as_list)) * 2.0 + 1), len(embeddings_as_list))
    best_cluster = generate_best_cluster_model(
        pipeline_result,
        range(2, max_clusters),
        verbose=False,
        use_dbscan=dbscan,
        eps=config["dbscan_eps"],
        num_cluster_method=config["num_cluster_method"],
    )
    # DBSCAN has no separate predict step — fit and label in a single pass.
    if dbscan:
        clusters = best_cluster.fit_predict(pipeline_result)
    else:
        clusters = best_cluster.predict(pipeline_result)
    df["predicted_cluster"] = clusters
    df["embeddings"] = embeddings_as_list
    # Score predicted clusters against the ground-truth smart group labels.
    rscore = rand_score(df["smart_group_label"], df["predicted_cluster"])
    adj_rscore = adjusted_rand_score(df["smart_group_label"], df["predicted_cluster"])
    return df, rscore, adj_rscore