def build_hf_data_clusters()

in run_pipeline.py [0:0]


import numpy as np
import pandas as pd
from datasets import Dataset


def build_hf_data_clusters(cc, texts=None, labels=None):
    """
    Build an HF dataset containing information on each cluster.

    Args:
        cc: ClusterClassifier object.
        texts: list of texts used for inference mode.
        labels: list of cluster labels corresponding to the texts for inference mode.

    If `texts` and `labels` are not provided, the function uses the data already stored in `cc`
    to construct the dataset; otherwise it runs in inference mode on the provided texts.
    """
    cluster_data = []
    for cluster_id in cc.label2docs.keys():
        # label -1 marks unclustered (noise) documents; skip it
        if cluster_id == -1:
            continue

        # inference mode: gather the provided texts assigned to this cluster
        if texts is not None and labels is not None:
            labels_array = np.array(labels)
            files_in_cluster = np.where(labels_array == cluster_id)[0]
            examples = [texts[doc_id] for doc_id in files_in_cluster]
        else:
            # fit mode: use the documents already stored on the ClusterClassifier
            doc_ids = cc.label2docs[cluster_id]
            examples = [cc.texts[doc_id] for doc_id in doc_ids]

        cluster_info = {
            "cluster_id": cluster_id,
            "summary": cc.cluster_summaries[cluster_id],
            "examples": examples,
        }

        # cluster centers are only available in fit mode
        if texts is None:
            cluster_info["position"] = cc.cluster_centers[cluster_id]

        cluster_data.append(cluster_info)

    return Dataset.from_pandas(pd.DataFrame(cluster_data))
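
A minimal usage sketch covering both modes. It assumes a `ClusterClassifier` that has already been fit (so `label2docs`, `texts`, `cluster_summaries`, and `cluster_centers` are populated); the `cc.infer(...)` call and its return values are an assumption about how new texts get labeled, not something defined by this function.

# Fit mode: build the dataset from the data already stored on the classifier.
ds_fit = build_hf_data_clusters(cc)

# Inference mode: label new texts first (assumed `infer` helper), then pass the
# texts and their cluster labels explicitly.
new_texts = ["a new document", "another new document"]
new_labels, _ = cc.infer(new_texts)  # assumed to return (labels, embeddings)
ds_infer = build_hf_data_clusters(cc, texts=new_texts, labels=new_labels)

print(ds_fit)    # columns: cluster_id, summary, examples, position
print(ds_infer)  # columns: cluster_id, summary, examples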