in run_pipeline.py [0:0]
def build_hf_data_clusters(cc, texts=None, labels=None):
    """Build an HF dataset containing information on each cluster.

    Args:
        cc: ClusterClassifier object holding `label2docs`, `texts`,
            `cluster_summaries` and `cluster_centers`.
        texts: list of texts used for inference mode.
        labels: list of cluster labels corresponding to the texts for
            inference mode.

    Returns:
        datasets.Dataset with one row per cluster: `cluster_id`, `summary`,
        `examples`, and (training mode only) `position`.

    If `texts` and `labels` are not provided, the function uses the data
    available in `cc` to construct the dataset. Otherwise it runs in
    inference mode on `texts`.
    """
    # Decide the mode once and reuse it everywhere; the original code mixed
    # an `is not None` check with a truthiness check (`if not texts:`), which
    # disagreed for empty-list inputs.
    inference_mode = texts is not None and labels is not None
    if inference_mode:
        # Hoisted out of the loop: the labels array is loop-invariant.
        labels_array = np.array(labels)

    cluster_data = []
    for cluster_id in cc.label2docs.keys():
        if cluster_id == -1:
            # -1 is the noise/unassigned label produced by the clustering;
            # it is not a real cluster, so skip it.
            continue
        if inference_mode:
            doc_ids = np.where(labels_array == cluster_id)[0]
            examples = [texts[doc_id] for doc_id in doc_ids]
        else:
            doc_ids = cc.label2docs[cluster_id]
            examples = [cc.texts[doc_id] for doc_id in doc_ids]

        cluster_info = {
            "cluster_id": cluster_id,
            "summary": cc.cluster_summaries[cluster_id],
            "examples": examples,
        }
        if not inference_mode:
            # Cluster centers only exist for the data `cc` was fitted on.
            cluster_info["position"] = cc.cluster_centers[cluster_id]
        cluster_data.append(cluster_info)

    return Dataset.from_pandas(pd.DataFrame(cluster_data))