in run_pipeline.py [0:0]
def build_hf_data_files(cc):
    """
    Assemble an HF dataset with one row per text held by ``cc``.

    Columns produced:
      * ``X`` / ``Y``         -- columns 0 and 1 of ``cc.projections``
      * ``labels``          -- ``cc.cluster_labels``
      * ``content_display`` -- each text cut to its first 1024 characters
                               and re-wrapped to lines of at most 64 columns

    Returns whatever ``Dataset.from_pandas`` yields for that table.
    """
    x_coords = cc.projections[:, 0]
    y_coords = cc.projections[:, 1]
    cluster_ids = cc.cluster_labels

    # Shorten and wrap the raw text so it stays readable when displayed.
    display_texts = []
    for raw_text in cc.texts:
        snippet = raw_text[:1024]
        display_texts.append(textwrap.fill(snippet, 64))

    columns = {
        "X": x_coords,
        "Y": y_coords,
        "labels": cluster_ids,
        "content_display": display_texts,
    }
    frame = pd.DataFrame(data=columns)
    return Dataset.from_pandas(frame)