def build_hf_data_files()

in run_pipeline.py [0:0]


def build_hf_data_files(cc):
    """
    Assemble an HF dataset with one row per text held by ``cc``.

    The resulting columns are:

    - ``X`` / ``Y``: columns 0 and 1 of ``cc.projections``.
    - ``labels``: ``cc.cluster_labels``, i.e. the cluster of each row.
    - ``content_display``: each entry of ``cc.texts`` cut down to its first
      1024 characters and re-wrapped to a width of 64 for display.

    Returns whatever ``Dataset.from_pandas`` produces for that table.
    """
    x_coords = cc.projections[:, 0]
    y_coords = cc.projections[:, 1]
    cluster_ids = cc.cluster_labels

    # Truncate before wrapping so only the leading 1024 characters are kept.
    previews = [textwrap.fill(text[:1024], width=64) for text in cc.texts]

    # Insertion order of this dict fixes the column order of the frame.
    columns = {
        "X": x_coords,
        "Y": y_coords,
        "labels": cluster_ids,
        "content_display": previews,
    }
    frame = pd.DataFrame(data=columns)
    return Dataset.from_pandas(frame)