def _convert_ingested_docs_to_dataset()

in yourbench/pipeline/upload_ingest_to_hub.py [0:0]


def _convert_ingested_docs_to_dataset(ingested_docs: list[IngestedDocument]) -> Dataset:
    """
    Convert a list of ingested markdown documents into a Hugging Face Dataset object.

    Args:
        ingested_docs (list[IngestedDocument]):
            A list of `IngestedDocument` objects to package into the dataset.

    Returns:
        Dataset:
            A Hugging Face Dataset constructed from the provided documents,
            with columns: 'document_id', 'document_text', 'document_filename',
            and 'document_metadata'.
    """
    # Column-oriented mapping expected by Dataset.from_dict
    records = {
        "document_id": [],
        "document_text": [],
        "document_filename": [],
        "document_metadata": [],
    }

    for doc in ingested_docs:
        records["document_id"].append(doc.document_id)
        records["document_text"].append(doc.document_text)
        records["document_filename"].append(doc.document_filename)
        records["document_metadata"].append(doc.document_metadata)

    dataset = Dataset.from_dict(records)
    logger.debug(f"Constructed HF Dataset with {len(dataset)} entries from ingested documents.")
    return dataset
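
A minimal usage sketch follows. The `IngestedDocument` stand-in below is inferred from the column names above; the real dataclass and its import path in yourbench, as well as the `datasets` import, are assumptions for illustration only.

# Usage sketch: build a tiny document list and convert it to a Dataset.
# `IngestedDocument` here is a hypothetical stand-in mirroring the fields
# accessed by _convert_ingested_docs_to_dataset.
from dataclasses import dataclass, field

from datasets import Dataset


@dataclass
class IngestedDocument:
    document_id: str
    document_text: str
    document_filename: str
    document_metadata: dict = field(default_factory=dict)


docs = [
    IngestedDocument(
        document_id="doc-001",
        document_text="# Title\n\nSome ingested markdown...",
        document_filename="example.md",
        document_metadata={"source": "local"},
    )
]

dataset = _convert_ingested_docs_to_dataset(docs)
print(dataset.column_names)  # ['document_id', 'document_text', 'document_filename', 'document_metadata']
print(len(dataset))          # 1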