in yourbench/pipeline/upload_ingest_to_hub.py [0:0]
def _convert_ingested_docs_to_dataset(ingested_docs: list[IngestedDocument]) -> Dataset:
    """
    Convert a list of ingested markdown documents into a Hugging Face Dataset.

    Args:
        ingested_docs (list[IngestedDocument]):
            List of `IngestedDocument` objects to be packaged into a dataset.
            An empty list yields an empty (zero-row) dataset.

    Returns:
        Dataset:
            A Hugging Face Dataset constructed from the provided documents,
            with columns: 'document_id', 'document_text', 'document_filename',
            and 'document_metadata'.
    """
    # Build each column directly with a comprehension instead of a manual
    # for-loop with repeated append calls — same columns, same row order.
    records = {
        "document_id": [doc.document_id for doc in ingested_docs],
        "document_text": [doc.document_text for doc in ingested_docs],
        "document_filename": [doc.document_filename for doc in ingested_docs],
        "document_metadata": [doc.document_metadata for doc in ingested_docs],
    }
    dataset = Dataset.from_dict(records)
    logger.debug(f"Constructed HF Dataset with {len(dataset)} entries from ingested documents.")
    return dataset