def _collect_markdown_files()

in yourbench/pipeline/upload_ingest_to_hub.py [0:0]


def _collect_markdown_files(md_file_paths: list[str]) -> list[IngestedDocument]:
    """
    Read each Markdown file and wrap its contents in an `IngestedDocument`.

    Every path is opened as UTF-8 text and the content is stripped of leading
    and trailing whitespace. Files that are empty after stripping are skipped.
    Any exception raised while handling a single file is logged and that file
    is skipped, so one bad path never aborts the whole batch.

    Args:
        md_file_paths (list[str]):
            Paths (absolute or relative) of the `.md` files to load.

    Returns:
        list[IngestedDocument]:
            One `IngestedDocument` per successfully loaded, non-empty file, in
            the same order as the input paths. Each document gets a freshly
            generated UUID4 string as its `document_id`, the file's base name
            as `document_filename`, and a `file_size` metadata entry.

    Side Effects:
        Logs a warning for each empty file, an error for each file that could
        not be processed, and a debug message for each file that was loaded.
    """
    documents: list[IngestedDocument] = []

    for md_path in md_file_paths:
        try:
            with open(md_path, "r", encoding="utf-8") as stream:
                text = stream.read().strip()

            if text:
                new_id = str(uuid.uuid4())
                record = IngestedDocument(
                    document_id=new_id,
                    document_text=text,
                    document_filename=os.path.basename(md_path),
                    # Size of the file on disk in bytes, not the length of the
                    # stripped text kept in `document_text`.
                    document_metadata={"file_size": os.path.getsize(md_path)},
                )
                documents.append(record)
                logger.debug(f"Loaded markdown file: {md_path} (doc_id={new_id})")
            else:
                logger.warning(f"Skipping empty markdown file: {md_path}")
        except Exception as err:
            # Best-effort ingestion: report the failure and move on to the next file.
            logger.error(f"Error reading file '{md_path}'. Skipping. Reason: {err!s}")

    return documents