in yourbench/pipeline/upload_ingest_to_hub.py [0:0]
def _collect_markdown_files(md_file_paths: list[str]) -> list[IngestedDocument]:
    """
    Read each Markdown file and wrap its text in an `IngestedDocument`.

    Args:
        md_file_paths (list[str]):
            Paths of the markdown files to load. Each one is opened as UTF-8 text.

    Returns:
        list[IngestedDocument]:
            One document per file that could be read and whose content is not
            empty after stripping surrounding whitespace, in input order. Every
            document carries a freshly generated UUID4 string as its id, the
            stripped text, the file's base name, and the on-disk size in bytes
            under the `file_size` metadata key.

    Side Effects:
        Logs a warning for each empty file, an error for each file that fails
        to load, and a debug message for each file loaded. A failure on one
        file never interrupts processing of the remaining files.
    """
    documents: list[IngestedDocument] = []

    for file_path in md_file_paths:
        # Everything for one file lives inside the try so that any failure
        # (open, decode, stat, document construction) only skips that file.
        try:
            with open(file_path, "r", encoding="utf-8") as md_file:
                text = md_file.read().strip()

            if text:
                doc_id = str(uuid.uuid4())
                document = IngestedDocument(
                    document_id=doc_id,
                    document_text=text,
                    document_filename=os.path.basename(file_path),
                    # Size is taken from the file on disk, not from the stripped text.
                    document_metadata={"file_size": os.path.getsize(file_path)},
                )
                documents.append(document)
                logger.debug(f"Loaded markdown file: {file_path} (doc_id={doc_id})")
            else:
                # Whitespace-only files count as empty and produce no document.
                logger.warning(f"Skipping empty markdown file: {file_path}")
        except Exception as e:
            logger.error(f"Error reading file '{file_path}'. Skipping. Reason: {str(e)}")

    return documents