in yourbench/pipeline/ingestion.py [0:0]
def _get_markdown_content(file_path: str, markdown_processor: MarkItDown) -> str | None:
    """
    Produce Markdown text for a source document, picking the method by file extension.

    The extension is compared case-insensitively:
      - ``.md``: the file is read as UTF-8 text and returned unchanged.
      - ``.html`` / ``.htm``: ``_extract_markdown_from_html`` is tried first; if it
        returns ``None``, the file is converted with ``markdown_processor`` instead.
      - anything else: the file is converted with ``markdown_processor``.

    Args:
        file_path (str): The path to the source document.
        markdown_processor (MarkItDown): MarkItDown instance used for conversions.

    Returns:
        str | None: The Markdown text. No branch returns ``None`` explicitly; a
        ``None`` can only come from the ``text_content`` of the MarkItDown result.

    Logs:
        - Info naming the processing method chosen for the file.
        - Warning when the HTML extractor returns ``None`` and MarkItDown is used
          as the fallback.

    Note:
        Exceptions raised while reading or converting are not caught here.
    """
    _, raw_ext = os.path.splitext(file_path)
    suffix = raw_ext.lower()

    # Markdown sources need no conversion: read them verbatim as UTF-8.
    if suffix == ".md":
        with open(file_path, encoding="utf-8") as handle:
            markdown_text = handle.read()
        logger.info(f"File '{file_path}' is already Markdown. Content read directly.")
        return markdown_text

    # Anything that is neither Markdown nor HTML goes straight to MarkItDown.
    if suffix not in (".html", ".htm"):
        logger.info(f"Converting non-HTML/Markdown file '{file_path}' using MarkItDown.")
        return markdown_processor.convert(file_path).text_content

    # HTML: prefer the dedicated extractor, keep its result unless it is None.
    logger.info(f"Processing HTML file: {file_path} with Trafilatura.")
    extracted = _extract_markdown_from_html(file_path)
    if extracted is not None:
        return extracted

    # Only a None result triggers the fallback; an empty string is returned as-is above.
    logger.warning(
        f"Trafilatura processing failed or yielded no content for HTML '{file_path}'. "
        "Falling back to MarkItDown for this file."
    )
    return markdown_processor.convert(file_path).text_content