in yourbench/pipeline/ingestion.py [0:0]
def _extract_markdown_from_html(file_path: str) -> str | None:
    """Convert the HTML file at ``file_path`` to Markdown with Trafilatura.

    The file is read as UTF-8 text and handed to ``trafilatura.extract``
    with Markdown output requested, tables enabled and comments disabled.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The extracted Markdown string, or ``None`` when Trafilatura yields
        an empty/falsy result or when any exception is raised while reading
        or extracting. Failures are logged rather than propagated, so the
        caller can treat ``None`` as "Trafilatura was not usable here".
    """
    logger.debug(f"Attempting to extract Markdown from HTML file: {file_path} using Trafilatura.")
    try:
        with open(file_path, "r", encoding="utf-8") as html_file:
            raw_html = html_file.read()

        # Requesting the "markdown" output format makes Trafilatura emit
        # Markdown directly, so no separate conversion step is needed.
        markdown_text = trafilatura.extract(
            raw_html,
            include_tables=True,  # keep table data when it can be extracted
            include_comments=False,  # leave comments out of the output
            output_format="markdown",
        )

        # Guard clause: a falsy result (None or empty string) means nothing
        # usable was extracted.
        if not markdown_text:
            logger.warning(f"Trafilatura returned no content for HTML file '{file_path}'.")
            return None

        logger.info(f"Successfully extracted Markdown from '{file_path}' using Trafilatura.")
        return markdown_text
    except Exception as e:
        # Best-effort step: any read or extraction failure is reported and
        # converted into a None result instead of being raised.
        logger.error(f"Error using Trafilatura for HTML file '{file_path}': {e}. Skipping Trafilatura for this file.")
        return None