def _extract_markdown_from_html()

in yourbench/pipeline/ingestion.py [0:0]


def _extract_markdown_from_html(file_path: str) -> str | None:
    """Convert an HTML file on disk to Markdown via Trafilatura (best-effort).

    The file is read as UTF-8 text and passed to ``trafilatura.extract`` with
    Markdown output requested. Any exception raised while reading the file or
    running the extraction is logged and swallowed, so this function never
    raises to its caller.

    Args:
        file_path: Path of the HTML file to convert.

    Returns:
        The extracted Markdown string, or ``None`` when Trafilatura returns a
        falsy result (``None`` or an empty string) or when any error occurs
        (including a file that cannot be opened or decoded as UTF-8).
    """
    logger.debug(f"Attempting to extract Markdown from HTML file: {file_path} using Trafilatura.")
    try:
        with open(file_path, encoding="utf-8") as html_file:
            raw_html = html_file.read()

        markdown = trafilatura.extract(
            raw_html,
            # Ask Trafilatura for Markdown directly instead of plain text.
            output_format="markdown",
            # Per the Trafilatura docs this flag governs comment *sections*
            # extracted alongside the main text; they are left out here.
            include_comments=False,
            # Keep tabular content in the output where Trafilatura finds it.
            include_tables=True,
        )

        # Guard clause: a falsy result means there is nothing usable to return.
        if not markdown:
            logger.warning(f"Trafilatura returned no content for HTML file '{file_path}'.")
            return None

        logger.info(f"Successfully extracted Markdown from '{file_path}' using Trafilatura.")
        return markdown
    except Exception as e:
        # Deliberately broad: one unreadable or malformed file is reported and
        # signalled with None rather than propagating the exception.
        logger.error(f"Error using Trafilatura for HTML file '{file_path}': {e}. Skipping Trafilatura for this file.")
        return None