def _get_markdown_content()

in yourbench/pipeline/ingestion.py [0:0]


def _get_markdown_content(file_path: str, markdown_processor: MarkItDown) -> str | None:
    """
    Produce Markdown text for a document, picking a strategy from its extension.

    The extension is compared case-insensitively:
        - ``.md``: the file is read as UTF-8 and returned unchanged.
        - ``.html`` / ``.htm``: ``_extract_markdown_from_html`` is tried first;
          if it returns ``None`` the file is handed to ``markdown_processor``.
        - anything else: converted with ``markdown_processor``.

    Args:
        file_path (str): The path to the source document.
        markdown_processor (MarkItDown): Converter used for non-Markdown files
            and as the HTML fallback.

    Returns:
        str | None: The Markdown content. This function does not catch
        exceptions itself, so read/conversion errors propagate to the caller.

    Logs:
        - Info naming the processing route taken for the file.
        - Warning when the HTML extractor returns None and the fallback is used.
    """
    _, extension = os.path.splitext(file_path)
    extension = extension.lower()

    # Markdown sources need no conversion: read them verbatim as UTF-8.
    if extension == ".md":
        with open(file_path, "r", encoding="utf-8") as md_file:
            markdown_text = md_file.read()
        logger.info(f"File '{file_path}' is already Markdown. Content read directly.")
        return markdown_text

    # Everything that is neither Markdown nor HTML goes straight to MarkItDown.
    if extension not in (".html", ".htm"):
        logger.info(f"Converting non-HTML/Markdown file '{file_path}' using MarkItDown.")
        return markdown_processor.convert(file_path).text_content

    # HTML: prefer the dedicated extractor, keep MarkItDown as the safety net.
    logger.info(f"Processing HTML file: {file_path} with Trafilatura.")
    markdown_text = _extract_markdown_from_html(file_path)
    if markdown_text is not None:
        return markdown_text

    logger.warning(
        f"Trafilatura processing failed or yielded no content for HTML '{file_path}'. "
        "Falling back to MarkItDown for this file."
    )
    return markdown_processor.convert(file_path).text_content