def _initialize_markdown_processor()

in yourbench/pipeline/ingestion.py
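
The excerpt omits the module's imports. Based on the APIs it calls (loguru-style `{}` log placeholders, MarkItDown, and Hugging Face's InferenceClient), they would be approximately the following; the private helpers `_extract_model_roles` and `_extract_model_list` are presumably defined elsewhere in the same module and are not reproduced here:

from typing import Any

from huggingface_hub import InferenceClient
from loguru import logger
from markitdown import MarkItDown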


def _initialize_markdown_processor(config: dict[str, Any]) -> MarkItDown:
    """
    Initialize a MarkItDown processor with optional LLM support for advanced conversion.

    This function looks up model details under `config["model_roles"]["ingestion"]`
    and `config["model_list"]` to see if an LLM is defined for ingestion tasks.
    If no suitable model is found, or if the LLM client cannot be constructed for
    any reason (e.g. a missing dependency), a standard
    MarkItDown instance is returned without LLM augmentation.

    Args:
        config (dict[str, Any]): Global pipeline configuration dictionary.

    Returns:
        MarkItDown: A MarkItDown instance, possibly configured with an LLM client.

    Logs:
        - Warnings if an LLM model is specified but cannot be initialized.
        - Info about which model (if any) is used for ingestion.
    """
    try:
        # Extract typed configurations from the dictionary
        model_roles = _extract_model_roles(config)
        model_list = _extract_model_list(config)

        if not model_roles.ingestion or not model_list:
            logger.info("No LLM ingestion config found. Using default MarkItDown processor.")
            return MarkItDown()

        # Select the first model in model_list whose model_name appears in model_roles.ingestion
        matched_model = next((m for m in model_list if m.model_name in model_roles.ingestion), None)

        if not matched_model:
            logger.info(
                "No matching LLM model found for roles: {}. Using default MarkItDown.",
                model_roles.ingestion,
            )
            return MarkItDown()

        logger.info(
            "Initializing MarkItDown with LLM support: model='{}'.",
            matched_model.model_name,
        )

        # Construct an InferenceClient to act as an OpenAI-compatible client for MarkItDown
        llm_client = InferenceClient(
            base_url=matched_model.base_url,
            api_key=matched_model.api_key,
            provider=matched_model.provider,
        )

        return MarkItDown(llm_client=llm_client, llm_model=matched_model.model_name)
    except Exception as exc:
        logger.warning("Failed to initialize MarkItDown with LLM support: {}", str(exc))
        return MarkItDown()
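
For reference, here is a minimal sketch of a config that would activate the LLM path. The key names inside each `model_list` entry (`model_name`, `base_url`, `api_key`, `provider`) mirror the attributes accessed above; the exact raw-dict layout that `_extract_model_roles`/`_extract_model_list` expect is an assumption here, not a documented schema:

# Hypothetical config; entry keys are inferred from the attribute
# accesses in the function above, not from a documented schema.
config = {
    "model_roles": {
        "ingestion": ["my-vision-model"],  # model names eligible for ingestion
    },
    "model_list": [
        {
            "model_name": "my-vision-model",
            "base_url": "https://api.example.com/v1",
            "api_key": "hf_xxx",
            "provider": None,
        }
    ],
}

processor = _initialize_markdown_processor(config)
result = processor.convert("document.pdf")  # MarkItDown's standard convert() entry point

Because every failure path falls back to a plain MarkItDown(), ingestion never hard-fails on LLM misconfiguration; it simply degrades to the default, non-LLM conversion path.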