in yourbench/pipeline/ingestion.py [0:0]
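# Imports assumed by this excerpt (the module header is not shown above); the
# brace-style log calls suggest loguru, and `_extract_model_roles` /
# `_extract_model_list` are helpers defined elsewhere in this module.
from typing import Any

from huggingface_hub import InferenceClient
from loguru import logger
from markitdown import MarkItDown
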
def _initialize_markdown_processor(config: dict[str, Any]) -> MarkItDown:
"""
Initialize a MarkItDown processor with optional LLM support for advanced conversion.
This function looks up model details under `config["model_roles"]["ingestion"]`
and `config["model_list"]` to see if an LLM is defined for ingestion tasks.
If no suitable model is found or if necessary libraries are missing, a standard
MarkItDown instance is returned without LLM augmentation.
Args:
config (dict[str, Any]): Global pipeline configuration dictionary.
Returns:
MarkItDown: A MarkItDown instance, possibly configured with an LLM client.
Logs:
- Warnings if an LLM model is specified but cannot be initialized.
- Info about which model (if any) is used for ingestion.
"""
    try:
        # Extract typed configurations from the dictionary
        model_roles = _extract_model_roles(config)
        model_list = _extract_model_list(config)

        if not model_roles.ingestion or not model_list:
            logger.info("No LLM ingestion config found. Using default MarkItDown processor.")
            return MarkItDown()

        # Attempt to match the first model in model_list that appears in model_roles.ingestion
        matched_model = next((m for m in model_list if m.model_name in model_roles.ingestion), None)
        if not matched_model:
            logger.info(
                "No matching LLM model found for roles: {}. Using default MarkItDown.",
                model_roles.ingestion,
            )
            return MarkItDown()

        logger.info(
            "Initializing MarkItDown with LLM support: model='{}'.",
            matched_model.model_name,
        )
        # Construct an InferenceClient to use in place of the OpenAI client
        llm_client = InferenceClient(
            base_url=matched_model.base_url,
            api_key=matched_model.api_key,
            provider=matched_model.provider,
        )
        return MarkItDown(llm_client=llm_client, llm_model=matched_model.model_name)
    except Exception as exc:
        logger.warning("Failed to initialize MarkItDown with LLM support: {}", str(exc))
        return MarkItDown()
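

# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of the configuration shape this helper expects, inferred from
# the lookups above: `model_roles.ingestion` lists model names and `model_list`
# holds connection details. The model name, token, and provider values below are
# placeholders, and any field not read in the function body is an assumption.
if __name__ == "__main__":
    example_config = {
        "model_roles": {"ingestion": ["my-org/my-vision-model"]},
        "model_list": [
            {
                "model_name": "my-org/my-vision-model",
                "base_url": None,      # fall back to the default inference endpoint
                "api_key": "hf_xxx",   # placeholder token
                "provider": "auto",
            }
        ],
    }
    processor = _initialize_markdown_processor(example_config)
    # With a matching model, `processor.convert(...)` runs with LLM assistance
    # (e.g. image descriptions); otherwise it behaves like a plain MarkItDown.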