def _get_pipeline_subset_info()

in yourbench/utils/dataset_engine.py [0:0]


def _get_pipeline_subset_info(config: dict[str, Any]) -> str:
    """
    Build a markdown bullet list describing the enabled pipeline stages.

    The resulting markdown is intended for the dataset card, documenting
    which processing steps were included in the pipeline.

    A stage counts as enabled when its entry under ``config["pipeline"]`` is a
    dict with a truthy ``"run"`` value. Stages are emitted in the order they
    appear in the configuration. Known stages get a curated description; any
    other stage falls back to its own name with underscores replaced by
    spaces and title-cased.

    Args:
        config: The complete pipeline configuration dictionary. Only its
               ``"pipeline"`` section is read.

    Returns:
        str: One ``- **stage**: description`` line per enabled stage, joined
             by newlines, or an empty string if no stages are enabled or the
             ``"pipeline"`` section is missing, empty or ``None``.
    """

    mapping = {
        "ingestion": "Read raw source documents, convert them to normalized markdown and save for downstream steps",
        "upload_ingest_to_hub": "Package and push ingested markdown dataset to the Hugging Face Hub or save locally with standardized fields",
        "summarization": "Perform hierarchical summarization: chunk-level LLM summaries followed by combine-stage reduction",
        "chunking": "Split texts into token-based single-hop and multi-hop chunks",
        "single_shot_question_generation": "Generate standalone question-answer pairs per chunk using LLM",
        "multi_hop_question_generation": "Generate multi-hop QA pairs requiring reasoning across multiple chunks",
        "lighteval": "Merge QA pairs and chunk metadata into a lighteval compatible dataset for quick model-based scoring",
        "citation_score_filtering": "Compute overlap-based citation scores and filter QA pairs accordingly",
    }

    # `.get("pipeline", {})` would still hand back None when the key exists
    # with a null value (e.g. an empty `pipeline:` section), so normalize any
    # falsy section to an empty dict instead of crashing on `.items()`.
    pipeline = config.get("pipeline") or {}

    lines = []
    for stage, cfg in pipeline.items():
        # Entries that are not dicts (None, bool, ...) carry no "run" flag
        # and are treated as disabled.
        if not isinstance(cfg, dict) or not cfg.get("run"):
            continue
        desc = mapping.get(stage, stage.replace("_", " ").title())
        lines.append(f"- **{stage}**: {desc}")
    return "\n".join(lines)