in yourbench/utils/dataset_engine.py [0:0]
def _get_pipeline_subset_info(config: dict[str, Any]) -> str:
"""
Generate a formatted markdown list of enabled pipeline stages with descriptions.
The resulting markdown is used in the dataset card to document
which processing steps were included in the pipeline.
Args:
config: The complete pipeline configuration dictionary containing
the 'pipeline' section with enabled stages
Returns:
str: A markdown-formatted string with bullet points for each enabled pipeline stage,
or an empty string if no stages are enabled
"""
mapping = {
"ingestion": "Read raw source documents, convert them to normalized markdown and save for downstream steps",
"upload_ingest_to_hub": "Package and push ingested markdown dataset to the Hugging Face Hub or save locally with standardized fields",
"summarization": "Perform hierarchical summarization: chunk-level LLM summaries followed by combine-stage reduction",
"chunking": "Split texts into token-based single-hop and multi-hop chunks",
"single_shot_question_generation": "Generate standalone question-answer pairs per chunk using LLM",
"multi_hop_question_generation": "Generate multi-hop QA pairs requiring reasoning across multiple chunks",
"lighteval": "Merge QA pairs and chunk metadata into a lighteval compatible dataset for quick model-based scoring",
"citation_score_filtering": "Compute overlap-based citation scores and filter QA pairs accordingly",
}
pipeline = config.get("pipeline", {})
lines = []
for stage, cfg in pipeline.items():
if isinstance(cfg, dict) and cfg.get("run"):
desc = mapping.get(stage, stage.replace("_", " ").title())
lines.append(f"- **{stage}**: {desc}")
return "\n".join(lines)