# example/configs/advanced_example.yaml
# =====================
# ADVANCED EXAMPLE CONFIG
# =====================
# This configuration file retains the same functionality as the simple example,
# but includes detailed comments to illustrate each parameter's purpose and
# potential trade-offs in more complex usage scenarios.
# ==============================================================================
# HUGGING FACE SETTINGS (DATASET & AUTH)
# ==============================================================================
hf_configuration:
# ----------------------------------------------------------
# 'token': Your personal access token from HF Hub.
# Provide it directly or via an environment variable, e.g.:
# export HF_TOKEN="hf_xxxxxxx"
# This is necessary if pushing private datasets to the Hub.
token: $HF_TOKEN
# ----------------------------------------------------------
# 'hf_organization': The user/organization namespace on Hugging Face Hub
# under which the dataset will be created. If this is not set (or the
# referenced environment variable is undefined), YourBench tries to infer
# your HF username instead.
hf_organization: $HF_ORGANIZATION
# ----------------------------------------------------------
# 'private': Determines whether the newly created HF dataset is private.
# true => Only you and authorized collaborators can see it
# false => Publicly viewable by anyone
private: false
# ----------------------------------------------------------
# 'hf_dataset_name': The name of the dataset repository on Hugging Face Hub.
# If you're part of an organization, this might become ORG_NAME/DATASET_NAME.
# For instance, "myuser/yourbench_advanced_example"
hf_dataset_name: yourbench_advanced_example
# ----------------------------------------------------------
# 'concat_if_exist': If true and a dataset with the same subset name already
# exists, newly generated entries get appended to it. If false, an existing
# dataset with the same name is overwritten. This is useful if you want to
# iteratively build a larger dataset rather than starting fresh each time.
concat_if_exist: false
# ----------------------------------------------------------
# 'upload_card': Whether to automatically generate and upload README dataset cards
# to the Hugging Face Hub. The cards are generated using a Jinja2 template and
# include dataset metadata like number of rows, columns, data types, etc.
# true => Automatically upload dataset cards (recommended)
# false => Skip dataset card uploading
upload_card: true
# ----------------------------------------------------------
# 'local_dataset_dir' and 'local_saving': (optional)
# local_dataset_dir: path for saving the dataset locally
# local_saving: whether to also store the dataset locally in addition to pushing
# By default, we push to the HF Hub only, but if you want local backups,
# specify something like:
#
# local_dataset_dir: /some/path
# local_saving: true
#
# and the dataset will be saved to disk as well.
# ==============================================================================
# MODEL CONFIGURATION
# ==============================================================================
# Here, you can declare multiple models which may be used at different stages
# of the pipeline (e.g., ingestion, summarization, chunking, question generation).
# Each entry can specify where the model is hosted, concurrency limits, etc.
model_list:
# 1) Using an HF Inference provider (e.g., Fireworks) to handle LLM inference:
- model_name: Qwen/Qwen3-30B-A3B
provider: fireworks-ai
# base_url, api_key not specified => uses default huggingface_hub credentials
# max_concurrent_requests => limit parallel requests. If your model or
# plan is cost-limited, you can lower concurrency to be safer.
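# For instance, to cap parallelism for this entry (illustrative value only,
# not part of the original config):
# max_concurrent_requests: 16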
# 2) Using a local or self-hosted vLLM server that implements an OpenAI-compatible API:
- model_name: Qwen/Qwen3-4B
base_url: http://localhost:8000/v1
api_key: $VLLM_API_KEY
max_concurrent_requests: 128
# This is good if you want to batch many requests and your system can handle it.
# Note: if your local endpoint does not use /v1, you need to specify the full URL.
# For example, if your Open WebUI endpoint uses /api, specify /api/chat/completions.
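# A sketch of such an entry (hypothetical model name, URL, and key variable,
# purely illustrative):
# - model_name: my-local-model
#   base_url: http://localhost:3000/api/chat/completions
#   api_key: $OPENWEBUI_API_KEY
#   max_concurrent_requests: 16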
# 3) Using an OpenAI model:
- model_name: gpt-4.1
base_url: https://api.openai.com/v1
api_key: $OPENAI_API_KEY
max_concurrent_requests: 8
# If rate-limited, keep concurrency low or you risk hitting the API cap.
# 4) Using a Gemini model via Google's OpenAI-compatible endpoint:
- model_name: gemini-2.5-flash-preview
base_url: https://generativelanguage.googleapis.com/v1beta/openai/
api_key: $GEMINI_API_KEY
# Adjust concurrency similarly if needed.
# ==============================================================================
# MODEL ROLES
# ==============================================================================
# This section lets you define which models (by 'model_name' above) are used in
# each stage. For instance, you might want a large model for summarization but a
# smaller model for chunking, or multiple models for question generation, etc.
model_roles:
# 'ingestion': typically for converting multi-format docs into text.
# Might need a vision-capable model for PDFs with images, etc.
ingestion:
- Qwen/Qwen3-30B-A3B
# 'summarization': the model(s) used to create text summaries
summarization:
- Qwen/Qwen3-30B-A3B
# - gemini-2.5-flash-preview
# 'chunking': often uses a smaller embedding model to do semantic chunking
chunking:
- intfloat/multilingual-e5-large-instruct
# 'single_shot_question_generation': used for generating straightforward
# "single-hop" questions from each chunk. Could specify multiple models
# if you want an ensemble approach for variety. Each model's outputs
# are aggregated by the pipeline.
single_shot_question_generation:
- Qwen/Qwen3-30B-A3B
# - gpt-4.1
# - gemini-2.5-flash-preview
# 'multi_hop_question_generation': used for advanced question generation
# that requires referencing multiple chunks simultaneously.
multi_hop_question_generation:
- Qwen/Qwen3-30B-A3B
# - gpt-4.1
# - gemini-2.5-flash-preview
# ==============================================================================
# PIPELINE STAGES
# ==============================================================================
# Each of the following keys corresponds to a pipeline "stage." Setting "run: true"
# will execute that stage, while "run: false" will skip it.
# For each stage, you can define additional parameters that fine-tune behavior.
pipeline:
# 1) INGESTION
# Reads raw documents from disk, attempts to convert them to text, normalizes
# them into standard markdown. Possibly uses an LLM for advanced parsing (images, etc.).
ingestion:
run: true
source_documents_dir: example/data/raw # Where your raw PDFs, HTML, etc. are
output_dir: example/data/processed # Where to store the post-processed .md files
# 2) UPLOAD_INGEST_TO_HUB
# Takes the .md documents from 'ingestion' output_dir, packages them into
# a Hugging Face dataset, and (optionally) pushes to the Hub.
upload_ingest_to_hub:
run: true
source_documents_dir: example/data/processed
# If the directory is empty or missing, it will fail. Make sure your
# ingestion stage successfully produced content.
# 3) SUMMARIZATION
# Summarizes each document, typically in a hierarchical manner. This stage
# can be helpful if your docs are extremely long. Summaries can provide
# global context that helps with question generation or multi-chunk tasks.
summarization:
run: true
# 'max_tokens' => The approximate max tokens per chunk. If your model has
# a 16k token limit, you might set something around 12k to be safe.
max_tokens: 16384
# If you have a 100k context window, you could increase this drastically.
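# e.g., for a large-context model (illustrative value, not part of the original config):
# max_tokens: 65536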
# 'token_overlap' => Overlap between adjacent splits. A larger overlap
# yields more contextual continuity but also higher token usage.
token_overlap: 128
# 'encoding_name' => The tokenizer name. "cl100k_base" is typically used
# by GPT-4/3.5 and some Qwen models; "gpt2" is for older GPT-2-like models.
encoding_name: cl100k_base
# 4) CHUNKING
# Splits documents into smaller segments. Supports simple "fast_chunking" (token-based)
# or more complex "semantic_chunking" which requires sentence embeddings to decide
# boundaries based on similarity. The multi-hop question generation stage relies on
# these chunk groupings.
chunking:
run: true
chunking_configuration:
# chunking_mode: fast_chunking # "fast_chunking" or "semantic_chunking"
# WARNING: SEMANTIC CHUNKING IS TEMPORARILY REMOVED AFTER v0.3.1.
l_max_tokens: 128 # Each chunk’s maximum token length
token_overlap: 0 # Overlap between chunks in token-based splitting
encoding_name: cl100k_base # Tokenizer name for measuring token lengths
# If using 'semantic_chunking':
# l_min_tokens: 64
# tau_threshold: 0.8
# The chunker merges or splits based on semantic similarity (cosine).
#
# For multi-hop chunk creation:
# h_min: 2 # Minimum subchunks in a multi-hop combination
# h_max: 5 # Maximum subchunks in a multi-hop combination
# num_multihops_factor: 5
#
# Tweak these to produce larger or fewer multi-hop combinations, or to tighten
# the chunk boundaries for more finely grained questions.
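# As a sketch, with the default token-based chunking, larger chunks with a
# small overlap would look like (illustrative values only):
# l_max_tokens: 256
# token_overlap: 32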
# 5) SINGLE_SHOT_QUESTION_GENERATION
# Generates single-hop question-answer pairs from each chunk. Typically simpler
# factual or direct queries. The model is guided to produce "estimated_difficulty",
# "citations," etc.
single_shot_question_generation:
run: true
# You can add global instructions guiding question style, difficulty, domain specificity:
# additional_instructions: "Generate factual, short-answer questions at a college level."
# Control the format of generated questions:
# question_mode: "open-ended" # "open-ended" (default): model generates the answer to the question
# # "multi-choice": model creates options (A), (B), (C), (D) and selects the correct one
# If your documents are large, you can randomly sample chunks to reduce cost:
# chunk_sampling:
# mode: "count" # "count" or "percentage" or "all"
# value: 5 # e.g. pick 5 random chunks per doc
# random_seed: 123
# 6) MULTI_HOP_QUESTION_GENERATION
# Generates complex questions that require reasoning across multiple chunks
# of a single document — or across different documents if cross-document
# is enabled. Useful for evaluating deeper understanding and integration.
multi_hop_question_generation:
run: true
# additional_instructions: "Try to integrate multiple pieces of evidence across chunks to create deeper questions."
# Control the format of generated questions:
# question_mode: "open-ended" # "open-ended" (default): model generates the answer to the question
# # "multi-choice": model creates options (A), (B), (C), (D) and selects the correct one
# Cross-document question generation:
# If enabled, questions can be generated from chunks taken across different documents, not just within one.
# cross_document:
# enable: true # Whether to generate cross-document questions (default: false)
# max_combinations: 5 # Max number of cross-document chunk combinations to use
# chunks_per_document: 1 # Number of chunks to sample from each document
# Similarly, you can sample the multi-hop chunks to cut down on inference:
# chunk_sampling:
# mode: "percentage"
# value: 0.3 # e.g. take 30% of multi-hop chunk combos
# random_seed: 42
# 7) LIGHTEVAL
# Combines the single-hop and multi-hop question sets into a unified “lighteval”
# subset, optionally adding chunk context and original document text. Useful for
# quick or “lightweight” downstream evaluations.
lighteval:
run: true
# 8) CITATION_SCORE_FILTERING
# Finally, runs fuzzy-match scoring on each question’s “citations” vs. the
# chunk text to see if citations are truly found in the source. Adds columns
# like 'citation_score' to the final dataset, letting you filter out or down-weight
# QA pairs with poor grounding.
citation_score_filtering:
run: true
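#
# To execute this pipeline with this configuration, a typical invocation
# (assumed form; check the YourBench docs for your installed version's exact CLI) is:
# yourbench run --config example/configs/advanced_example.yaml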