in yourbench/utils/dataset_engine.py [0:0]
def _generate_and_upload_dataset_card(config: dict[str, Any], template_path: str | None = None) -> None:
    """
    Internal implementation that generates and uploads a dataset card to Hugging Face Hub.

    This is the core implementation function called by the public
    upload_dataset_card() function. It handles the actual card generation and
    uploading without performing configuration checks.

    The dataset card includes:
        1. Pipeline subset descriptions based on enabled stages
        2. Full sanitized configuration for reproducibility
        3. YourBench version and other metadata
        4. Preserved dataset_info from the existing card for proper configuration display

    Args:
        config: Configuration dictionary containing HF settings.
        template_path: Optional custom template path. When omitted, the
            ``yourbench_card_template.md`` bundled next to this module is used.

    Returns:
        None. Upload is best-effort: any failure is logged, never raised.
    """
    logger.info("Starting dataset card upload process")
    if _is_offline():
        logger.warning("Offline mode enabled. Skipping dataset card upload.")
        return
    try:
        # Resolve where the card goes and how to authenticate.
        settings = _extract_settings(config)
        dataset_repo_name = settings.repo_id
        logger.info(f"Uploading card for dataset: {dataset_repo_name}")

        template_str = _load_card_template(template_path)
        if template_str is None:
            # Missing template was already logged; nothing more to do.
            return

        token = settings.token

        # Preserve the dataset_info section from the existing README (if any)
        # so the Hub keeps rendering subset/split configuration correctly.
        config_data = extract_dataset_info(repo_id=dataset_repo_name, token=token)
        logger.info(f"Extracted dataset_info section, length: {len(config_data) if config_data else 0} characters")

        hf_config = config.get("hf_configuration", {})
        card_data = DatasetCardData(pretty_name=_resolve_pretty_name(hf_config, dataset_repo_name))
        logger.info(f"Created card data with pretty_name: {card_data.pretty_name}")

        template_vars = {
            "pretty_name": card_data.pretty_name,
            "yourbench_version": _get_yourbench_version(),
            "config_yaml": _serialize_config_for_card(config),
            "pipeline_subsets": _get_pipeline_subset_info(config),
            "config_data": config_data,  # Use the extracted dataset_info section
            "footer": hf_config.get("footer", "*(This dataset card was automatically generated by YourBench)*"),
        }
        logger.info("Rendering dataset card from template")
        logger.debug(f"Template variables: {list(template_vars.keys())}")

        # Render card with our template and variables, then publish it.
        card = DatasetCard.from_template(card_data=card_data, template_str=template_str, **template_vars)
        logger.info("Template rendered successfully")
        logger.debug(f"Rendered card content length: {len(str(card))} characters")

        logger.info(f"Pushing dataset card to hub: {dataset_repo_name}")
        card.push_to_hub(dataset_repo_name, token=token)
        logger.success(f"Dataset card successfully uploaded to: https://huggingface.co/datasets/{dataset_repo_name}")
    except Exception as e:
        # Card upload must never break the pipeline: log and swallow.
        logger.error(f"Failed to upload dataset card: {e}")
        logger.exception("Full traceback:")


def _load_card_template(template_path: str | None) -> str | None:
    """Read the card template text; fall back to the bundled template.

    Returns the template contents, or None (after logging an error) when the
    resolved path does not exist.
    """
    if not template_path:
        # Try to find template in utils directory (next to this module).
        current_dir = os.path.dirname(__file__)
        template_path = os.path.join(current_dir, "yourbench_card_template.md")
    logger.info(f"Loading template from: {template_path}")
    if not os.path.exists(template_path):
        logger.error(f"Template file not found: {template_path}")
        return None
    with open(template_path, "r", encoding="utf-8") as f:
        template_str = f.read()
    logger.debug(f"Template loaded successfully, length: {len(template_str)} characters")
    return template_str


def _resolve_pretty_name(hf_config: dict[str, Any], dataset_repo_name: str) -> str:
    """Return the explicitly configured pretty_name, or derive one by
    title-casing the dataset name portion of the repo id."""
    if "pretty_name" in hf_config:
        return hf_config["pretty_name"]
    dataset_name = dataset_repo_name.split("/")[-1]
    return dataset_name.replace("-", " ").replace("_", " ").title()


def _get_yourbench_version() -> str:
    """Return the installed yourbench package version, or 'dev' when the
    package metadata is unavailable (development installs)."""
    from importlib.metadata import PackageNotFoundError, version

    try:
        return version("yourbench")
    except PackageNotFoundError:
        # Fallback for development installs
        return "dev"