in yourbench/utils/dataset_engine.py [0:0]
def _generate_and_upload_dataset_card(config: dict[str, Any], template_path: str | None = None) -> None:
    """
    Internal implementation that generates and uploads a dataset card to Hugging Face Hub.

    This is the core implementation function called by the public
    upload_dataset_card() function. It handles the actual card generation and
    uploading without performing configuration checks.

    The dataset card includes:
        1. Pipeline subset descriptions based on enabled stages
        2. Full sanitized configuration for reproducibility
        3. YourBench version and other metadata
        4. Preserved dataset_info from the existing card for proper configuration display

    Args:
        config: Configuration dictionary containing HF settings.
        template_path: Optional custom template path. When omitted, the
            ``yourbench_card_template.md`` bundled next to this module is used.

    Returns:
        None. Upload is best-effort: any failure is logged, never raised.
    """
    logger.info("Starting dataset card upload process")
    if _is_offline():
        logger.warning("Offline mode enabled. Skipping dataset card upload.")
        return
    try:
        # Resolve where the card goes and how to authenticate.
        settings = _extract_settings(config)
        dataset_repo_name = settings.repo_id
        logger.info(f"Uploading card for dataset: {dataset_repo_name}")

        template_str = _load_card_template(template_path)
        if template_str is None:
            # Missing template was already logged; nothing more to do.
            return

        token = settings.token

        # Preserve the dataset_info section from the existing README (if any)
        # so the Hub keeps rendering subset/split configuration correctly.
        config_data = extract_dataset_info(repo_id=dataset_repo_name, token=token)
        logger.info(f"Extracted dataset_info section, length: {len(config_data) if config_data else 0} characters")

        hf_config = config.get("hf_configuration", {})
        card_data = DatasetCardData(pretty_name=_resolve_pretty_name(hf_config, dataset_repo_name))
        logger.info(f"Created card data with pretty_name: {card_data.pretty_name}")

        template_vars = {
            "pretty_name": card_data.pretty_name,
            "yourbench_version": _get_yourbench_version(),
            "config_yaml": _serialize_config_for_card(config),
            "pipeline_subsets": _get_pipeline_subset_info(config),
            "config_data": config_data,  # Use the extracted dataset_info section
            "footer": hf_config.get("footer", "*(This dataset card was automatically generated by YourBench)*"),
        }
        logger.info("Rendering dataset card from template")
        logger.debug(f"Template variables: {list(template_vars.keys())}")

        # Render card with our template and variables, then publish it.
        card = DatasetCard.from_template(card_data=card_data, template_str=template_str, **template_vars)
        logger.info("Template rendered successfully")
        logger.debug(f"Rendered card content length: {len(str(card))} characters")

        logger.info(f"Pushing dataset card to hub: {dataset_repo_name}")
        card.push_to_hub(dataset_repo_name, token=token)
        logger.success(f"Dataset card successfully uploaded to: https://huggingface.co/datasets/{dataset_repo_name}")
    except Exception as e:
        # Card upload must never break the pipeline: log and swallow.
        logger.error(f"Failed to upload dataset card: {e}")
        logger.exception("Full traceback:")


def _load_card_template(template_path: str | None) -> str | None:
    """Read the card template text; fall back to the bundled template.

    Returns the template contents, or None (after logging an error) when the
    resolved path does not exist.
    """
    if not template_path:
        # Try to find template in utils directory (next to this module).
        current_dir = os.path.dirname(__file__)
        template_path = os.path.join(current_dir, "yourbench_card_template.md")
    logger.info(f"Loading template from: {template_path}")
    if not os.path.exists(template_path):
        logger.error(f"Template file not found: {template_path}")
        return None
    with open(template_path, "r", encoding="utf-8") as f:
        template_str = f.read()
    logger.debug(f"Template loaded successfully, length: {len(template_str)} characters")
    return template_str


def _resolve_pretty_name(hf_config: dict[str, Any], dataset_repo_name: str) -> str:
    """Return the explicitly configured pretty_name, or derive one by
    title-casing the dataset name portion of the repo id."""
    if "pretty_name" in hf_config:
        return hf_config["pretty_name"]
    dataset_name = dataset_repo_name.split("/")[-1]
    return dataset_name.replace("-", " ").replace("_", " ").title()


def _get_yourbench_version() -> str:
    """Return the installed yourbench package version, or 'dev' when the
    package metadata is unavailable (development installs)."""
    from importlib.metadata import PackageNotFoundError, version

    try:
        return version("yourbench")
    except PackageNotFoundError:
        # Fallback for development installs
        return "dev"