src/datasets/dataset_dict.py [1774:1920]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
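        # record the aggregate upload/dataset sizes on the DatasetInfo that is dumped to the repo metadata below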
        info_to_dump.download_checksums = None
        info_to_dump.download_size = total_uploaded_size
        info_to_dump.dataset_size = total_dataset_nbytes
        info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes

        # Check whether the repo already has a README.md and/or a dataset_infos.json so they can be updated with the new split info (size and pattern),
        # and collect old split shards to delete (if any exist)
        repo_with_dataset_card, repo_with_dataset_infos = False, False
        repo_splits: list[str] = []  # use a list to keep the order of the splits
        deletions: list[CommitOperationDelete] = []
        repo_files_to_add = [addition.path_in_repo for addition in additions]
        for repo_file in api.list_repo_tree(
            repo_id=repo_id,
            revision=revision,
            repo_type="dataset",
            token=token,
            recursive=True,
        ):
            if not isinstance(repo_file, RepoFile):
                continue
            if repo_file.rfilename == config.REPOCARD_FILENAME:
                repo_with_dataset_card = True
            elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
                repo_with_dataset_infos = True
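            # shards of the pushed splits that are not part of this upload are scheduled for deletion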
            elif (
                repo_file.rfilename.startswith(tuple(f"{data_dir}/{split}-" for split in self.keys()))
                and repo_file.rfilename not in repo_files_to_add
            ):
                deletions.append(CommitOperationDelete(path_in_repo=repo_file.rfilename))
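            # files matching the legacy shard pattern (uploads made before metadata configs existed) reveal which splits already live in the repo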
            elif fnmatch.fnmatch(
                repo_file.rfilename,
                PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*"),
            ):
                pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED)
                split_pattern_fields = string_to_dict(repo_file.rfilename, pattern)
                assert split_pattern_fields is not None
                repo_split = split_pattern_fields["split"]
                if repo_split not in repo_splits:
                    repo_splits.append(repo_split)

        # get the info from the README to update it
        if repo_with_dataset_card:
            dataset_card_path = api.hf_hub_download(
                repo_id,
                config.REPOCARD_FILENAME,
                repo_type="dataset",
                revision=revision,
            )
            dataset_card = DatasetCard.load(Path(dataset_card_path))
            dataset_card_data = dataset_card.data
            metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
        # the repo only has the deprecated dataset_infos.json; it is downloaded and updated further below
        elif repo_with_dataset_infos:
            dataset_card = None
            dataset_card_data = DatasetCardData()
            metadata_configs = MetadataConfigs()
        else:
            dataset_card = None
            dataset_card_data = DatasetCardData()
            metadata_configs = MetadataConfigs()
        # create the metadata configs if the dataset was uploaded with push_to_hub before metadata configs existed
        if not metadata_configs and repo_splits:
            default_metadata_configs_to_dump = {
                "data_files": [{"split": split, "path": f"data/{split}-*"} for split in repo_splits]
            }
            MetadataConfigs({"default": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data)
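        # declare the data_files glob for this config so the newly uploaded shards can be resolved from the card metadata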
        metadata_config_to_dump = {
            "data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()],
        }
        configs_to_dump = {config_name: metadata_config_to_dump}
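        # optionally mark this config as the default one, moving the flag off the current default config if needed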
        if set_default and config_name != "default":
            if metadata_configs:
                current_default_config_name = metadata_configs.get_default_config_name()
                if current_default_config_name == "default":
                    raise ValueError(
                        "There exists a configuration named 'default'. To set a different configuration as default, "
                        "rename the 'default' one first."
                    )
                if current_default_config_name:
                    _ = metadata_configs[current_default_config_name].pop("default")
                    configs_to_dump[current_default_config_name] = metadata_configs[current_default_config_name]
            metadata_config_to_dump["default"] = True
        # push to the deprecated dataset_infos.json
        if repo_with_dataset_infos:
            dataset_infos_path = api.hf_hub_download(
                repo_id,
                config.DATASETDICT_INFOS_FILENAME,
                repo_type="dataset",
                revision=revision,
            )
            with open(dataset_infos_path, encoding="utf-8") as f:
                dataset_infos: dict = json.load(f)
            dataset_infos[config_name] = asdict(info_to_dump)
            additions.append(
                CommitOperationAdd(
                    path_in_repo=config.DATASETDICT_INFOS_FILENAME,
                    path_or_fileobj=json.dumps(dataset_infos, indent=4).encode("utf-8"),
                )
            )
        # push to README
        DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
        MetadataConfigs(configs_to_dump).to_dataset_card_data(dataset_card_data)
        dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
        additions.append(
            CommitOperationAdd(
                path_in_repo=config.REPOCARD_FILENAME,
                path_or_fileobj=str(dataset_card).encode(),
            )
        )

        commit_message = commit_message if commit_message is not None else "Upload dataset"
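        # push everything in a single commit when the number of operations allows it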
        if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:
            commit_info = api.create_commit(
                repo_id,
                operations=additions + deletions,
                commit_message=commit_message,
                commit_description=commit_description,
                token=token,
                repo_type="dataset",
                revision=revision,
                create_pr=create_pr,
            )
        else:
            logger.info(
                f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits."
            )
            num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT)
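            # additions are chunked across commits; deletions are only included in the first commit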
            for i in range(0, num_commits):
                operations = additions[
                    i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT
                ] + (deletions if i == 0 else [])
                commit_info = api.create_commit(
                    repo_id,
                    operations=operations,
                    commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})",
                    commit_description=commit_description,
                    token=token,
                    repo_type="dataset",
                    revision=revision,
                    create_pr=create_pr,
                )
                logger.info(
                    f"Commit #{i + 1} completed"
                    + (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "")
                    + "."
                )
        return commit_info
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



