in src/datasets/builder.py [0:0]
def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):
    """Download the source data and prepare every split of the dataset.

    This is the internal implementation, meant to be overridden, that runs
    when the user calls `download_and_prepare`. It asks `_split_generators`
    for the splits to build, prepares each one with `_prepare_split`, runs
    the verifications selected by `verification_mode`, and finally records
    the resulting splits and download size on `self.info`.

    Args:
        dl_manager ([`DownloadManager`]):
            `DownloadManager` used to download and cache data.
        verification_mode ([`VerificationMode`]):
            if `ALL_CHECKS`, verify the splits and, when
            `dl_manager.record_checksums` is set, the download checksums.
            if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
            if `NO_CHECKS`, do not perform any verification.
        **prepare_split_kwargs: Additional options, such as `file_format`,
            `max_shard_size`. They are forwarded to `_prepare_split` and are
            also handed to `_make_split_generators_kwargs` to derive the
            keyword arguments for `_split_generators`.

    Raises:
        ValueError: If a split generator names its split `all`
            (compared case-insensitively).
        OSError: If `_prepare_split` raises an `OSError`. It is re-raised with
            the manual download instructions (if any) and the original
            error text in the message.
        DuplicatedKeysError: If `_prepare_split` raises it. It is re-raised
            with a message pointing at this dataset's splits.
    """
    # Generating data for all splits
    split_dict = SplitDict(dataset_name=self.dataset_name)
    split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
    split_generators = self._split_generators(dl_manager, **split_generators_kwargs)

    # Checksums verification: only when all checks are requested and the
    # download manager was recording checksums.
    if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:
        verify_checksums(
            self.info.download_checksums, dl_manager.get_recorded_sizes_checksums(), "dataset source files"
        )

    # Build splits
    for split_generator in split_generators:
        split_info = split_generator.split_info
        if str(split_info.name).lower() == "all":
            raise ValueError(
                "`all` is a special split keyword corresponding to the "
                "union of all splits, so cannot be used as key in "
                "._split_generators()."
            )

        logger.info(f"Generating {split_info.name} split")
        split_dict.add(split_info)

        try:
            # Prepare split will record examples associated to the split
            self._prepare_split(split_generator, **prepare_split_kwargs)
        except OSError as e:
            # `from None` hides the original traceback on purpose; the
            # original error text is kept inside the new message instead.
            raise OSError(
                "Cannot find data file. "
                + (self.manual_download_instructions or "")
                + "\nOriginal error:\n"
                + str(e)
            ) from None
        except DuplicatedKeysError as e:
            # Re-raise with a fix message that names this dataset.
            raise DuplicatedKeysError(
                e.key,
                e.duplicate_key_indices,
                fix_msg=f"To avoid duplicate keys, please fix the dataset splits for {self.name}",
            ) from None
        # Runs after each successfully prepared split.
        dl_manager.manage_extracted_files()

    # Split verification applies to both BASIC_CHECKS and ALL_CHECKS.
    if verification_mode in (VerificationMode.BASIC_CHECKS, VerificationMode.ALL_CHECKS):
        verify_splits(self.info.splits, split_dict)

    # Update the info object with the splits.
    self.info.splits = split_dict
    self.info.download_size = dl_manager.downloaded_size