in src/datatrove/pipeline/writers/huggingface.py [0:0]
def close(self, rank: int = 0):
    """Close the writer, upload the files that were still open, and commit.

    The names of the currently open output files are collected first, then
    the parent ``close()`` runs, then those files are passed to
    ``self.upload_files``. Finally ``self.operations`` is committed to
    ``self.dataset`` at ``self.revision`` with ``create_commit``.

    When the Hub rejects the commit because another commit landed in the
    meantime ("A commit has happened since"), the commit is retried with
    exponential backoff plus random jitter, up to ``MAX_RETRIES`` retries.

    Args:
        rank: not used by this method; kept for interface compatibility.

    Raises:
        HfHubHTTPError: for any HTTP error that is not the commit race
            condition, or when the race condition persists after
            ``MAX_RETRIES`` retries.
    """
    # Snapshot the names before calling the parent close().
    # NOTE(review): presumably super().close() releases/clears the open
    # files, which is why the list is captured first -- confirm.
    filelist = list(self.output_mg.get_open_files().keys())
    super().close()
    if filelist:
        logger.info(f"Starting upload of {len(filelist)} files to {self.dataset}")
        self.upload_files(*filelist)

    retries = 0
    while True:
        try:
            create_commit(
                self.dataset,
                repo_type="dataset",
                operations=self.operations,
                commit_message=f"DataTrove upload ({len(self.operations)} files)",
                revision=self.revision,
            )
            break
        except HfHubHTTPError as e:
            # server_message can be None when the response carries no
            # server-side message; normalise it so the substring test below
            # cannot raise TypeError and hide the real HTTP error.
            server_message = e.server_message or ""
            if "A commit has happened since" not in server_message:
                # Not the concurrent-commit race: propagate untouched.
                raise
            if retries >= MAX_RETRIES:
                logger.error(f"Failed to create commit after {MAX_RETRIES=}. Giving up.")
                raise
            logger.info("Commit creation race condition issue. Waiting...")
            # Exponential backoff with jitter so concurrent writers do not
            # all retry at the same instant.
            time.sleep(BASE_DELAY * 2**retries + random.uniform(0, 2))
            retries += 1