in misc/reference_datasets/multilingual/download_hplt.py [0:0]
def run(self, data=None, rank: int = 0, world_size: int = 1):
    """
    Yield upstream data, then the documents read from this rank's shard of the HPLT file list.

    The file list is read from ``hplt_monolingual_map_cleaned_1.2.txt`` inside
    ``self.data_folder`` (one URL per line). The common download prefix is removed from each
    entry and blank lines are ignored. Entries are then distributed round-robin: rank ``r``
    takes entries ``r, r + world_size, r + 2 * world_size, ...``.

    Args:
        data: any existing data from previous pipeline stages; yielded first, unchanged
        rank: rank of the current task
        world_size: total number of tasks

    Yields:
        every item of ``data`` (if any), followed by each document produced by
        ``self.read_files_shard`` for this rank's files.

    Raises:
        RuntimeError: when rank 0 has no files to read (raised once the generator is iterated).
            Other ranks only log a warning and yield nothing from disk.
    """
    from loguru import logger
    import random

    if data:
        yield from data

    url_prefix = "https://data.hplt-project.org/one/monotext/cleaned/"
    with self.data_folder.open("hplt_monolingual_map_cleaned_1.2.txt", "rt") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) are not files: drop them
        # so they neither take up a shard slot nor reach the reader as an empty path.
        stripped_lines = (line.strip() for line in f.read().splitlines())
        files = [path.removeprefix(url_prefix) for path in stripped_lines if path]

    files_shard = files[rank::world_size]
    if not files_shard:
        # Rank 0 always receives the first entry of a non-empty list, so an empty shard
        # here means the listing contained no files at all.
        if rank == 0:
            raise RuntimeError(f"No files found on {self.data_folder.path}!")
        # otherwise just a warning
        logger.warning(f"No files found on {self.data_folder.path} for {rank=}")

    if self.shuffle_files:
        random.shuffle(files_shard)

    for doc in self.read_files_shard(files_shard):
        self.update_doc_stats(doc)
        yield doc