def run()

in misc/reference_datasets/multilingual/download_hplt.py [0:0]


    def run(self, data=None, rank: int = 0, world_size: int = 1):
        """
        Read this rank's shard of the HPLT file list and yield its documents.

        Anything received in ``data`` is yielded first, unchanged. The file list is
        then read from ``hplt_monolingual_map_cleaned_1.2.txt`` inside
        ``self.data_folder``: each non-blank line has the HPLT download URL prefix
        removed, and the resulting entries are distributed round-robin across tasks
        (``files[rank::world_size]``). The shard is finally read through
        ``self.read_files_shard``.

        Args:
            data: any existing data from previous pipeline stages
            rank: rank of the current task
            world_size: total number of tasks

        Yields:
            The items of ``data`` (if any), followed by every document produced by
            ``self.read_files_shard`` for this rank's files, after it has been passed
            to ``self.update_doc_stats``.

        Raises:
            RuntimeError: if rank 0 ends up with an empty shard. Because this is a
                generator, the error surfaces when it is iterated, not when called.
        """
        from loguru import logger
        import random

        url_prefix = "https://data.hplt-project.org/one/monotext/cleaned/"

        # Forward whatever earlier pipeline stages produced before adding our own output.
        if data:
            yield from data

        with self.data_folder.open("hplt_monolingual_map_cleaned_1.2.txt", "rt") as f:
            # Blank / whitespace-only lines would otherwise become empty paths that
            # end up in a shard, so strip each line and drop the empty ones.
            files = [
                entry.removeprefix(url_prefix)
                for entry in (line.strip() for line in f.read().splitlines())
                if entry
            ]

        # Round-robin assignment: task `rank` takes every `world_size`-th entry.
        files_shard = files[rank::world_size]
        if not files_shard:
            if rank == 0:
                # Rank 0 always gets the first entry when there is one, so an empty
                # shard here means the whole list is empty.
                raise RuntimeError(f"No files found on {self.data_folder.path}!")
            # otherwise just a warning
            logger.warning(f"No files found on {self.data_folder.path} for {rank=}")

        if self.shuffle_files:
            random.shuffle(files_shard)

        for doc in self.read_files_shard(files_shard):
            self.update_doc_stats(doc)
            yield doc