def _load_file_list()

in src/datatrove/utils/dataset.py

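This method resolves the dataset's file list: when no paths_file cache exists it globs folder_path for files matching filename_pattern, otherwise it loads the cached list of paths and sizes; each match is wrapped in a DatatroveFileDataset, and a freshly resolved list is written back to paths_file so later runs can skip the listing. The globbing branch goes through datatrove's DataFolder.list_files; a minimal sketch of that call, assuming a local folder (the path and pattern below are hypothetical):

    from datatrove.io import get_datafolder

    # hypothetical local folder holding tokenized shards
    folder = get_datafolder("/data/tokenized")
    # same listing call the method makes via self.folder_path
    matched = folder.list_files(glob_pattern="*.ds", recursive=True)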

        def _load_file_list(self):
            fsizes = {}

            if not self.paths_file or not file_exists(self.paths_file):
                # no cached list available: glob folder_path for matching files
                matched_files = self.folder_path.list_files(
                    glob_pattern=self.filename_pattern, recursive=self.recursive
                )
                if not matched_files:
                    raise FileNotFoundError(f'No files matching "{self.filename_pattern}" found in {self.folder_path}')
            else:
                # reuse the cached file list (relative paths + sizes) instead of re-listing the folder
                with open_file(self.paths_file, "r") as list_f:
                    file_data = json.load(list_f)
                    matched_files = [entry["path"] for entry in file_data]
                    fsizes = {entry["path"]: entry["size"] for entry in file_data}
                    logger.info(f"Loaded {len(matched_files)} files from {self.paths_file}")

            self.files = [
                DatatroveFileDataset(
                    (path, self.folder_path),
                    self.seq_len,
                    token_size=self.token_size,
                    return_positions=self.return_positions,
                    positions_from_eos_token_id=self.positions_from_eos_token_id,
                    # use a cached size, when available, to avoid extra remote calls or touching a possibly offloaded file
                    fsize=fsizes.get(path),
                )
                for path in matched_files
            ]

            if self.paths_file and not file_exists(self.paths_file):
                # cache the resolved list (with sizes) so future runs can skip the directory listing
                with open_file(self.paths_file, "wt") as list_f:
                    json.dump(
                        [{"path": rel_path, "size": fd.fsize} for rel_path, fd in zip(matched_files, self.files)],
                        list_f,
                    )
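
For reference, the cache written in the final step is a plain JSON list of records, each holding a path relative to folder_path and a size in bytes. A minimal sketch of the round trip, with hypothetical file names and sizes:

    import json

    # shape of the paths_file cache (all values hypothetical)
    records = [
        {"path": "shard_0000.ds", "size": 1_073_741_824},
        {"path": "shard_0001.ds", "size": 536_870_912},
    ]

    with open("file_list.json", "wt") as fh:
        json.dump(records, fh)

    # the load branch above rebuilds the ordered path list and the size lookup
    with open("file_list.json", "r") as fh:
        file_data = json.load(fh)
    matched_files = [entry["path"] for entry in file_data]
    fsizes = {entry["path"]: entry["size"] for entry in file_data}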