in src/datatrove/utils/dataset.py [0:0]
def _load_file_list(self):
    fsizes = {}
    if not self.paths_file or not file_exists(self.paths_file):
        matched_files = self.folder_path.list_files(
            glob_pattern=self.filename_pattern, recursive=self.recursive
        )
        if not matched_files:
            raise FileNotFoundError(f'No files matching "{self.filename_pattern}" found in {self.folder_path}')
    else:
        with open_file(self.paths_file, "r") as f:
            file_data = json.load(f)
            matched_files = [f["path"] for f in file_data]
            fsizes = {f["path"]: f["size"] for f in file_data}
        logger.info(f"Loaded {len(matched_files)} files from {self.paths_file}")
    self.files = [
        DatatroveFileDataset(
            (path, self.folder_path),
            self.seq_len,
            token_size=self.token_size,
            return_positions=self.return_positions,
            positions_from_eos_token_id=self.positions_from_eos_token_id,
            fsize=fsizes.get(
                path, None
            ),  # potentially use a cached size to avoid excessive remote calls / possibly offloaded file
        )
        for path in matched_files
    ]
    if self.paths_file and not file_exists(self.paths_file):
        with open_file(self.paths_file, "wt") as f:
            json.dump(
                [{"path": rel_path, "size": f.fsize} for rel_path, f in zip(matched_files, self.files)], f
            )
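
For reference, a minimal sketch of the paths_file cache this method reads and writes: a JSON list of {"path", "size"} records. The file name, shard names, and sizes below are made up for illustration, and plain open() stands in for datatrove's open_file (which also handles remote filesystems):

import json

# hypothetical cache contents: one record per data file, size in bytes
records = [
    {"path": "shard_0000.ds", "size": 1_073_741_824},
    {"path": "shard_0001.ds", "size": 998_244_352},
]

# write the cache, as the method does when paths_file is set but does not exist yet
with open("paths_cache.json", "wt") as f:
    json.dump(records, f)

# read it back, mirroring the branch taken when paths_file already exists
with open("paths_cache.json", "r") as f:
    file_data = json.load(f)
matched_files = [r["path"] for r in file_data]
fsizes = {r["path"]: r["size"] for r in file_data}
print(matched_files)
print(fsizes)

On later runs the cached sizes are passed straight to each DatatroveFileDataset via fsize, so no per-file size lookup has to hit the (possibly remote) filesystem.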