in src/hyperpod_nemo_adapter/collections/data/datasets/hf_dataset.py [0:0]
def _get_data_format(self, path):
    """Detect the on-disk format of the dataset files found under ``path``.

    Args:
        path: A directory path (``str`` or ``Path``) or an iterable of
            directory paths. Only regular files directly inside each
            directory are inspected; subdirectories are not traversed.

    Returns:
        ``DataTypes.ARROW``, ``DataTypes.JSONGZ`` or ``DataTypes.JSON``.
        When several formats are present, the first match in that order wins.

    Raises:
        NotImplementedError: If no file carries one of the supported
            extensions (this includes the case of empty directories).
    """
    # A lone path is wrapped in a list. Path is included because iterating a
    # Path object directly raises TypeError.
    if isinstance(path, (str, Path)):
        path = [path]
    names = [f.name for p in path for f in Path(p).iterdir() if f.is_file()]

    # Match on the end of the file name instead of on the concatenation of
    # every suffix: "".join(Path(f).suffixes) turns "shard.v1.json.gz" into
    # ".v1.json.gz", which never equals a supported extension.
    # NOTE(review): assumes DataTypes members are str extension values such as
    # ".arrow" (the previous `suffix == DataTypes.X` check implies this) - confirm.
    for data_type in (DataTypes.ARROW, DataTypes.JSONGZ, DataTypes.JSON):
        if any(name.endswith(data_type) for name in names):
            return data_type

    # Sorted so the reported suffixes appear in a deterministic order.
    suffixes_list = sorted({"".join(Path(name).suffixes) for name in names})
    raise NotImplementedError(
        f"Unsupported file format in dataset directory. Expecting files of type '.arrow' '.json.gz' or '.json' but instead found {','.join(suffixes_list)}."
    )