in src/hyperpod_nemo_adapter/collections/data/datasets/hf_dataset.py [0:0]
def fetch_dataset(self, path):
    """Load the dataset found at ``path`` according to ``self.data_format``.

    Args:
        path: Location of the data. It is handed directly to
            ``load_from_disk`` for the ARROW format; for the JSONGZ and
            JSON formats it is joined with a ``*``-prefixed file pattern
            that becomes the single ``data_files`` entry.

    Returns:
        Whatever ``load_from_disk`` or ``load_dataset`` returns for the
        selected format.

    Raises:
        NotImplementedError: If ``self.data_format`` equals none of
            ``DataTypes.ARROW``, ``DataTypes.JSONGZ`` or ``DataTypes.JSON``.
    """
    fmt = self.data_format

    # ARROW data goes through a different loader and needs no file pattern.
    if fmt == DataTypes.ARROW:
        return load_from_disk(path)

    # Both JSON flavours share the same loader call and differ only in the
    # pattern used to select files under ``path``.
    if fmt == DataTypes.JSONGZ:
        file_pattern = os.path.join(path, f"*{DataTypes.JSONGZ}")
    elif fmt == DataTypes.JSON:
        file_pattern = os.path.join(path, f"*{DataTypes.JSON}")
    else:
        raise NotImplementedError(f"{self.data_format} is not supported.")

    # NOTE(review): the first argument is self.input_path rather than
    # ``path`` -- kept exactly as written; confirm this is intentional.
    return load_dataset(
        self.input_path,
        data_files=[file_pattern],
        split=self.partition,
    )