in modules/SwissArmyTransformer/sat/data_utils/jsonlds.py [0:0]
def __init__(self, path, process_fn, seed, *, shuffle_buffer=1000):
# set shuffle_buffer = 1 to disable it, model-parallel will be different due to shuffle
# parse path, may mixed with dir
# if there is a comma not between {}, add one for expansion
path_wo_brace = re.sub(r"\{.*?\}", "", path)
if ',' in path_wo_brace:
path = '{' + path + '}'
expanded_path = []
for p in braceexpand(path):
if p.endswith('.jsonl'):
expanded_path.append(p)
else:
# assert a existing folder
assert os.path.isdir(p), f"{p} is not a valid folder"
# find all jsonl files
for root, dirs, files in os.walk(p):
for file in files:
if file.endswith('.jsonl'):
file_path = os.path.join(root, file)
expanded_path.append(file_path)
path = expanded_path
try:
from sat.mpu import get_model_parallel_world_size
if get_model_parallel_world_size() > 1:
shuffle_buffer = 1
except Exception:
pass
super().__init__(
ConfiguredResampledShards(path, seed), # Lots of shards are recommended, or not evenly
jsonl_samples,
webdataset.shuffle(shuffle_buffer),
process_fn
)