in muss/mining/preprocessing.py [0:0]
import gzip
from pathlib import Path

from muss.utils.helpers import batch_items  # assumed location of the batching helper used below


def split_ccnet_shard(shard_path, output_dir, n_docs_per_subshard=10000):
    '''We need to split the shards even more for the embeddings to fit in memory'''

    def write_lines_to_compressed_file(lines, compressed_filepath):
        # Write one document per line, gzip-compressed (low compression level for speed)
        with gzip.open(compressed_filepath, 'wt', compresslevel=1) as f:
            for line in lines:
                if not line.endswith('\n'):
                    line = line + '\n'
                f.write(line)

    output_dir = Path(output_dir)
    if output_dir.exists():
        # The shard was already split in a previous run
        return
    assert str(shard_path).endswith('.json.gz')
    shard_path = Path(shard_path)
    output_dir.mkdir(exist_ok=True, parents=True)
    with gzip.open(shard_path, 'rt') as f:
        for file_number, lines in enumerate(batch_items(item_generator=f, batch_size=n_docs_per_subshard)):
            # Sub-shard names are zero-padded to 3 digits, so at most 1000 sub-shards per shard
            assert file_number < 1000
            output_filepath = output_dir / f'{file_number:03d}.json.gz'
            if not output_filepath.exists():
                write_lines_to_compressed_file(lines, output_filepath)
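
A minimal usage sketch for this function; the directory names below are hypothetical and only illustrate splitting each CC-Net shard into sub-shards of 10,000 documents so later embedding steps fit in memory:

from pathlib import Path

ccnet_dir = Path('data/ccnet/en_head')             # hypothetical directory of *.json.gz shards
subshards_dir = Path('data/ccnet/en_head_split')   # hypothetical output root

for shard_path in sorted(ccnet_dir.glob('*.json.gz')):
    # Each shard gets its own sub-directory containing 000.json.gz, 001.json.gz, ...
    split_ccnet_shard(shard_path, subshards_dir / shard_path.stem, n_docs_per_subshard=10000)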