def split_ccnet_shard()

in muss/mining/preprocessing.py [0:0]


def split_ccnet_shard(shard_path, output_dir, n_docs_per_subshard=10000):
    '''Split a gzipped shard into several smaller gzipped sub-shards.

    We need to split the shards even more for the embeddings to fit in memory.

    The lines of `shard_path` are grouped with `batch_items` and each batch is written to
    `output_dir` as `000.json.gz`, `001.json.gz`, ... If `output_dir` already exists the
    shard is considered already split and the function returns without doing anything.

    Args:
        shard_path: Input shard (str or Path); its name must end with `.json.gz`.
        output_dir: Directory (str or Path) receiving the sub-shards; it is created together
            with any missing parents.
        n_docs_per_subshard: Batch size handed to `batch_items`, i.e. the number of input
            lines grouped into one sub-shard.

    Raises:
        AssertionError: If `shard_path` does not end with `.json.gz`, or if the shard would
            be split into more than 1000 sub-shards (file names use a 3-digit index).
    '''
    import shutil

    def write_lines_to_compressed_file(lines, compressed_filepath):
        # compresslevel=1 is the fastest gzip setting.
        with gzip.open(compressed_filepath, 'wt', compresslevel=1, encoding='utf-8') as f:
            # Make sure every line is newline-terminated (the last line of the input may not be).
            f.writelines(line if line.endswith('\n') else line + '\n' for line in lines)

    # Normalise both paths first so that plain strings are accepted everywhere below.
    shard_path = Path(shard_path)
    output_dir = Path(output_dir)
    if output_dir.exists():
        return
    assert str(shard_path).endswith('.json.gz')
    output_dir.mkdir(exist_ok=True, parents=True)
    try:
        # Explicit UTF-8 so decoding does not depend on the platform locale
        # (assumes the shard is UTF-8 encoded JSON lines, as the suffix suggests).
        with gzip.open(shard_path, 'rt', encoding='utf-8') as f:
            for file_number, lines in enumerate(batch_items(item_generator=f, batch_size=n_docs_per_subshard)):
                # File names are zero-padded to 3 digits; more sub-shards would break that scheme.
                assert file_number < 1000
                output_filepath = output_dir / f'{file_number:03d}.json.gz'
                if not output_filepath.exists():
                    write_lines_to_compressed_file(lines, output_filepath)
    except BaseException:
        # An existing output_dir means "already split" on the next call, so never leave a
        # partially written directory behind when the split fails or is interrupted.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise