tsdb/_tools/split.py (16 lines of code) (raw):

#!/usr/bin/env python3 # Split a documents.json files in X parts # https://github.com/elastic/rally/issues/1650#issuecomment-1378344368 import contextlib import sys path = sys.argv[1] # This is the number of documents in the default corpus TOTAL_DOCS = 116633698 n_splits = int(sys.argv[2]) q, r = divmod(total_docs, n_splits) wanted_docs = q * n_splits with contextlib.ExitStack() as stack, open(path, "r") as f: full_filenames = [f"documents-split-{i}.json" for i in range(n_splits)] full_output_files = [stack.enter_context(open(fname, "w")) for fname in full_filenames] for i, line in enumerate(f): if i % 1_000_000 == 0: print(i) full_output_files[i % n_splits].write(line) if i + 1 == wanted_docs: break