elser-ingest-speedtest/_support/generate_fixed_multiple_lengths_docs.py (12 lines of code):
import json
import random

DOC_LENGTHS = [128, 256, 384, 512]
DOCUMENT_COUNT = 10000

# Load the list of whole-word tokens extracted from the BERT vocabulary.
with open("bert_vocab_whole_words.json") as word_file:
    word_list = json.load(word_file)

# For each target length, write one JSON document per line (JSON Lines).
for doc_length in DOC_LENGTHS:
    with open(f"../{doc_length}_document_set.json", "w") as doc_file:
        for _ in range(DOCUMENT_COUNT):
            # Sample words with replacement so every document has exactly doc_length words.
            doc_words = random.choices(word_list, k=doc_length)
            doc = {"body": " ".join(doc_words)}
            doc_file.write(json.dumps(doc) + "\n")
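Because the script emits one JSON object per line, each generated file can be streamed into Elasticsearch's _bulk API for the ingest speed test. Below is a minimal sketch of that step, assuming a local cluster, the official elasticsearch Python client, and hypothetical index and ELSER pipeline names (none of these appear in the script above):

import json

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200")

def doc_actions(path, index_name):
    # Yield one bulk action per line of a generated document set.
    with open(path) as doc_file:
        for line in doc_file:
            yield {"_index": index_name, "_source": json.loads(line)}

# Index the 128-word document set through an (assumed) ELSER ingest pipeline.
helpers.bulk(
    es,
    doc_actions("128_document_set.json", "elser-speedtest-128"),
    pipeline="elser-ingest-pipeline",
)

Timing this call for each of the four document sets would give a per-length ingest throughput comparison; the index name, pipeline name, and cluster URL are placeholders to adapt to the actual test setup.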