in rally-custom/custom_tracks/elasticsearch/openai_vector/_tools/parse_documents.py [0:0]
def parse_documents(doc_table: pa.Table, doc_count: int, table_offset: int, output_filename: str):
    """Export a slice of ``doc_table`` as bz2-compressed, newline-delimited JSON.

    Each exported row becomes one JSON object with the keys ``docid``,
    ``title``, ``text`` and ``emb``, read from the ``_id``, ``title``,
    ``text`` and ``embedding`` columns respectively, followed by a newline.

    The output file is created under ``OUTPUT_DIR`` even when ``doc_count``
    is zero or negative; in that case no documents are written and the
    function returns right after opening (and closing) the file.

    Args:
        doc_table: Table to read rows from.
        doc_count: Number of rows to export, starting at ``table_offset``.
        table_offset: Index of the first row of ``doc_table`` to export.
        output_filename: File name, joined onto ``OUTPUT_DIR``.
    """
    destination = os.path.join(OUTPUT_DIR, output_filename)
    print(f"Writing {doc_count} documents to {destination}")

    with bz2.open(destination, "wt") as sink:
        # The guard deliberately lives inside the ``with``: opening the file
        # first guarantees the output file exists even when there is nothing
        # to export.
        if doc_count <= 0:
            return

        selected_rows = doc_table.slice(offset=table_offset, length=doc_count)

        written = 0
        progress_bar(written, doc_count)

        for batch in selected_rows.to_batches(max_chunksize=PROGRESS_EVERY):
            # Resolve all four columns up front, then walk them in lockstep.
            columns = [batch.column(name) for name in ("_id", "title", "text", "embedding")]
            for docid, title, text, emb in zip(*columns):
                document = {
                    "docid": docid.as_py(),
                    "title": title.as_py(),
                    "text": text.as_py(),
                    "emb": emb.as_py(),
                }
                sink.write(json.dumps(document, ensure_ascii=True) + "\n")

            # Progress is reported once per batch rather than once per row.
            written += batch.num_rows
            progress_bar(written, doc_count)

    # Print newline so that progress bar is not overwritten by next print statement
    print()