in rally-custom/custom_tracks/elasticsearch/openai_vector/_tools/parse_documents.py [0:0]
def output_documents(input_file_path: str, max_initial_indexing_docs: int, max_parallel_indexing_docs: int):
    """Split the rows of an Arrow IPC stream file into two document sets.

    The table read from ``input_file_path`` is divided into a leading
    "initial indexing" slice and a following "parallel indexing" slice, and
    ``parse_documents`` is invoked once for each slice.

    Args:
        input_file_path: Path of the Arrow IPC stream file to memory-map.
        max_initial_indexing_docs: Upper bound on the initial slice. A
            negative value means "take every row that is not reserved for
            the parallel slice".
        max_parallel_indexing_docs: Upper bound on the parallel slice; must
            not be negative.

    Raises:
        ValueError: If ``max_parallel_indexing_docs`` is negative.
    """
    if max_parallel_indexing_docs < 0:
        raise ValueError("max_parallel_indexing_docs must be >= 0")

    # Ensure the output directory exists before any parsing starts.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    with pa.memory_map(input_file_path, "rb") as mapped_file:
        stream_reader = pa.ipc.open_stream(mapped_file)
        table = stream_reader.read_all()
        total_rows = table.num_rows

        if max_initial_indexing_docs >= 0:
            # Explicit cap: never ask for more rows than the table holds.
            initial_count = min(total_rows, max_initial_indexing_docs)
        else:
            # No cap: hand the initial slice everything except the rows
            # needed to satisfy the parallel indexing requirement.
            initial_count = max(0, total_rows - max_parallel_indexing_docs)

        # The parallel slice is limited by whatever rows are left over.
        rows_left = total_rows - initial_count
        parallel_count = min(rows_left, max_parallel_indexing_docs)

        # NOTE(review): arguments look like (table, doc count, start offset,
        # output filename) -- confirm against parse_documents' signature.
        parse_documents(table, initial_count, 0, INITIAL_INDEXING_DOCS_FILENAME)
        parse_documents(table, parallel_count, initial_count, PARALLEL_INDEXING_DOCS_FILENAME)