def output_documents()

in rally-custom/custom_tracks/opensearch/openai_vector/_tools/parse_documents.py [0:0]


def output_documents(input_file_path: str, max_initial_indexing_docs: int, max_parallel_indexing_docs: int):
    """Split the rows of an input table into an "initial" and a "parallel" batch.

    The input file is memory-mapped and read in full as an IPC stream (via
    ``pa.ipc.open_stream``), after ``OUTPUT_DIR`` has been created if needed.
    The row count is then divided into two consecutive ranges, and each range
    is handed to ``parse_documents`` together with its target filename constant.

    Args:
        input_file_path: Path of the file to memory-map and read.
        max_initial_indexing_docs: Upper bound on rows in the initial batch.
            A negative value means "no explicit bound": the initial batch
            takes every row except those reserved for the parallel batch.
        max_parallel_indexing_docs: Upper bound on rows in the parallel
            batch. Must be non-negative.

    Raises:
        ValueError: If ``max_parallel_indexing_docs`` is negative.
    """
    if max_parallel_indexing_docs < 0:
        raise ValueError("max_parallel_indexing_docs must be >= 0")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pa.memory_map(input_file_path, "rb") as source:
        reader = pa.ipc.open_stream(source)
        table = reader.read_all()
        total_rows = table.num_rows

        if max_initial_indexing_docs >= 0:
            # Explicit cap: never ask for more rows than the table holds.
            initial_count = min(total_rows, max_initial_indexing_docs)
        else:
            # No cap: take everything that is left once the parallel batch
            # has been reserved (clamped at zero for small tables).
            initial_count = max(0, total_rows - max_parallel_indexing_docs)

        # The parallel batch starts right after the initial batch and is
        # limited both by its own cap and by the rows that remain.
        leftover_rows = total_rows - initial_count
        parallel_count = min(leftover_rows, max_parallel_indexing_docs)

        # NOTE(review): parse_documents is defined elsewhere; the arguments
        # appear to be (table, row count, start offset, output filename) -
        # confirm against its definition.
        parse_documents(table, initial_count, 0, INITIAL_INDEXING_DOCS_FILENAME)
        parse_documents(table, parallel_count, initial_count, PARALLEL_INDEXING_DOCS_FILENAME)