def parse_documents()

in rally-custom/custom_tracks/elasticsearch/openai_vector/_tools/parse_documents.py [0:0]

24 lines of code
4 McCabe index (conditional complexity)


def parse_documents(doc_table: pa.Table, doc_count: int, table_offset: int, output_filename: str):
    """Export a window of ``doc_table`` as a bz2-compressed JSON-lines file.

    Every exported row becomes one line holding a JSON object with the keys
    ``docid``, ``title``, ``text`` and ``emb``, taken from the table columns
    ``_id``, ``title``, ``text`` and ``embedding`` respectively.

    Args:
        doc_table: Table providing the ``_id``, ``title``, ``text`` and
            ``embedding`` columns.
        doc_count: Number of rows requested. When it is zero or negative the
            output file is still created, but nothing is written to it.
        table_offset: Row offset in ``doc_table`` at which the window starts.
        output_filename: File name, joined onto ``OUTPUT_DIR``, of the archive
            to write.
    """
    destination = os.path.join(OUTPUT_DIR, output_filename)
    print(f"Writing {doc_count} documents to {destination}")

    # Source column -> JSON key, in the order the keys appear in each record.
    column_names = ("_id", "title", "text", "embedding")
    json_keys = ("docid", "title", "text", "emb")

    with bz2.open(destination, "wt") as sink:
        # The file is opened before this check on purpose: leaving from inside
        # the ``with`` block guarantees an (empty) archive always exists.
        if doc_count <= 0:
            return

        window = doc_table.slice(offset=table_offset, length=doc_count)

        written = 0
        progress_bar(written, doc_count)

        # Batches are capped at PROGRESS_EVERY rows so that progress is
        # reported once per batch.
        for batch in window.to_batches(max_chunksize=PROGRESS_EVERY):
            columns = [batch.column(name) for name in column_names]
            for row in zip(*columns):
                record = {key: cell.as_py() for key, cell in zip(json_keys, row)}
                sink.write(json.dumps(record, ensure_ascii=True) + "\n")

            written += batch.num_rows
            progress_bar(written, doc_count)

    # Finish the progress-bar line so the next print does not overwrite it.
    print()