def output_documents()

in cohere_vector/_tools/parse_documents.py [0:0]


def output_documents(docs_file, start_index, end_index):
    """Dump a slice of the dataset to ``docs_file`` as JSON lines.

    Loads ``train[start_index:end_index]`` of ``DATASET_NAME`` through
    ``load_dataset`` and writes one JSON object per row, each followed by a
    newline. Only the ``docid``, ``title``, ``text`` and ``emb`` fields of a
    row are emitted, in that order, with non-ASCII characters escaped.

    Progress is reported through ``progress_bar`` once before the first row
    and then after every ``PROGRESS_EVERY`` rows; a start and end summary are
    printed to stdout.

    Args:
        docs_file: Writable text stream; only its ``write`` method is used.
        start_index: Lower bound used in the split slice expression.
        end_index: Upper bound used in the split slice expression.
    """
    # The expected size is derived from the requested bounds, not from the
    # number of rows the loader actually returns.
    dataset_size = end_index - start_index
    print(f"Parsing {dataset_size} documents from {DATASET_NAME} [{start_index}:{end_index}]")

    docs = load_dataset(
        DATASET_NAME,
        split=f"train[{start_index}:{end_index}]",
        num_proc=DATASET_DL_PROCS,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

    # Fields copied from each row into the output record, in output order.
    exported_fields = ("docid", "title", "text", "emb")

    # Stays 0 when the loader yields no rows, so the final summary is valid.
    doc_count = 0
    progress_bar(doc_count, dataset_size)
    for doc_count, doc in enumerate(docs, start=1):
        record = {field: doc[field] for field in exported_fields}
        docs_file.write(json.dumps(record, ensure_ascii=True))
        docs_file.write("\n")
        if doc_count % PROGRESS_EVERY == 0:
            progress_bar(doc_count, dataset_size)

    print(f"Wrote {doc_count} documents to output file.")