in cohere_vector/_tools/parse_documents.py [0:0]
def output_documents(docs_file, start_index, end_index):
    """Write a slice of the dataset to *docs_file* as JSON lines.

    Loads ``train[start_index:end_index]`` of ``DATASET_NAME`` through
    ``load_dataset`` and writes one JSON object per document, each followed
    by a newline. Every object carries the ``docid``, ``title``, ``text``
    and ``emb`` entries of the source document, in that order.

    Progress is reported via ``progress_bar`` once before the first
    document and then after every ``PROGRESS_EVERY`` documents written.

    Args:
        docs_file: Object exposing ``write(str)``; receives the output.
        start_index: First index of the slice requested from the split.
        end_index: End index of the slice requested from the split.
    """
    exported_fields = ("docid", "title", "text", "emb")
    expected_total = end_index - start_index
    print(f"Parsing {expected_total} documents from {DATASET_NAME} [{start_index}:{end_index}]")

    split_spec = f"train[{start_index}:{end_index}]"
    dataset = load_dataset(
        DATASET_NAME,
        split=split_spec,
        num_proc=DATASET_DL_PROCS,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

    # Stays at 0 when the slice yields no documents.
    written = 0
    progress_bar(written, expected_total)
    for written, entry in enumerate(dataset, start=1):
        record = {field: entry[field] for field in exported_fields}
        json_line = json.dumps(record, ensure_ascii=True)
        docs_file.write(json_line)
        docs_file.write("\n")
        if written % PROGRESS_EVERY == 0:
            progress_bar(written, expected_total)

    print(f"Wrote {written} documents to output file.")