cohere_vector/_tools/parse_documents.py [13:49]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_pages(start_page, end_page):
    for page in range(start_page, end_page + 1):
        start_index = (page - 1) * MAX_DOCS_PER_FILE
        end_index = start_index + MAX_DOCS_PER_FILE
        if end_index > TOTAL_DOCS:
            end_index = TOTAL_DOCS
        output_filename = f"{OUTPUT_FILENAME}-{page:02d}.json"
        print(f"Outputing page {page} documents to {output_filename}")
        with open(output_filename, "w") as documents_file:
            output_documents(documents_file, start_index, end_index)


def output_documents(docs_file, start_index, end_index):
    doc_count = 0
    dataset_size = end_index - start_index
    print(f"Parsing {dataset_size} documents from {DATASET_NAME} [{start_index}:{end_index}]")
    docs = load_dataset(
        DATASET_NAME,
        split=f"train[{start_index}:{end_index}]",
        num_proc=DATASET_DL_PROCS,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

    progress_bar(doc_count, dataset_size)
    for doc in docs:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



msmarco-v2-vector/_tools/parse_documents.py [15:51]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_pages(start_page, end_page):
    for page in range(start_page, end_page + 1):
        start_index = (page - 1) * MAX_DOCS_PER_FILE
        end_index = start_index + MAX_DOCS_PER_FILE
        if end_index > TOTAL_DOCS:
            end_index = TOTAL_DOCS
        output_filename = f"{OUTPUT_FILENAME}-{page:02d}.json"
        print(f"Outputing page {page} documents to {output_filename}")
        with open(output_filename, "w") as documents_file:
            output_documents(documents_file, start_index, end_index)


def output_documents(docs_file, start_index, end_index):
    doc_count = 0
    dataset_size = end_index - start_index
    print(f"Parsing {dataset_size} documents from {DATASET_NAME} [{start_index}:{end_index}]")
    docs = load_dataset(
        DATASET_NAME,
        split=f"train[{start_index}:{end_index}]",
        num_proc=DATASET_DL_PROCS,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

    progress_bar(doc_count, dataset_size)
    for doc in docs:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
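
Both excerpts expose the same paging entry point. As a point of reference only, a minimal driver sketch (not present in either file, and assuming the module-level constants TOTAL_DOCS and MAX_DOCS_PER_FILE defined in the elided header lines of each script) could look like:

import math

if __name__ == "__main__":
    # Pages are 1-based; the last page covers whatever tail of TOTAL_DOCS remains
    # after the full MAX_DOCS_PER_FILE-sized pages (assumed constants, see above).
    last_page = math.ceil(TOTAL_DOCS / MAX_DOCS_PER_FILE)
    output_pages(1, last_page)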



