def chunk_documents()

in collection/paragraph_chunker.py [0:0]


def chunk_documents(input_directory: str, output_directory: str, workers: int) -> None:
    """Chunk every .jsonl file under input_directory into output_directory.

    Recursively finds .jsonl files, farms each one out to a process pool
    (``process_file`` performs the per-file chunking and writing), and logs
    progress every 100 completed files.

    Args:
        input_directory: Root directory searched recursively for .jsonl files.
        output_directory: Directory where chunked .jsonl output is written.
        workers: Number of worker processes in the multiprocessing pool.
    """
    jsonl_files = list(Path(input_directory).glob('**/*.jsonl'))
    total = len(jsonl_files)  # hoisted: constant for the whole loop

    tasks = [(input_directory, output_directory, f) for f in jsonl_files]

    # imap_unordered reports results as workers finish them, regardless of
    # submission order, so progress logging reflects actual completion;
    # chunksize=16 amortizes IPC overhead across batches of tasks.
    with multiprocessing.Pool(workers) as pool:
        for done, _ in enumerate(pool.imap_unordered(process_file, tasks, chunksize=16), start=1):
            if done % 100 == 0:
                # Lazy %-style args: the message is only formatted if the
                # record is actually emitted at INFO level.
                logging.info('Processed %d / %d files...', done, total)