in collection/paragraph_chunker.py [0:0]
def chunk_documents(input_directory: str, output_directory: str, workers: int) -> None:
    """Chunk every .jsonl document found under input_directory.

    Recursively collects .jsonl files from input_directory, then fans the work
    out to a pool of worker processes (one process_file call per file) which
    write the chunked output into output_directory. Progress is logged every
    100 completed files.

    Args:
        input_directory: Root directory to scan recursively for .jsonl files.
        output_directory: Directory where chunked .jsonl files are written.
        workers: Number of worker processes in the multiprocessing pool.
    """
    jsonl_files = list(Path(input_directory).glob('**/*.jsonl'))
    # Each task carries the directories along so process_file can derive
    # the matching output path for its input file.
    tasks = [(input_directory, output_directory, jsonl_file) for jsonl_file in jsonl_files]
    with multiprocessing.Pool(workers) as pool:
        completed = 0
        # imap_unordered yields as soon as any worker finishes, so the
        # counter tracks completions regardless of submission order.
        for _ in pool.imap_unordered(process_file, tasks, chunksize=16):
            completed += 1
            if completed % 100 == 0:
                logging.info(f'Processed {completed} / {len(jsonl_files)} files...')