in collection/paragraph_chunker.py [0:0]
def process_file(tup: Tuple[str, str, Path]) -> None:
    """Chunk all documents in a single JSONL file.

    Args:
        tup: A ``(input_directory, output_directory, input_file)`` triple.
            ``input_file`` must live under ``input_directory``; its relative
            path is mirrored under ``output_directory``.

    Each input line is a JSON document with ``id`` and ``contents`` keys.
    Every passage produced by ``chunk_doc`` is written as its own JSON line
    with id ``"<doc_id>_p<i>"``.
    """
    input_directory, output_directory, input_file = tup
    # Remap only the directory *prefix*. A naive str.replace() would also
    # rewrite any later occurrence of the input-directory substring inside
    # the path (e.g. input dir "data" in "data/mydata/x.jsonl").
    output_path = Path(output_directory) / Path(input_file).relative_to(input_directory)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # JSONL is UTF-8 by convention; don't depend on the platform locale.
    with open(input_file, encoding='utf-8') as f1, \
            open(output_path, 'w', encoding='utf-8') as f2:
        for jsonl in f1:
            doc = json.loads(jsonl)
            passages = chunk_doc(doc['contents'])
            for i, passage in enumerate(passages):
                paragraph = {'id': f"{doc['id']}_p{i}", 'contents': passage}
                f2.write(json.dumps(paragraph) + '\n')