in collection/paragraph_chunker.py [0:0]
def chunk_doc(content: str) -> List[str]:
    """Split a document into whitespace-normalised passages.

    The document is read line by line (split on ``'\\n'``). Lines that
    contain ``'==='`` and lines that are empty or whitespace-only are
    skipped. The remaining lines are split on whitespace and their tokens
    accumulated; as soon as the accumulated count is strictly greater than
    ``MIN_PASSAGE_TOKENS`` the tokens are joined with single spaces and
    emitted as one passage. A line is never split across two passages.

    Args:
        content: Full text of the document.

    Returns:
        The passages in document order. Every passage except possibly the
        last holds more than ``MIN_PASSAGE_TOKENS`` tokens; the last one
        carries whatever tokens remain at end of input and may be shorter.
        No empty passage is ever returned, so a document with no usable
        lines yields ``[]``.
    """
    passages = []
    passage_tokens = []
    for line in content.split('\n'):
        # Lines containing '===' are dropped entirely (presumably
        # section-heading markup -- confirm against the corpus format).
        if '===' in line:
            continue
        tokens = line.split()
        if not tokens:
            # Blank or whitespace-only line: contributes nothing.
            continue
        passage_tokens.extend(tokens)
        if len(passage_tokens) > MIN_PASSAGE_TOKENS:
            passages.append(' '.join(passage_tokens))
            passage_tokens = []
    # Flush the remainder, but only if there is one: appending
    # unconditionally would emit an empty-string passage for empty
    # documents or when the final line had just completed a passage.
    if passage_tokens:
        passages.append(' '.join(passage_tokens))
    return passages