def chunk_doc()

in collection/paragraph_chunker.py [0:0]


def chunk_doc(content: str) -> List[str]:
    """Split a document into passages of whitespace-delimited tokens.

    Lines are consumed in order. A line is skipped when it is empty after
    stripping trailing whitespace or when it contains ``===``. Tokens from
    the remaining lines are accumulated, and a passage is emitted at the end
    of the first line that brings the accumulated token count above
    ``MIN_PASSAGE_TOKENS``, so passages always end on a line boundary.

    Any tokens still pending at the end of the document are emitted as a
    final passage, which may be shorter than the threshold.

    Args:
        content: Full text of the document, with lines separated by ``\\n``.

    Returns:
        The passages in document order, each a string of tokens joined by
        single spaces. The list is empty when the document yields no tokens.
    """
    passages = []
    pending_tokens = []

    for raw_line in content.split('\n'):
        line = raw_line.rstrip()

        # Skip blank lines and '===' marker lines
        # (presumably section headings -- TODO confirm against the corpus format).
        if not line or '===' in line:
            continue

        pending_tokens.extend(line.split())

        if len(pending_tokens) > MIN_PASSAGE_TOKENS:
            passages.append(' '.join(pending_tokens))
            pending_tokens = []

    # Flush the remainder only if there is one; an unconditional flush would
    # append a spurious '' passage for empty documents and for documents whose
    # last content line has just completed a passage.
    if pending_tokens:
        passages.append(' '.join(pending_tokens))

    return passages