def text_split()

in retrieval_service/run_generate_policy_dataset.py [0:0]


def text_split(data: str, *, chunk_size: int = 500, chunk_overlap: int = 30) -> list:
    """Split a Markdown document into small text chunks.

    The text is split in two passes: first along level-1 and level-2
    Markdown headers, then each resulting section is cut further into
    pieces bounded by ``chunk_size``.

    Args:
        data: Markdown text to split.
        chunk_size: Maximum chunk length passed to the character splitter.
            Length is measured with ``len``, i.e. in characters.
        chunk_overlap: Overlap between consecutive chunks passed to the
            character splitter, also measured in characters.

    Returns:
        A list of ``{"content": <chunk text>}`` dicts, in the order the
        splitters produced the chunks. Only the chunk text is kept; any
        metadata attached by the splitters is discarded.
    """
    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
    # strip_headers=False: header lines are not stripped from the section
    # text, so they remain available in the content handed to the next pass.
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )
    md_header_splits = markdown_splitter.split_text(data)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    splits = text_splitter.split_documents(md_header_splits)

    return [{"content": split.page_content} for split in splits]