in retrieval_service/run_generate_policy_dataset.py [0:0]
def text_split(data, *, chunk_size=500, chunk_overlap=30):
    """Split a Markdown string into size-bounded content chunks.

    The text is first divided at level-1 (``#``) and level-2 (``##``)
    headers, then each header section is further cut into pieces of at
    most ``chunk_size`` characters.

    Args:
        data: Markdown text to split.
        chunk_size: Maximum chunk length, measured with ``len`` (i.e. in
            characters). Defaults to 500.
        chunk_overlap: Number of characters shared between consecutive
            chunks of the same section. Defaults to 30.

    Returns:
        A list of ``{"content": <chunk text>}`` dicts, one per chunk, in
        the order produced by the splitter.
    """
    # NOTE(review): both splitter classes are imported outside this view,
    # presumably from LangChain's text-splitter package -- confirm.
    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
    # strip_headers=False is passed so header lines are not stripped from
    # the section text handed to the second pass.
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )
    md_header_splits = markdown_splitter.split_text(data)

    # Second pass: enforce the size limit within each header section.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    splits = text_splitter.split_documents(md_header_splits)

    # Only the chunk text is returned; anything else carried on the split
    # objects (e.g. metadata) is intentionally not included in the output.
    return [{"content": s.page_content} for s in splits]