in src/doc_builder/build_embeddings.py [0:0]
def create_markdown_chunks(text, page_info=None):
    """Split a markdown document into chunks organised by its headings.

    The text is split on ATX headings (lines of the form ``#... title``). Each
    heading becomes a ``MarkdownChunkNode`` and the text that follows it, up to
    the next heading, is appended to that node's ``content``. Text that appears
    before the first heading is discarded.

    Args:
        text: Markdown source to split.
        page_info: Passed through unchanged to ``root.get_chunks``.

    Returns:
        An empty list when ``text`` contains no headings, otherwise the result
        of ``root.get_chunks(page_info, chunk_len_chars=2000)``.

    NOTE(review): every level-1 heading replaces ``root``, so with several
    level-1 headings only the tree of the last one is returned -- confirm this
    is intended.
    """
    # "#" inside fenced code blocks (e.g. Python comments) would otherwise be
    # mistaken for markdown headings, so it is swapped for a placeholder while
    # the document is being split and restored afterwards.
    CODE_COMMENT_ESCAPE = "ESCAPE-PYTHON-CODE-COMMENT"
    CHUNK_LEN_CHARS = 2000
    _re_codeblock = re.compile(r"```.+?```", re.DOTALL)
    _re_heading = re.compile(r"\n#+ [^\n]+")

    text = _re_codeblock.sub(lambda m: m[0].replace("#", CODE_COMMENT_ESCAPE), text)

    # The heading pattern expects a leading newline; add one so that a heading
    # on the very first line is recognised as well.
    if not text.startswith("\n"):
        text = "\n" + text

    # Split by headers, keeping the headers themselves as list items.
    sections = re.split(f"({_re_heading.pattern})", text)

    root = None
    node = None
    for section in sections:
        if _re_heading.match(section):
            heading = section.strip()
            # Only the leading run of "#" defines the level; counting every "#"
            # would mis-level headings such as "## Using C#".
            heading_level = len(heading) - len(heading.lstrip("#"))
            node = MarkdownChunkNode(heading.replace(CODE_COMMENT_ESCAPE, "#"))
            if heading_level == 1 or root is None:
                root = node
            else:
                root.add_child(node, heading_level)
        elif node:
            # str.replace returns a new string, so the result has to be kept;
            # otherwise the placeholder leaks into the chunk content.
            section = section.replace(CODE_COMMENT_ESCAPE, "#")
            node.content += section.strip()

    if root is None:
        return []

    return root.get_chunks(page_info, chunk_len_chars=CHUNK_LEN_CHARS)