def create_markdown_chunks()

in src/doc_builder/build_embeddings.py [0:0]


def create_markdown_chunks(text, page_info=None):
    """Split markdown ``text`` into chunks organized by its headings.

    The text is split on markdown heading lines (``#``, ``##``, ...). Each
    heading becomes a ``MarkdownChunkNode``; the text that follows a heading
    (up to the next heading) is appended to that node's ``content``. A level-1
    heading becomes the root of the tree (a later level-1 heading replaces
    it); deeper headings are attached via ``root.add_child(node, level)``.
    If the document starts with a deeper heading, that first heading is used
    as the root.

    Text that appears before the first heading is discarded.

    Args:
        text: The markdown source to chunk.
        page_info: Passed through unchanged to ``root.get_chunks``.

    Returns:
        The result of ``root.get_chunks(page_info, chunk_len_chars=2000)``,
        or an empty list when ``text`` contains no heading.
    """
    # Lines starting with "#" inside fenced code blocks (e.g. Python comments)
    # would be mistaken for markdown headings by the split below, so every "#"
    # inside a code fence is temporarily swapped for a placeholder.
    CODE_COMMENT_ESCAPE = "ESCAPE-PYTHON-CODE-COMMENT"
    _re_codeblock = re.compile(r"```.+?```", re.DOTALL)
    text = _re_codeblock.sub(lambda m: m[0].replace("#", CODE_COMMENT_ESCAPE), text)

    # Insert a newline at the start if not present to standardize the split process
    if not text.startswith("\n"):
        text = "\n" + text

    # Split by headers, keeping the headers as delimiters
    header_pattern = r"\n#+ [^\n]+"
    sections = re.split(f"({header_pattern})", text)

    # Organize the content under each heading
    root = None
    node = None

    # Loop through sections to associate text with headings
    for section in sections:
        if section.strip() and re.match(header_pattern, section):
            heading = section.strip()
            # Only the leading run of "#" defines the level; counting every "#"
            # would mis-level headings whose title contains one (e.g. "## C#").
            heading_level = len(heading) - len(heading.lstrip("#"))
            node = MarkdownChunkNode(heading)
            if heading_level == 1 or root is None:
                root = node
            else:
                root.add_child(node, heading_level)
        elif node:
            # str.replace returns a new string; the result must be kept so the
            # placeholder is turned back into "#" in the stored content.
            section = section.replace(CODE_COMMENT_ESCAPE, "#")
            node.content += section.strip()

    if root is None:
        return []

    CHUNK_LEN_CHARS = 2000
    return root.get_chunks(page_info, chunk_len_chars=CHUNK_LEN_CHARS)