def split_markdown_content()

in src/co_op_translator/utils/llm/markdown_utils.py


import re

# count_tokens is presumably defined alongside this function in
# markdown_utils.py; it returns the token count of a string for the
# given tokenizer.


def split_markdown_content(content: str, max_tokens: int, tokenizer) -> list:
    """
    Split markdown content into chunks that respect max_tokens, keeping fenced
    code blocks, HTML tags, and blockquote runs intact; an oversized block
    becomes a chunk of its own.

    Args:
        content (str): The markdown content to split.
        max_tokens (int): The maximum number of tokens allowed per chunk.
        tokenizer: The tokenizer to use for counting tokens.

    Returns:
        list: A list of markdown chunks.
    """
    chunks = []
    # Segments that must stay intact: fenced code blocks, inline HTML tags,
    # and runs of blockquote lines.
    block_pattern = re.compile(
        r"(```[\s\S]*?```|<.*?>|(?:>\s+.*(?:\n>.*|\n(?!\n))*\n?)+)"
    )
    # The pattern is a capturing group, so re.split keeps the matched blocks
    # in the result alongside the plain-text segments between them.
    parts = block_pattern.split(content)

    current_chunk = []
    current_length = 0

    for part in parts:
        part_tokens = count_tokens(part, tokenizer)

        if current_length + part_tokens <= max_tokens:
            # The part fits in the current chunk: accumulate it.
            current_chunk.append(part)
            current_length += part_tokens
        elif block_pattern.match(part):
            # An indivisible block (code fence, HTML tag, or blockquote) that
            # does not fit: flush the current chunk, then emit the block as a
            # chunk of its own, even if it exceeds max_tokens by itself.
            if current_chunk:
                chunks.append("".join(current_chunk))
            chunks.append(part)
            current_chunk = []
            current_length = 0
        else:
            # Plain text that does not fit: fall back to splitting on words.
            words = part.split()
            for word in words:
                word_tokens = count_tokens(word + " ", tokenizer)
                if current_length + word_tokens > max_tokens:
                    if current_chunk:
                        chunks.append("".join(current_chunk))
                    current_chunk = [word + " "]
                    current_length = word_tokens
                else:
                    current_chunk.append(word + " ")
                    current_length += word_tokens

    # Flush any remaining accumulated content as the final chunk.
    if current_chunk:
        chunks.append("".join(current_chunk))

    return chunks
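
Example usage (a minimal sketch, not taken from the source): it assumes the
function is importable from the package path above and uses tiktoken's
"cl100k_base" encoding purely as an illustrative tokenizer; the project may
supply a different tokenizer in practice.


import tiktoken

from co_op_translator.utils.llm.markdown_utils import split_markdown_content

# Any tokenizer object that count_tokens knows how to use will do; tiktoken
# is shown here only for illustration.
tokenizer = tiktoken.get_encoding("cl100k_base")

document = (
    "# Heading\n\n"
    "Some introductory prose that can be split on word boundaries.\n\n"
    "```python\nprint('kept intact')\n```\n\n"
    "> A blockquote that is also kept together.\n"
)

chunks = split_markdown_content(document, max_tokens=50, tokenizer=tokenizer)
for i, chunk in enumerate(chunks):
    print(f"chunk {i}: {len(chunk)} characters")


Because indivisible blocks are emitted whole, a single chunk can still exceed
max_tokens when one code block, HTML tag, or blockquote run is larger than the
limit on its own; callers should be prepared to handle oversized chunks.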