in src/co_op_translator/utils/llm/markdown_utils.py [0:0]
def split_markdown_content(content: str, max_tokens: int, tokenizer) -> list:
    """
    Split markdown content into chunks that each fit within a token budget.

    The content is partitioned around structural elements (fenced code
    blocks, inline HTML tags, and blockquote runs), which are kept intact:
    an element that would overflow the current chunk is emitted as its own
    chunk, even if it alone exceeds the budget. Plain text that overflows
    is spilled word by word into successive chunks; note that this fallback
    collapses the text's original whitespace to single spaces.

    Args:
        content (str): The markdown content to split.
        max_tokens (int): The maximum number of tokens allowed per chunk.
        tokenizer: The tokenizer to use for counting tokens.

    Returns:
        list: A list of markdown chunks.
    """
    structural = re.compile(
        r"(```[\s\S]*?```|<.*?>|(?:>\s+.*(?:\n>.*|\n(?!\n))*\n?)+)"
    )
    finished = []
    pending = []        # pieces of the chunk currently under construction
    pending_tokens = 0  # running token count for `pending`

    def flush():
        # Close the in-progress chunk (if any) and reset the accumulator.
        nonlocal pending, pending_tokens
        if pending:
            finished.append("".join(pending))
        pending = []
        pending_tokens = 0

    # The capturing group makes re.split interleave plain-text segments
    # with the structural elements themselves.
    for piece in structural.split(content):
        piece_tokens = count_tokens(piece, tokenizer)
        if pending_tokens + piece_tokens <= max_tokens:
            # Fits into the current chunk as-is.
            pending.append(piece)
            pending_tokens += piece_tokens
            continue
        if structural.match(piece):
            # Structural element that would overflow: never split it —
            # close the current chunk and emit the element on its own.
            flush()
            finished.append(piece)
            continue
        # Oversized plain text: spill word by word across chunks.
        for raw_word in piece.split():
            word = raw_word + " "
            cost = count_tokens(word, tokenizer)
            if pending_tokens + cost > max_tokens:
                # Mirrors the original behavior exactly, including
                # emitting an empty chunk when a lone word exceeds the
                # budget while nothing is pending.
                finished.append("".join(pending))
                pending = [word]
                pending_tokens = cost
            else:
                pending.append(word)
                pending_tokens += cost

    if pending:
        finished.append("".join(pending))
    return finished