tablestore-java-mcp-server-rag/knowledge-data-generator/chunk.py (37 lines of code) (raw):
from markdown_it import MarkdownIt
def parse_markdown_to_ast(markdown_text):
    """Parse Markdown source with a freshly created ``MarkdownIt`` instance.

    Args:
        markdown_text: The Markdown source to parse.

    Returns:
        A ``(tokens, md)`` tuple: the result of ``md.parse(markdown_text)``
        and the ``MarkdownIt`` instance that produced it, so callers can
        reuse the same instance (e.g. its renderer and options) later.
    """
    parser = MarkdownIt()
    return parser.parse(markdown_text), parser
def get_token_text(token):
    """Return the text carried by a single parser token.

    Args:
        token: An object exposing ``type`` and, depending on the type,
            ``children`` and/or ``content``.

    Returns:
        * ``"inline"`` tokens: the concatenated ``content`` of every child
          that has a ``content`` attribute.
        * ``"paragraph_open"`` / ``"paragraph_close"`` tokens: ``""``.
        * any other token: its own ``content``, or ``""`` when that is falsy.
    """
    kind = token.type

    # Paragraph delimiters never contribute text.
    if kind in ("paragraph_open", "paragraph_close"):
        return ""

    # Everything that is not an inline container carries its text directly.
    if kind != "inline":
        return token.content or ""

    # Inline containers keep their text on their children.
    pieces = []
    for child in token.children:
        if hasattr(child, "content"):
            pieces.append(child.content)
    return "".join(pieces)
def split_ast_by_size(tokens, max_size):
    """Group a token stream into chunks of roughly ``max_size`` characters.

    Chunk boundaries are only placed between complete blocks, never inside
    one. Deciding per token (as a naive greedy split does) separates an
    opening token from its content and its closing token, which yields
    chunks that render to dangling fragments such as a lone ``<p>``.

    A block is a run of tokens that ends with a token at the shallowest
    nesting level of the stream whose ``nesting`` is ``0`` (self-contained)
    or ``-1`` (closing). This relies on the ``level`` / ``nesting``
    attributes of markdown-it tokens.

    Args:
        tokens: Iterable of tokens, as returned by ``MarkdownIt.parse``.
        max_size: Soft upper bound on the text length of a chunk, measured
            with ``get_token_text``. A single block longer than this is not
            split; it becomes a chunk of its own.

    Returns:
        A list of chunks, each a non-empty list of tokens. Concatenating
        the chunks gives back the input tokens in their original order.
    """
    tokens = list(tokens)
    if not tokens:
        return []

    # Use the shallowest level actually present rather than assuming 0, so
    # that a nested sub-stream is still divided into its own blocks.
    base_level = min(token.level for token in tokens)

    # Pass 1: cut the stream into complete blocks.
    blocks = []
    pending = []
    for token in tokens:
        pending.append(token)
        # nesting <= 0 at the base level means "self-contained" or "closing",
        # i.e. the block that was being collected is now complete.
        if token.level == base_level and token.nesting <= 0:
            blocks.append(pending)
            pending = []
    if pending:
        # Unterminated trailing block (malformed stream): keep it anyway so
        # that no token is dropped.
        blocks.append(pending)

    # Pass 2: greedily pack whole blocks into chunks.
    chunks = []
    current_chunk = []
    current_size = 0
    for block in blocks:
        block_size = sum(len(get_token_text(token)) for token in block)
        # Start a new chunk only if the current one already holds something;
        # otherwise an oversized block would leave an empty chunk behind.
        if current_chunk and current_size + block_size > max_size:
            chunks.append(current_chunk)
            current_chunk = []
            current_size = 0
        current_chunk.extend(block)
        current_size += block_size

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def tokens_to_markdown(tokens, md):
    """Render a list of tokens with the renderer attached to ``md``.

    Args:
        tokens: The tokens to render.
        md: Object exposing ``renderer`` and ``options``; the renderer is
            called as ``render(tokens, md.options, {})``.

    Returns:
        Whatever ``md.renderer.render`` returns.

    NOTE(review): with a default ``MarkdownIt`` instance the renderer is
    presumably markdown-it-py's HTML renderer, so despite this function's
    name the output would be HTML rather than Markdown -- confirm.
    """
    renderer = md.renderer
    # A fresh, empty environment dict is handed to every render call.
    env = {}
    return renderer.render(tokens, md.options, env)
def process_markdown(markdown_text, max_size):
    """Parse Markdown, split the tokens by size, and render each chunk.

    Args:
        markdown_text: The Markdown source to process.
        max_size: Size limit forwarded to ``split_ast_by_size``.

    Returns:
        A list with one rendered output per chunk, in document order, as
        produced by ``tokens_to_markdown``.
    """
    ast_tokens, parser = parse_markdown_to_ast(markdown_text)
    token_groups = split_ast_by_size(ast_tokens, max_size)

    rendered = []
    for group in token_groups:
        # Render with the same parser instance that produced the tokens.
        rendered.append(tokens_to_markdown(group, parser))
    return rendered
def to_chunks(markdown_text, max_size):
    """Split Markdown text into rendered chunks.

    Thin wrapper that delegates to ``process_markdown`` with the same
    arguments and returns its result unchanged.

    Args:
        markdown_text: The Markdown source to split.
        max_size: Size limit passed through to ``process_markdown``.

    Returns:
        The list returned by ``process_markdown``.
    """
    return process_markdown(markdown_text, max_size)