code/embedding-function/utilities/document_chunking/page.py (31 lines of code) (raw):

from typing import List from .document_chunking_base import DocumentChunkingBase from langchain.text_splitter import MarkdownTextSplitter from .chunking_strategy import ChunkingSettings from ..common.source_document import SourceDocument class PageDocumentChunking(DocumentChunkingBase): def __init__(self) -> None: pass def chunk( self, documents: List[SourceDocument], chunking: ChunkingSettings ) -> List[SourceDocument]: document_url = documents[0].source splitter = MarkdownTextSplitter.from_tiktoken_encoder( chunk_size=chunking.chunk_size, chunk_overlap=chunking.chunk_overlap ) documents_chunked = [] for idx, document in enumerate(documents): chunked_content_list = splitter.split_text(document.content) for chunked_content in chunked_content_list: documents_chunked.append( SourceDocument.from_metadata( content=chunked_content, document_url=document_url, metadata={ "offset": document.offset, "page_number": document.page_number, }, idx=idx, ) ) return documents_chunked