def _chunk_content()

in chunking/chunkers/langchain_chunker.py [0:0]


    def _chunk_content(self, text):
        """
        Split the document text into chunks according to the file format and token limits.

        Args:
            text (str): The full content of the document to be chunked.

        Yields:
            tuple: (chunked_content (str), estimated token count of the chunk (int)).

        The method proceeds as follows:
        1. Looks up the document's format from the file extension.
        2. Chooses a format-appropriate LangChain text splitter
           (markdown, python code, or recursive character splitting on
           sentence endings and word breaks), configured with the
           instance's max chunk size and token overlap.
        3. Splits the text and yields each chunk with its estimated token count.
        """
        file_format = self.supported_formats[self.extension]

        if file_format == "markdown":
            splitter = MarkdownTextSplitter.from_tiktoken_encoder(
                chunk_size=self.max_chunk_size,
                chunk_overlap=self.token_overlap
            )
        elif file_format == "python":
            splitter = PythonCodeTextSplitter.from_tiktoken_encoder(
                chunk_size=self.max_chunk_size,
                chunk_overlap=self.token_overlap
            )
        else:
            # Prefer breaking at sentence endings, then at whitespace, so
            # chunks end at natural language boundaries where possible.
            sentence_endings = [".", "!", "?"]
            word_breaks = [" ", "\n", "\t"]
            splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                separators=sentence_endings + word_breaks,
                chunk_size=self.max_chunk_size,
                chunk_overlap=self.token_overlap
            )

        chunked_content_list = splitter.split_text(text)

        for chunked_content in chunked_content_list:
            chunk_size = self.token_estimator.estimate_tokens(chunked_content)
            yield chunked_content, chunk_size  # type: ignore