def _create_text_chunks()

in chunking/chunkers/multimodal_chunker.py [0:0]


    def _create_text_chunks(self, document):
        """
        Build the list of text chunks for a document.

        The value stored under ``document['content']`` is passed through
        ``_number_pagebreaks`` and then split by ``_chunk_content``. Every piece
        whose token count reaches ``self.minimum_chunk_size`` is handed to
        ``_create_chunk``; smaller pieces are dropped and only counted for the
        debug log.

        Args:
            document (dict): The document; its text is read from ``'content'``.

        Returns:
            list: The objects returned by ``_create_chunk``, in the order the
            pieces were produced, with chunk ids numbered consecutively from 1.
        """
        numbered_content = self._number_pagebreaks(document['content'])

        kept = []
        dropped = 0
        page_cursor = 1

        for piece, token_count, piece_offset, _piece_length in self._chunk_content(numbered_content):
            # Page tracking runs for every piece, including the ones that are
            # dropped below, so the running page value stays in sync.
            page_cursor = self._update_page(piece, page_cursor)
            piece_page = self._determine_chunk_page(piece, page_cursor)

            if token_count < self.minimum_chunk_size:
                dropped += 1
                continue

            # Ids are 1-based and only advance for pieces that are kept.
            kept.append(
                self._create_chunk(
                    chunk_id=len(kept) + 1,
                    content=piece,
                    page=piece_page,
                    offset=piece_offset,
                )
            )

        logging.debug(f"[multimodal_chunker][{self.filename}] {len(kept)} chunk(s) created")
        if dropped:
            logging.debug(f"[multimodal_chunker][{self.filename}] {dropped} chunk(s) skipped")

        return kept