in chunking/chunkers/multimodal_chunker.py [0:0]
def _create_text_chunks(self, document):
    """
    Build the list of text chunks for a document.

    The document's ``content`` is first passed through
    ``self._number_pagebreaks`` and then through ``self._chunk_content``.
    Each resulting piece whose token count is at least
    ``self.minimum_chunk_size`` is turned into a chunk by
    ``self._create_chunk``; smaller pieces are counted as skipped.

    Args:
        document (dict): The document to chunk; its ``'content'`` entry
            is read.

    Returns:
        list: The chunks produced by ``self._create_chunk``, in the order
        their pieces were yielded. Chunk ids are consecutive, starting
        at 1, and count only the pieces that were kept.
    """
    numbered_content = self._number_pagebreaks(document['content'])

    kept = []
    dropped = 0
    page_cursor = 1

    for piece, token_count, piece_offset, _piece_length in self._chunk_content(numbered_content):
        # The page cursor advances for every piece, including the ones that
        # end up being dropped below.
        page_cursor = self._update_page(piece, page_cursor)
        piece_page = self._determine_chunk_page(piece, page_cursor)

        large_enough = token_count >= self.minimum_chunk_size
        if not large_enough:
            dropped += 1
            continue

        # Ids are 1-based and only count pieces that were kept.
        kept.append(
            self._create_chunk(
                chunk_id=len(kept) + 1,
                content=piece,
                page=piece_page,
                offset=piece_offset,
            )
        )

    logging.debug(f"[multimodal_chunker][{self.filename}] {len(kept)} chunk(s) created")
    if dropped > 0:
        logging.debug(f"[multimodal_chunker][{self.filename}] {dropped} chunk(s) skipped")

    return kept