in chunking/chunkers/langchain_chunker.py [0:0]
def get_chunks(self):
    """
    Split the document content into chunks based on the configured format and size criteria.

    Returns:
        list: A list of dictionaries, each representing a chunk of the document.
    """
    chunks = []

    if self.extension not in self.supported_formats:
        raise UnsupportedFormatError(
            f"[langchain_chunker] {self.filename} {self.extension} format is not supported"
        )

    blob_data = self.document_bytes
    text = self.decode_to_utf8(blob_data)
    text_chunks = self._chunk_content(text)

    skipped_chunks = 0
    chunk_id = 0
    for text_chunk, num_tokens in text_chunks:
        # Skip chunks that fall below the minimum token threshold.
        if num_tokens >= self.minimum_chunk_size:
            chunk_id += 1
            chunk_size = self.token_estimator.estimate_tokens(text_chunk)
            # Truncate oversized chunks so they fit within the token limit.
            if chunk_size > self.max_chunk_size:
                logging.info(f"[langchain_chunker][{self.filename}] truncating chunk of {chunk_size} tokens to fit within {self.max_chunk_size} tokens")
                text_chunk = self._truncate_chunk(text_chunk)
            chunk_dict = self._create_chunk(chunk_id, text_chunk)
            chunks.append(chunk_dict)
        else:
            skipped_chunks += 1

    logging.debug(f"[langchain_chunker][{self.filename}] {len(chunks)} chunk(s) created")
    if skipped_chunks > 0:
        logging.debug(f"[langchain_chunker][{self.filename}] {skipped_chunks} chunk(s) skipped")
    return chunks
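
# Usage sketch (illustrative only, not part of the original file): the class name
# LangChainChunker and its constructor arguments below are assumptions inferred from
# the attributes referenced above (filename, extension, document_bytes); the actual
# constructor may differ.
#
# chunker = LangChainChunker(
#     filename="report.md",
#     document_bytes=open("report.md", "rb").read(),
# )
# chunks = chunker.get_chunks()
# for chunk in chunks:
#     print(chunk)  # each element is a dict built by _create_chunk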