in chunking/chunkers/base_chunker.py [0:0]
def _truncate_chunk(self, text):
    """
    Truncate the chunk so it fits within the maximum chunk size.

    The text is first cleaned up by collapsing unnecessary spaces and
    line breaks. If it still exceeds the maximum token limit, it is
    iteratively truncated from the end until it fits, with the step
    size growing exponentially so that long overshoots converge quickly.

    Args:
        text (str): The text to be truncated.

    Returns:
        str: The truncated chunk.
    """
    # Clean up the text: collapse all whitespace runs (including line breaks)
    # into single spaces, as described in the docstring.
    text = " ".join(text.split())

    if self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
        logging.info(f"[base_chunker][{self.filename}] Chunk exceeds maximum token limit, truncating...")
        step_size = 1  # Characters removed per iteration
        iteration = 0  # Iteration counter
        while self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
            text = text[:-step_size]
            iteration += 1
            # Increase the step size exponentially every 5 iterations (capped at 100)
            if iteration % 5 == 0:
                step_size = min(step_size * 2, 100)
    return text
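
For illustration, here is a minimal, self-contained sketch of the same truncation loop. The `estimate_tokens` heuristic below (roughly 4 characters per token) is an assumption made for the example; the real `token_estimator` used by the chunker may count tokens differently.

```python
import logging

def estimate_tokens(text: str) -> int:
    # Hypothetical estimator: ~4 characters per token (a common rough heuristic).
    return len(text) // 4

def truncate_chunk(text: str, max_chunk_size: int) -> str:
    """Standalone version of the truncation loop, for demonstration only."""
    text = " ".join(text.split())  # Collapse whitespace, as in the method above
    if estimate_tokens(text) > max_chunk_size:
        logging.info("Chunk exceeds maximum token limit, truncating...")
        step_size = 1
        iteration = 0
        while estimate_tokens(text) > max_chunk_size:
            text = text[:-step_size]  # Drop characters from the end
            iteration += 1
            # Double the step size every 5 iterations, capped at 100,
            # so heavily oversized chunks converge quickly.
            if iteration % 5 == 0:
                step_size = min(step_size * 2, 100)
    return text

# A 10,000-character input (~2,500 tokens under this estimator) truncated to
# a 100-token budget: the loop removes characters in growing steps until the
# estimate fits, leaving 365 characters (~91 tokens) under this schedule.
print(len(truncate_chunk("x" * 10_000, 100)))  # -> 365
```

The exponential step schedule is the key design choice: a fixed step of 1 would need thousands of `estimate_tokens` calls for a badly oversized chunk, while doubling every few iterations keeps the overshoot past the limit small (at most 99 characters) without paying for per-character re-estimation.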