chunking/chunkers/doc_analysis_chunker.py
def _truncate_chunk(self, text):
    """
    Truncates and normalizes the text so that it fits within the maximum chunk size.

    The method first cleans up the text by collapsing redundant spaces and line breaks.
    If the text still exceeds the maximum token limit, it iteratively truncates the text
    until it fits within the limit.

    This method overrides the parent class's method because it adds logic to retain
    PageBreak markers within the truncated text.

    Args:
        text (str): The text to be truncated and normalized.

    Returns:
        str: The truncated and normalized text.
    """
    # Clean up text (collapse whitespace, including line breaks)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[\n\r]+', ' ', text).strip()

    # Remember the page-break markers present before any truncation
    page_breaks = re.findall(r'PageBreak\d{5}', text)

    # Truncate if necessary
    if self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
        logging.info(f"[doc_analysis_chunker][{self.filename}] token limit reached, truncating...")
        step_size = 1   # Initial number of characters removed per iteration
        iteration = 0   # Iteration counter
        while self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
            # Truncate the text from the end
            text = text[:-step_size]
            iteration += 1
            # Increase step size exponentially every 5 iterations (capped at 100)
            if iteration % 5 == 0:
                step_size = min(step_size * 2, 100)

        # Reinsert page breaks lost during truncation and recheck size
        for page_break in page_breaks:
            page_break_text = f" <!-- {page_break} -->"
            if page_break not in text:
                # Tokens needed to accommodate the page-break comment
                needed_size = self.token_estimator.estimate_tokens(page_break_text)
                # Truncate exactly as much as needed to make room for the page break
                while self.token_estimator.estimate_tokens(text) + needed_size > self.max_chunk_size:
                    text = text[:-1]  # Remove one character at a time
                # Now append the page break
                text += page_break_text

    return text
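
To illustrate the truncate-then-reinsert behavior outside the class, here is a minimal standalone sketch. The names truncate_with_page_breaks and _StubTokenEstimator are hypothetical, and the 4-characters-per-token estimate is only an assumption for demonstration; the real token estimator and chunker configuration in the repository are not shown here.

import re

class _StubTokenEstimator:
    """Hypothetical stand-in: assumes roughly 4 characters per token."""
    def estimate_tokens(self, text):
        return max(1, len(text) // 4)

def truncate_with_page_breaks(text, token_estimator, max_chunk_size):
    """Standalone sketch of the same approach: truncate, then re-append lost PageBreak markers."""
    text = re.sub(r'\s+', ' ', text).strip()
    page_breaks = re.findall(r'PageBreak\d{5}', text)

    if token_estimator.estimate_tokens(text) > max_chunk_size:
        step_size, iteration = 1, 0
        while token_estimator.estimate_tokens(text) > max_chunk_size:
            text = text[:-step_size]
            iteration += 1
            if iteration % 5 == 0:
                step_size = min(step_size * 2, 100)

        for page_break in page_breaks:
            page_break_text = f" <!-- {page_break} -->"
            if page_break not in text:
                needed = token_estimator.estimate_tokens(page_break_text)
                while token_estimator.estimate_tokens(text) + needed > max_chunk_size:
                    text = text[:-1]
                text += page_break_text
    return text

if __name__ == "__main__":
    sample = ("lorem ipsum " * 200) + "PageBreak00001 " + ("dolor sit amet " * 200)
    result = truncate_with_page_breaks(sample, _StubTokenEstimator(), max_chunk_size=100)
    # The chunk is cut down to the token budget, yet the page-break marker is preserved.
    print(len(result), "chars; marker kept:", "PageBreak00001" in result)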