# _truncate_chunk()
#
# Excerpt from chunking/chunkers/base_chunker.py [0:0]


    def _truncate_chunk(self, text):
        """
        Truncates the chunk to ensure it fits within the maximum chunk size.
        
        This method first cleans up the text by removing unnecessary spaces and line breaks. 
        If the text still exceeds the maximum token limit, it iteratively truncates the text 
        until it fits within the limit.
        
        Args:
            text (str): The text to be truncated.
        
        Returns:
            str: The truncated chunk.
        """
        if self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
            logging.info(f"[base_chunker][{self.filename}] Token limit exceeded maximum length, truncating...")
            step_size = 1  # Initial step size
            iteration = 0  # Iteration counter

            while self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
                text = text[:-step_size]
                iteration += 1

                # Increase step size exponentially every 5 iterations
                if iteration % 5 == 0:
                    step_size = min(step_size * 2, 100)

        return text