# chunking/chunkers/base_chunker.py
import logging
import os
import re
from charset_normalizer import detect
from tools import AzureOpenAIClient, GptTokenEstimator
from utils.file_utils import get_file_extension, get_filepath_from_data
class BaseChunker:
"""
BaseChunker class serves as an abstract base class for implementing chunking strategies
across various document formats. It provides essential methods for managing and processing
document content, enabling subclasses to define specific chunking logic.
Initialization:
---------------
The BaseChunker class is initialized with a `data` dictionary containing the document's metadata
and content. The dictionary can include the following keys:
Required Keys:
--------------
    - `documentUrl` (str): The document's URL.
    - `documentContentType` (str): The MIME type of the document content.
    - `fileName` (str): The name of the document file, used to derive the title and extension.
Optional Keys:
--------------
- `documentSasToken` (str): The SAS token for accessing the document. Can be an empty string
if not using storage account or key-based storage access.
- `documentContent` (str): The raw content of the document. Defaults to an empty string if not provided.
- `documentBytes` (bytes): The binary content of the document. If not provided, `document_bytes` is set to `None`,
and a warning is logged.
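    Example Input:
    --------------
    An illustrative payload (all values below are placeholders, not real endpoints or tokens):
        data = {
            "documentUrl": "https://myaccount.blob.core.windows.net/docs/report.pdf",
            "documentContentType": "application/pdf",
            "fileName": "report.pdf",
            "documentSasToken": "?sv=...",
            "documentBytes": b"%PDF-1.7 ...",
        }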
Key Attributes:
---------------
- `url` (str): The document's URL.
- `sas_token` (str): The SAS token for accessing the document. May be empty if not required.
- `file_url` (str): The full URL constructed by concatenating `url` and `sas_token`.
    - `filename` (str): The name of the file, taken from the `fileName` key.
    - `filepath` (str): The path of the file, derived from the input data.
    - `extension` (str): The file extension extracted from the filename.
- `document_content` (str): The raw content of the document.
- `document_bytes` (bytes or None): The binary content of the document if provided; otherwise, `None`.
- `token_estimator` (GptTokenEstimator): An instance for estimating token counts.
- `aoai_client` (AzureOpenAIClient): An instance of the Azure OpenAI client initialized with the filename.
Abstract Method:
----------------
- `get_chunks`: An abstract method that must be implemented by subclasses to define
specific chunking logic. This method is responsible for splitting the document content
into manageable chunks.
Chunk Creation:
---------------
- `_create_chunk`: Initializes a chunk dictionary with metadata such as chunk ID, content,
page number, and related images or files. This method also generates a content vector
using Azure OpenAI embeddings.
Title Extraction:
-----------------
- `_extract_title_from_filename`: Extracts a title from the document's filename by removing
the extension, replacing delimiters with spaces, and capitalizing words appropriately.
This method ensures a user-friendly title is generated for the document.
    Text Truncation:
    ----------------
    - `_truncate_chunk`: Truncates the text to ensure it fits within the maximum chunk
      size defined by the subclass (`max_chunk_size`). If the estimated token count exceeds
      the limit, the text is iteratively shortened from the end until it fits.
Error Handling:
---------------
    - Errors encountered in `_extract_title_from_filename` during title extraction are caught
      and logged, and the filename itself is returned as a fallback title.
- If `document_bytes` is not provided during initialization, a warning is logged to inform
the user.
Logging:
--------
- The class includes logging for truncation warnings and title extraction errors to facilitate
debugging and monitoring of the chunking process.
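    Example:
    --------
    A minimal illustrative subclass sketch (the `SimpleTextChunker` name, the 2000-character
    window, and the 512-token budget below are hypothetical, not part of this module):
        class SimpleTextChunker(BaseChunker):
            def __init__(self, data):
                super().__init__(data)
                self.max_chunk_size = 512  # token budget used by _truncate_chunk
            def get_chunks(self):
                chunks = []
                for i, start in enumerate(range(0, len(self.document_content), 2000)):
                    piece = self._truncate_chunk(self.document_content[start:start + 2000])
                    chunks.append(self._create_chunk(chunk_id=i, content=piece))
                return chunks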
"""
def __init__(self, data):
"""
Initializes the BaseChunker with the provided data dictionary.
Parameters
----------
data : dict
A dictionary containing the following keys:
Required:
- "documentUrl" (str): The URL of the document.
- "documentContentType" (str): The MIME type of the document content.
- "documentBytes" (bytes): The binary content of the document.
Optional:
- "documentSasToken" (str): The SAS token for accessing the document. Can be an empty string
if not using storage account or key-based storage access.
- "documentContent" (str): The raw content of the document.
Attributes
----------
url : str
The document's URL.
sas_token : str
The SAS token for accessing the document. May be empty if not required.
file_url : str
The full URL constructed by concatenating `url` and `sas_token`.
        filename : str
            The name of the file, taken from the `fileName` key.
        filepath : str
            The path of the file, derived from the input data.
        extension : str
            The file extension extracted from the filename.
document_content : str
The raw content of the document.
document_bytes : bytes or None
The binary content of the document if provided; otherwise, `None`.
token_estimator : GptTokenEstimator
An instance for estimating token counts.
aoai_client : AzureOpenAIClient
An instance of the Azure OpenAI client initialized with the filename.
"""
self.data = data
self.url = data['documentUrl']
self.sas_token = data.get('documentSasToken', "")
self.file_url = f"{self.url}{self.sas_token}"
self.filename = data['fileName']
self.filepath = get_filepath_from_data(data)
self.extension = get_file_extension(self.filename)
document_content = data.get('documentContent')
self.document_content = document_content if document_content else ""
self.token_estimator = GptTokenEstimator()
self.aoai_client = AzureOpenAIClient(document_filename=self.filename)
document_bytes = data.get('documentBytes')
if document_bytes:
self.document_bytes = document_bytes
else:
self.document_bytes = None
logging.warning(f"[base_chunker][{self.filename}] Document bytes not provided.")
self.embeddings_vector_size = int(os.getenv("AZURE_EMBEDDINGS_VECTOR_SIZE", "3072"))
    def get_chunks(self):
        """Abstract method to be implemented by subclasses to split the document into chunks."""
        raise NotImplementedError("Subclasses must implement get_chunks().")
def _create_chunk(
self,
chunk_id,
content,
summary="",
embedding_text="",
title="",
page=0,
offset=0,
related_images=None,
related_files=None
):
"""
Initialize a chunk dictionary with truncated content if necessary.
This method creates a chunk dictionary with various attributes, including an embedding vector.
If an embedding_text is provided, it will use the embedding_text to generate the embedding.
If no embedding_text is available, it will fall back to using the content text.
Args:
chunk_id (str): Sequential number for the chunk.
content (str): The main content of the chunk.
summary (str, optional): A brief summary of the content. Defaults to an empty string.
embedding_text (str, optional): Text used to generate the embedding. Defaults to an empty string.
title (str, optional): The title of the chunk. Defaults to an empty string.
page (int, optional): The page number where the chunk is located. Defaults to 0.
offset (int, optional): The offset position of the chunk in the content. Defaults to 0.
related_images (list, optional): A list of related images. Defaults to an empty list.
related_files (list, optional): A list of related files. Defaults to an empty list.
Returns:
dict: A dictionary representing the chunk with all the attributes, including the embedding vector.
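        Example (illustrative values):
            chunk = self._create_chunk(
                chunk_id=1,
                content="First section of the document...",
                title="Quarterly Report",
                page=3,
            )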
"""
# Initialize related_images and related_files if they are None
if related_images is None:
related_images = []
if related_files is None:
related_files = []
# Define the maximum allowed byte size for the content field
MAX_CONTENT_BYTES = 32766
# Function to truncate content to fit within the byte limit without breaking UTF-8 characters
def truncate_content(content_str, max_bytes):
encoded_content = content_str.encode('utf-8')
if len(encoded_content) <= max_bytes:
return content_str # No truncation needed
# Truncate the byte array to the maximum allowed size
truncated_bytes = encoded_content[:max_bytes]
# Decode back to string, ignoring any incomplete characters at the end
return truncated_bytes.decode('utf-8', 'ignore')
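        # Example: if the byte cut lands in the middle of a multi-byte character such as "é",
        # errors='ignore' silently drops the partial bytes instead of raising an exception.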
# Truncate the content if it exceeds the maximum byte size
truncated_content = truncate_content(content, MAX_CONTENT_BYTES)
        # Log a warning if the content had to be truncated to fit the byte limit
        if truncated_content != content:
            logging.warning(
                f"[base_chunker][{self.filename}] Content truncated from "
                f"{len(content.encode('utf-8'))} to {MAX_CONTENT_BYTES} bytes."
            )
        # Use embedding_text if provided; otherwise, fall back to the truncated content
        embedding_text = embedding_text if embedding_text else truncated_content
content_vector = self.aoai_client.get_embeddings(embedding_text)
return {
"chunk_id": chunk_id,
"url": self.url,
"filepath": self.filepath,
"content": truncated_content,
"imageCaptions": "",
"summary": summary,
"category": "",
"length": len(truncated_content), # Length in characters
"contentVector": content_vector,
"captionVector": [0.0] * self.embeddings_vector_size,
"title": self._extract_title_from_filename(self.filename) if not title else title,
"page": page,
"offset": offset,
"relatedImages": related_images,
"relatedFiles": related_files
}
def _extract_title_from_filename(self, filename):
"""
Extracts a title from a filename by removing the extension and
replacing underscores or other delimiters with spaces,
then capitalizing words appropriately.
Args:
filename (str): The name of the file.
Returns:
str: The extracted title.
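        Example:
            "salesReport_2024.pdf" -> "Sales Report 2024"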
"""
try:
# Remove the file extension
title = os.path.splitext(filename)[0]
# Replace common delimiters with spaces
title = re.sub(r'[_-]', ' ', title)
# Add a space before any capital letter that follows a lowercase letter or number
title = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', title)
# Capitalize the first letter of each word
title = title.title()
return title
except Exception as e:
logging.error(f"[base_chunker][{filename}] Error extracting title from filename '{filename}': {e}")
return "filename"
def _truncate_chunk(self, text):
"""
Truncates the chunk to ensure it fits within the maximum chunk size.
        This method compares the estimated token count of the text against `self.max_chunk_size`
        (which is expected to be defined by the subclass). If the text exceeds the limit,
        it is iteratively truncated from the end until it fits.
Args:
text (str): The text to be truncated.
Returns:
str: The truncated chunk.
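        Example (illustrative; assumes the subclass set `self.max_chunk_size = 512`):
            safe_text = self._truncate_chunk(section_text)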
"""
if self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
logging.info(f"[base_chunker][{self.filename}] Token limit exceeded maximum length, truncating...")
step_size = 1 # Initial step size
iteration = 0 # Iteration counter
while self.token_estimator.estimate_tokens(text) > self.max_chunk_size:
text = text[:-step_size]
iteration += 1
# Increase step size exponentially every 5 iterations
if iteration % 5 == 0:
step_size = min(step_size * 2, 100)
return text
    def decode_to_utf8(self, blob_data):
        """
        Decodes raw document bytes to text using the detected character encoding,
        falling back to UTF-8 with replacement characters if detection or decoding fails.
        """
        # Detect the encoding; charset_normalizer may report None if nothing could be detected
        detected = detect(blob_data)
        encoding = detected.get('encoding') or 'utf-8'  # Default to UTF-8 if detection fails
# Decode the data to text using the detected encoding
try:
text = blob_data.decode(encoding, errors='replace')
except (UnicodeDecodeError, LookupError):
# Fallback in case of errors
logging.info(f"[base_chunker][{self.filename}] Failed to decode with detected encoding: {encoding}. Falling back to 'utf-8'.")
text = blob_data.decode('utf-8', errors='replace')
return text