chunking/document_chunking.py (52 lines of code) (raw):

import logging
import time

import jsonschema

from utils import get_filename_from_data

from .chunker_factory import ChunkerFactory


class DocumentChunker:
    """
    DocumentChunker class is responsible for processing and chunking documents
    into smaller parts.

    Chunking Process:
    -----------------
    The DocumentChunker handles the entire process of chunking a document, from
    initializing the appropriate chunker based on the document's file extension
    to splitting the document into manageable chunks.

    - Extension-based Chunking: The class uses `ChunkerFactory` to determine the
      correct chunker based on the file extension.
    - Error Handling: The class includes mechanisms to handle and log general
      errors.

    Error Messages:
    ---------------
    - Generates specific error messages for different scenarios.
    - Logs errors and exceptions with detailed information for debugging
      purposes.

    Logging:
    --------
    - Logs the chunking process, including the time taken, the number of chunks
      created, and any errors or warnings encountered.

    Returns:
    --------
    The `chunk_documents` method returns a tuple containing:
    - chunks: The list of document chunks created during the process.
    - errors: A list of error messages encountered during the chunking process.
    - warnings: A list of warnings generated during the chunking process.
    """

    def _error_message(self, exception=None, filename=""):
        """Generate, log, and return an error message for a chunking failure.

        Args:
            exception (Exception | None): The exception that triggered the
                error, if any; its text is appended to the message.
            filename (str): Name of the document being processed; included in
                the log line when non-empty.

        Returns:
            str: The human-readable error message (without the log prefix).
        """
        error_message = "An error occurred while processing the document."
        if exception is not None:
            error_message += f" Exception: {str(exception)}"
        # Only bracket the filename into the log line when we actually have one.
        file_tag = f"[{filename}]" if filename else ""
        logging.error(
            f"[document_chunking]{file_tag} Error: {error_message}, "
            f"Ingested Document: {file_tag}"
        )
        return error_message

    def chunk_document(self, data):
        """Chunk a single document into smaller parts.

        Args:
            data (dict): Document metadata and content, as accepted by
                `ChunkerFactory.get_chunker`.

        Returns:
            tuple[list, list, list]: (chunks, errors, warnings). On failure,
            chunks is empty and errors contains one message; this method never
            raises.
        """
        chunks = []
        errors = []
        warnings = []
        filename = get_filename_from_data(data)
        try:
            chunker = ChunkerFactory().get_chunker(data)
            chunks = chunker.get_chunks()
        except Exception as e:
            # Deliberate broad catch: chunking is best-effort, failures are
            # reported back to the caller as error messages, not exceptions.
            errors.append(self._error_message(exception=e, filename=filename))
        return chunks, errors, warnings

    def _format_messages(self, messages):
        """Wrap each plain-string message in a {"message": ...} dict."""
        return [{"message": msg} for msg in messages]

    def chunk_documents(self, data):
        """
        Processes and chunks the document provided in the input data, returning
        the chunks along with any errors or warnings encountered.

        Args:
            data (dict): A dictionary containing the document's metadata and
                content. Expected keys include:
                - "documentUrl" (str): URL of the document.
                - "documentBytes" (str): Base64-encoded bytes of the document.
                - Additional optional fields as defined in the input schema.

        Returns:
            tuple: A tuple containing three lists:
                - chunks (list[dict]): The list of document chunks created
                  during the process.
                - errors (list[dict]): Error messages encountered during
                  chunking, each wrapped as {"message": str}.
                - warnings (list[dict]): Warning messages generated during
                  chunking, each wrapped as {"message": str}.

        Raises:
            Exception: For any unexpected errors during the chunking process
                (jsonschema ValidationErrors are caught and reported in
                `errors` instead).

        Example:
            >>> chunker = DocumentChunker()
            >>> chunks, errors, warnings = chunker.chunk_documents(data)
        """
        chunks = []
        errors = []
        warnings = []
        # Initialized before the try block so the finally clause can always
        # reference them, even if get_filename_from_data raises.
        start_time = time.time()
        filename = ""
        try:
            filename = get_filename_from_data(data)
            logging.info(f"[document_chunking][{filename}] chunking document.")
            # Use self rather than constructing a throwaway DocumentChunker()
            # (also keeps subclass overrides of chunk_document effective).
            chunks, errors, warnings = self.chunk_document(data)
        except jsonschema.exceptions.ValidationError as e:
            error_message = f"Invalid request: {e}"
            logging.error(f"[document_chunking] {error_message}")
            errors.append(error_message)
        finally:
            if warnings:
                warnings = self._format_messages(warnings)
            if errors:
                errors = self._format_messages(errors)
            elapsed_time = time.time() - start_time
            logging.info(
                f"[document_chunking][{filename}] Finished chunking in {elapsed_time:.2f} seconds. "
                f"{len(chunks)} chunks. {len(errors)} errors. {len(warnings)} warnings."
            )
        return chunks, errors, warnings