chunking/document_chunking.py (52 lines of code) (raw):
import logging
import time
import jsonschema
from utils import get_filename_from_data
from .chunker_factory import ChunkerFactory
class DocumentChunker:
    """
    Process a document and split it into smaller chunks.

    Chunking Process:
    -----------------
    A chunker is obtained from `ChunkerFactory().get_chunker(data)` and its
    `get_chunks()` result is returned as the list of chunks. The choice of
    chunker (e.g. by file extension) is made by `ChunkerFactory`, not here.

    Error Handling:
    ---------------
    - Any exception raised while obtaining the chunker or producing chunks is
      converted into an error message and logged; it is not re-raised.
    - A `jsonschema` validation error raised while processing the request is
      reported as an "Invalid request" error entry.

    Logging:
    --------
    - Logs the start of chunking and, on completion, the elapsed time and the
      number of chunks, errors and warnings.

    Returns:
    --------
    The `chunk_documents` method returns a tuple containing:
    - chunks: The list of document chunks created during the process.
    - errors: A list of `{"message": <str>}` entries, one per error.
    - warnings: A list of `{"message": <str>}` entries, one per warning.
    """

    def __init__(self):
        """Create a chunker; the class keeps no instance state."""
        pass

    def _error_message(self, exception=None, filename=""):
        """
        Build, log and return a generic processing error message.

        Args:
            exception: Optional exception whose string form is appended to
                the message.
            filename: Optional name of the document being processed; used
                only to tag the log line.

        Returns:
            str: The error message (without the logging prefix).
        """
        error_message = "An error occurred while processing the document."
        if exception is not None:
            error_message += f" Exception: {str(exception)}"
        # The same "[filename]" tag is used as log prefix and as the
        # "Ingested Document" value; it is empty when no filename is known.
        file_tag = f"[{filename}]" if filename else ""
        logging.error(
            f"[document_chunking]{file_tag} Error: {error_message}, Ingested Document: {file_tag}"
        )
        return error_message

    def chunk_document(self, data):
        """
        Chunk a single document into smaller parts.

        Args:
            data: The request payload handed to `get_filename_from_data` and
                `ChunkerFactory().get_chunker`.

        Returns:
            tuple: `(chunks, errors, warnings)` where `errors` holds plain
            message strings and `warnings` is always empty in this method.

        Note:
            The filename lookup happens outside the `try`, so an exception
            raised by `get_filename_from_data` propagates to the caller.
        """
        chunks = []
        errors = []
        warnings = []
        filename = get_filename_from_data(data)
        try:
            chunker = ChunkerFactory().get_chunker(data)
            chunks = chunker.get_chunks()
        except Exception as e:
            # Best-effort boundary: report the failure instead of raising.
            errors.append(self._error_message(exception=e, filename=filename))
        return chunks, errors, warnings

    def _format_messages(self, messages):
        """Wrap each message in a `{"message": msg}` dictionary."""
        return [{"message": msg} for msg in messages]

    def chunk_documents(self, data):
        """
        Chunk the document described by `data` and report errors/warnings.

        Args:
            data (dict):
                The request payload. Per the original documentation the
                expected keys include "documentUrl" and "documentBytes",
                plus optional fields defined in the input schema (the schema
                itself is not visible in this module).

        Returns:
            tuple:
                A tuple containing three lists:
                - chunks (list): The chunks produced by the chunker.
                - errors (list[dict]): `{"message": str}` entries, one per
                  error encountered.
                - warnings (list[dict]): `{"message": str}` entries, one per
                  warning generated.

        Raises:
            Exception: Any exception other than
                `jsonschema.exceptions.ValidationError` raised by the
                filename lookup propagates after the summary is logged.
                Validation errors are caught and returned in `errors`.

        Example:
            >>> chunker = DocumentChunker()
            >>> chunks, errors, warnings = chunker.chunk_documents(data)
        """
        chunks = []
        errors = []
        warnings = []
        # Bind these before the try so the finally block can always log the
        # summary, even when the filename lookup itself fails.
        start_time = time.time()
        filename = "unknown"
        try:
            filename = get_filename_from_data(data)
            logging.info(f"[document_chunking][{filename}] chunking document.")
            chunks, errors, warnings = self.chunk_document(data)
        except jsonschema.exceptions.ValidationError as e:
            error_message = f"Invalid request: {e}"
            logging.error(f"[document_chunking] {error_message}")
            errors.append(error_message)
        finally:
            # Convert plain strings into the {"message": ...} response shape.
            if warnings:
                warnings = self._format_messages(warnings)
            if errors:
                errors = self._format_messages(errors)
            elapsed_time = time.time() - start_time
            logging.info(
                f"[document_chunking][{filename}] Finished chunking in {elapsed_time:.2f} seconds. "
                f"{len(chunks)} chunks. {len(errors)} errors. {len(warnings)} warnings."
            )
        return chunks, errors, warnings