in chunking/document_chunking.py [0:0]
def chunk_documents(self, data):
    """
    Chunk the document described by ``data`` and report the outcome.

    The work is delegated to ``DocumentChunker().chunk_document(data)``.
    Schema validation failures are caught here and reported through the
    returned ``errors`` list instead of being raised. A summary line with the
    elapsed time and the chunk/error/warning counts is always logged, whether
    chunking succeeded or failed.

    Args:
        data (dict):
            A dictionary containing the document's metadata and content. Expected keys include:
            - "documentUrl" (str): URL of the document.
            - "documentBytes" (str): Base64-encoded bytes of the document.
            - Additional optional fields as defined in the input schema.

    Returns:
        tuple:
            A tuple containing three lists:
            - chunks (list[dict]): The list of document chunks created during the process.
            - errors (list[str]): Error messages encountered during chunking,
              including an "Invalid request: ..." entry when the input fails
              schema validation. Non-empty lists are passed through
              ``self._format_messages`` before being returned.
            - warnings (list[str]): Warning messages generated during chunking,
              formatted the same way as ``errors``.

    Raises:
        Exception: Any error other than ``jsonschema.exceptions.ValidationError``
            propagates to the caller after the summary line has been logged.

    Example:
        >>> chunker = DocumentChunker()
        >>> chunks, errors, warnings = chunker.chunk_documents(data)
    """
    chunks = []
    errors = []
    warnings = []

    # Bind everything the ``finally`` block reads *before* entering ``try``.
    # Otherwise a failure while extracting the filename would leave these
    # names unbound and the summary log would raise UnboundLocalError,
    # masking the original exception.
    start_time = time.time()
    filename = "unknown"

    try:
        filename = get_filename_from_data(data)
        logging.info(f"[document_chunking][{filename}] chunking document.")
        chunks, errors, warnings = DocumentChunker().chunk_document(data)
    except jsonschema.exceptions.ValidationError as e:
        # Invalid input is an expected failure mode: report it to the caller
        # through ``errors`` rather than raising.
        error_message = f"Invalid request: {e}"
        logging.error(f"[document_chunking] {error_message}")
        errors.append(error_message)
    finally:
        # Runs on success, on validation failure, and while an unexpected
        # exception is propagating, so the summary is always logged.
        if warnings:
            warnings = self._format_messages(warnings)
        if errors:
            errors = self._format_messages(errors)
        elapsed_time = time.time() - start_time
        logging.info(
            f"[document_chunking][{filename}] Finished chunking in {elapsed_time:.2f} seconds. "
            f"{len(chunks)} chunks. {len(errors)} errors. {len(warnings)} warnings."
        )

    return chunks, errors, warnings