def chunk_documents()

in chunking/document_chunking.py [0:0]
24 lines of code
4 McCabe index (conditional complexity)

    def chunk_documents(self, data):
        """
        Chunk the document described by ``data`` and report the outcome.

        The actual chunking is delegated to ``DocumentChunker().chunk_document``.
        Failures are not propagated to the caller: they are logged and
        recorded in the returned ``errors`` collection so that a result tuple
        is always produced.

        Args:
            data (dict):
                A dictionary containing the document's metadata and content.
                According to the input schema (not visible in this block),
                expected keys include:
                - "documentUrl" (str): URL of the document.
                - "documentBytes" (str): Base64-encoded bytes of the document.
                - Additional optional fields as defined in the input schema.

        Returns:
            tuple:
                A three-element tuple ``(chunks, errors, warnings)``:
                - chunks: The chunks returned by ``chunk_document`` (an empty
                  list if chunking failed).
                - errors: Error messages collected during chunking. When
                  non-empty, they are passed through ``self._format_messages``.
                - warnings: Warning messages collected during chunking. When
                  non-empty, they are passed through ``self._format_messages``.

        Raises:
            Nothing from the chunking step itself:
            ``jsonschema.exceptions.ValidationError`` and any other
            ``Exception`` are caught, logged and appended to ``errors``.
            Exceptions raised by ``self._format_messages`` still propagate.

        Example:
            >>> chunker = DocumentChunker()
            >>> chunks, errors, warnings = chunker.chunk_documents(data)
        """
        chunks = []
        errors = []
        warnings = []

        # Bound before the try block so the summary log below can always be
        # written, even when resolving the filename itself fails.
        start_time = time.time()
        filename = "unknown"

        try:
            filename = get_filename_from_data(data)

            logging.info(f"[document_chunking][{filename}] chunking document.")

            chunks, errors, warnings = DocumentChunker().chunk_document(data)

        except jsonschema.exceptions.ValidationError as e:
            error_message = f"Invalid request: {e}"
            logging.error(f"[document_chunking] {error_message}")
            errors.append(error_message)

        except Exception as e:
            # Previously a `return` inside `finally` discarded such exceptions
            # without any trace; keep the non-raising contract but make the
            # failure visible in both the log and the returned errors.
            error_message = f"An unexpected error occurred: {e}"
            logging.exception(f"[document_chunking][{filename}] {error_message}")
            errors.append(error_message)

        if warnings:
            warnings = self._format_messages(warnings)

        if errors:
            errors = self._format_messages(errors)

        elapsed_time = time.time() - start_time

        logging.info(
            f"[document_chunking][{filename}] Finished chunking in {elapsed_time:.2f} seconds. "
            f"{len(chunks)} chunks. {len(errors)} errors. {len(warnings)} warnings."
        )
        return chunks, errors, warnings