def document_chunking()

in function_app.py

import json
import logging
import time

import azure.functions as func
import jsonschema

# Repo-local helpers referenced below (defined/imported elsewhere in this repo):
# get_request_schema, get_filename, DateTimeEncoder, BlobClient, DocumentChunker


def document_chunking(req: func.HttpRequest) -> func.HttpResponse:
    try:
        body = req.get_json()
        jsonschema.validate(body, schema=get_request_schema())

        if body:
            # Log the incoming request
            logging.info(f'[document_chunking_function] Invoked document_chunking skill. Number of items: {len(body["values"])}.')

            input_data = {}

            # Process one item at a time to avoid exceeding the AI Search custom skill timeout (230 seconds).
            # BatchSize should be set to 1 in the skillset definition; if it is greater, only the last item is processed.
            count_items = len(body["values"])
            filename = ""
            if count_items == 0:
                error_message = "Invalid body: 'values' is empty."
                logging.error(f"[document_chunking_function] {error_message}")
                return func.HttpResponse(error_message, status_code=400)
            if count_items > 1:
                logging.warning('BatchSize should be set to 1 in the Skillset definition. Processing only the last item.')
            for item in body["values"]:
                input_data = item["data"]
                filename = get_filename(input_data["documentUrl"])
                logging.info(f'[document_chunking_function] Chunking document: File {filename}, Content Type {input_data["documentContentType"]}.')
            
            start_time = time.time()

            # Enrich the input data with the document bytes and file name
            blob_client = BlobClient(input_data["documentUrl"])
            document_bytes = blob_client.download_blob()
            input_data['documentBytes'] = document_bytes
            input_data['fileName'] = filename

            # Chunk the document
            chunks, errors, warnings = DocumentChunker().chunk_documents(input_data)

            # Enrich chunks with metadata to be indexed
            for chunk in chunks:
                chunk["source"] = "blob"
            # Debug logging: log each chunk with the embedding vector removed and the content truncated to 100 characters
            for idx, chunk in enumerate(chunks):
                processed_chunk = chunk.copy()
                processed_chunk.pop('contentVector', None)
                if 'content' in processed_chunk and isinstance(processed_chunk['content'], str):
                    processed_chunk['content'] = processed_chunk['content'][:100]
                logging.debug(f"[document_chunking][{filename}] Chunk {idx + 1}: {json.dumps(processed_chunk, indent=4)}")

            # Format results (output_record follows the custom skill response contract; avoids shadowing body["values"])
            output_record = {
                "recordId": item['recordId'],
                "data": {"chunks": chunks},
                "errors": errors,
                "warnings": warnings
            }

            results = {"values": [output_record]}
            result = json.dumps(results, ensure_ascii=False, cls=DateTimeEncoder)

            end_time = time.time()
            elapsed_time = end_time - start_time
            
            logging.info(f'[document_chunking_function] Finished document_chunking skill in {elapsed_time:.2f} seconds.')
            return func.HttpResponse(result, mimetype="application/json")
        else:
            error_message = "Invalid body."
            logging.error(f"[document_chunking_function] {error_message}")
            return func.HttpResponse(error_message, status_code=400)
    except ValueError as e:
        error_message = f"Invalid body: {e}"
        logging.error(f"[document_chunking_function] {error_message}", exc_info=True)
        return func.HttpResponse(error_message, status_code=400)
    except jsonschema.exceptions.ValidationError as e:
        error_message = f"Invalid request: {e}"
        logging.error(f"[document_chunking_function] {error_message}", exc_info=True)
        return func.HttpResponse(error_message, status_code=400)
    except Exception as e:
        error_message = f"An unexpected error occurred: {e}"
        logging.error(f"[document_chunking_function] {error_message}", exc_info=True)
        return func.HttpResponse(error_message, status_code=500)
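
Since the comments above require BatchSize of 1 and mention the 230-second custom skill timeout, the skillset wiring for this function looks roughly like the fragment below. This is a minimal sketch of an Azure AI Search WebApiSkill definition expressed as a Python dict; the uri, context, and input/output source paths are illustrative assumptions, not values taken from this repo. Only batchSize and timeout reflect constraints stated in the function's comments.

# Minimal sketch of the WebApiSkill entry in the skillset definition.
# The uri and source paths are placeholders for this deployment's values.
document_chunking_skill = {
    "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
    "uri": "https://<function-app>.azurewebsites.net/api/document-chunking",
    "httpMethod": "POST",
    "timeout": "PT230S",   # the custom skill timeout ceiling noted above
    "batchSize": 1,        # ensures the function receives one item per call
    "context": "/document",
    "inputs": [
        {"name": "documentUrl", "source": "/document/metadata_storage_path"},
        {"name": "documentContentType", "source": "/document/metadata_content_type"}
    ],
    "outputs": [
        {"name": "chunks", "targetName": "chunks"}
    ]
}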
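
For a quick smoke test, the handler can be invoked directly with a synthetic request. This is a hedged sketch: the payload shape (values, recordId, data.documentUrl, data.documentContentType) comes from the function body above, while the route and blob URL are placeholders, and it assumes the handler is callable as the plain function listed here, outside the Functions host. With a reachable documentUrl this returns the chunked payload; with the placeholder it exercises only the error path.

import json
import azure.functions as func

# Illustrative payload following the custom skill contract used above;
# the blob URL and content type are placeholder values.
payload = {
    "values": [
        {
            "recordId": "0",
            "data": {
                "documentUrl": "https://<account>.blob.core.windows.net/documents/sample.pdf",
                "documentContentType": "application/pdf"
            }
        }
    ]
}

# Build a synthetic HttpRequest and call the handler directly
# (route is hypothetical; only the body is read by the function).
req = func.HttpRequest(
    method="POST",
    url="/api/document-chunking",
    body=json.dumps(payload).encode("utf-8"),
)
resp = document_chunking(req)
print(resp.status_code)
print(resp.get_body().decode("utf-8")[:500])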