in tools/doc_intelligence.py [0:0]
def analyze_document_from_bytes(self, file_bytes: bytes, filename: str, model: str = 'prebuilt-layout') -> tuple:
    """
    Submit an in-memory document for analysis and poll until the operation finishes.

    The document is POSTed to the ``documentModels/{model}:analyze`` endpoint of
    the configured service; the URL returned in the ``Operation-Location``
    response header is then polled every 2 seconds until the reported status
    is ``succeeded`` or ``failed``, or until a polling request errors out.

    Args:
        file_bytes (bytes): The bytes of the document to be analyzed.
        filename (str): The name of the document file. Used to derive the
            file extension (and from it the Content-Type) and to tag log lines.
        model (str): The model to use for document analysis.

    Returns:
        tuple: ``(result, errors)``. ``result`` is the ``analyzeResult`` payload
        on success, otherwise an empty dict; once the service has accepted the
        request (HTTP 202 with an ``Operation-Location`` header) it is also
        enriched with ``result_id`` and ``model_id``, even if polling later
        fails. ``errors`` is a list of human-readable error messages; it is
        empty on success. Failures are reported through ``errors`` rather than
        raised.
    """
    result = {}
    errors = []
    log_prefix = f"[docintelligence][{filename}]"
    # (connect, read) timeouts in seconds so a stalled connection cannot block
    # the caller forever. Timeout exceptions are reported via ``errors`` like
    # any other request failure.
    http_timeout = (10, 120)

    def record_error(message):
        # Log the failure and collect it for the caller.
        logging.error(f"{log_prefix} {message}")
        errors.append(message)

    # Reject unsupported file types before doing any network work.
    file_ext = self._get_file_extension(filename)
    if file_ext not in self.file_extensions:
        record_error(f"File extension '{file_ext}' is not supported.")
        return result, errors

    content_type = self._get_content_type(file_ext)

    # PDFs are always analyzed with high-resolution OCR. This is kept in a
    # local variable: writing it back to self.docint_features would leak the
    # PDF-only feature into every later call for other file types.
    # NOTE(review): confirm no other code relied on the attribute being set here.
    features = "ocr.highResolution" if file_ext == "pdf" else self.docint_features

    # Build the analyze endpoint, appending only the options that are set.
    request_endpoint = f"https://{self.service_name}.cognitiveservices.azure.com/{self.ai_service_type}/documentModels/{model}:analyze?api-version={self.api_version}"
    if features:
        request_endpoint += f"&features={features}"
    if self.output_content_format:
        request_endpoint += f"&outputContentFormat={self.output_content_format}"
    if self.analyze_output_options:
        request_endpoint += f"&output={self.analyze_output_options}"

    # Acquire a bearer token and build the request headers.
    try:
        token = self.credential.get_token("https://cognitiveservices.azure.com/.default")
        headers = {
            "Content-Type": content_type,
            "Authorization": f"Bearer {token.token}",
            "x-ms-useragent": "gpt-rag/1.0.0"
        }
        logging.debug(f"{log_prefix} Retrieved authentication token.")
    except ClientAuthenticationError as e:
        record_error(f"Authentication failed: {e}")
        return result, errors
    except Exception as e:
        record_error(f"Unexpected error during authentication: {e}")
        return result, errors

    # Submit the document for analysis.
    try:
        response = requests.post(request_endpoint, headers=headers, data=file_bytes, timeout=http_timeout)
    except Exception as e:
        record_error(f"Error when sending request to Document Intelligence API: {e}")
        return result, errors
    else:
        logging.info(f"{log_prefix} Sent analysis request.")

    # Anything other than 202 means the analysis was not accepted.
    if response.status_code != 202:
        error_messages = {
            404: "Resource not found. Please verify your request URL. The Document Intelligence API version you are using may not be supported in your region.",
        }
        record_error(error_messages.get(
            response.status_code,
            f"Document Intelligence request error, code {response.status_code}: {response.text}"
        ))
        return result, errors

    get_url = response.headers.get("Operation-Location")
    if not get_url:
        record_error("Operation-Location header not found in the response.")
        return result, errors

    # The result id is the last path segment of the operation URL, without
    # the query string.
    result_id = get_url.split("/")[-1].split("?")[0]
    logging.debug(f"{log_prefix} Extracted result_id: {result_id}")

    result_headers = headers.copy()
    result_headers["Content-Type"] = "application/json-patch+json"

    # Poll the operation until it succeeds, fails, or a request errors out.
    while True:
        try:
            result_response = requests.get(get_url, headers=result_headers, timeout=http_timeout)
            # Check the HTTP status before parsing: an error response may not
            # carry a JSON body, and the status code is the useful diagnostic.
            if result_response.status_code != 200:
                record_error(f"Document Intelligence polling error, code {result_response.status_code}: {result_response.text}")
                break
            result_json = result_response.json()
            status = result_json.get("status")
            if status == "failed":
                record_error(f"Document Intelligence polling error, code {result_response.status_code}: {result_response.text}")
                break
            if status == "succeeded":
                # ``or {}`` also covers an explicit null analyzeResult, which
                # would otherwise break the enrichment below.
                result = result_json.get('analyzeResult') or {}
                logging.debug(f"{log_prefix} Analysis succeeded.")
                break
            logging.debug(f"{log_prefix} Analysis in progress. Waiting for 2 seconds before retrying.")
            time.sleep(2)
        except Exception as e:
            record_error(f"Error during polling for analysis result: {e}")
            break

    # Enrich the result with the identifiers of this analysis.
    result['result_id'] = result_id
    result['model_id'] = model
    return result, errors