in pipeline/ocr/fn-call-textract/main.py [0:0]
def fetch_textract_result(job_id: str, start_api: str = None):
    """Fetch a (potentially paginated) Textract async job result into memory.

    Result pages are requested one after another, following ``NextToken``
    until the service stops returning one, and merged into a single dict.
    Consecutive page requests are spaced at least ``1 / <TPS limit>`` seconds
    apart (measured from the start of the previous request).

    Args:
        job_id: Textract job identifier, sent as ``JobId`` on every request.
        start_api: Name of the API that started the job. Exactly
            ``"StartDocumentTextDetection"`` selects
            ``textract.get_document_text_detection`` and
            ``GET_DOCUMENT_TEXT_DETECTION_TPS_LIMIT``; any other value
            (including the default ``None``) selects
            ``textract.get_document_analysis`` and
            ``GET_DOCUMENT_ANALYSIS_TPS_LIMIT``.

    Returns:
        A dict combining every page. A key whose accumulated value is a list
        is extended with each later page's value for that key; any other key
        holds the value from the last page that contained it. ``NextToken``
        is never copied into the result.
    """
    # Both the API method and its rate limit depend only on `start_api`, so
    # pick them once instead of on every page.
    if start_api == "StartDocumentTextDetection":
        get_page = textract.get_document_text_detection
        tps_limit = GET_DOCUMENT_TEXT_DETECTION_TPS_LIMIT
    else:
        get_page = textract.get_document_analysis
        tps_limit = GET_DOCUMENT_ANALYSIS_TPS_LIMIT

    result = {}
    next_token = None
    while True:
        req = {"JobId": job_id}
        if next_token:
            req["NextToken"] = next_token

        # Monotonic clock: the throttle interval must not be affected by
        # system (wall-clock) time adjustments.
        request_started = time.monotonic()
        part = get_page(**req)

        for key, value in part.items():
            if key == "NextToken":
                continue
            if isinstance(result.get(key), list):
                # Already accumulating a list for this key: append this page.
                result[key] += value
            else:
                result[key] = value

        next_token = part.get("NextToken")
        # Stop on ANY falsy token, mirroring the truthiness test used when
        # building the request. Otherwise an empty-string token would never
        # be sent, and the first page would be re-fetched forever.
        if not next_token:
            return result

        # Wait out the remainder of the per-request interval before asking
        # for the next page, to stay under the API's TPS limit.
        next_request_at = request_started + 1 / tps_limit
        time.sleep(max(0, next_request_at - time.monotonic()))