def fetch_textract_result()

in pipeline/ocr/fn-call-textract/main.py [0:0]


def fetch_textract_result(job_id: str, start_api: str = None) -> dict:
    """Fetch a (potentially paginated) Textract async job result into memory.

    Requests result pages for ``job_id`` until a response carries no
    continuation token, merging every page into one dict and pausing between
    requests so the configured TPS limit for the API in use is not exceeded.

    Args:
        job_id: Identifier of the async job, sent as ``JobId`` on every request.
        start_api: Name of the API that started the job.
            ``"StartDocumentTextDetection"`` selects
            ``textract.get_document_text_detection``; any other value
            (including the default ``None``) selects
            ``textract.get_document_analysis``.

    Returns:
        A dict combining all pages: keys whose accumulated value is a list are
        concatenated in page order, every other key holds the value from the
        last page that contained it. ``NextToken`` is never included.
    """
    is_text_detection = start_api == "StartDocumentTextDetection"
    # The API choice is fixed for the whole job, so resolve it once up front.
    get_page = (
        textract.get_document_text_detection
        if is_text_detection
        else textract.get_document_analysis
    )

    result = {}
    next_token = None
    while True:
        req = {"JobId": job_id}
        if next_token:
            req["NextToken"] = next_token
        # Monotonic clock: pacing must not be skewed by wall-clock adjustments.
        last_req_timestamp = time.monotonic()
        part = get_page(**req)

        for key, value in part.items():
            if key == "NextToken":
                continue
            if isinstance(result.get(key), list):
                # List fields accumulate across pages.
                result[key] += value
            else:
                # First occurrence, or a non-list field: latest page wins.
                result[key] = value

        next_token = part.get("NextToken")
        # Treat a missing *or empty* token as "done". The request above only
        # sends truthy tokens, so continuing on an empty one would re-fetch
        # the first page forever.
        if not next_token:
            return result

        tps_limit = (
            GET_DOCUMENT_TEXT_DETECTION_TPS_LIMIT
            if is_text_detection
            else GET_DOCUMENT_ANALYSIS_TPS_LIMIT
        )
        next_req_timestamp = last_req_timestamp + 1 / tps_limit
        # Sleep only for whatever part of the minimum interval is still left.
        time.sleep(max(0, next_req_timestamp - time.monotonic()))