def get_pdf_blocks()

in src/pre_human_task_lambda.py [0:0]


def get_pdf_blocks(pdf_bytes, page_num, use_textract_only):
    """Get the Block objects for one page of a PDF and also return its type.

    Args:
        pdf_bytes: Raw content of the PDF document.
        page_num: 1-based number of the page to extract.
        use_textract_only: When truthy, skip the scanned-page detection and
            always take the scanned-PDF (Textract) path.

    Returns:
        A ``(blocks, is_native_pdf)`` tuple. ``blocks`` is whatever
        ``blocks_from_scanned_pdf`` or ``blocks_from_native_pdf`` returned for
        the page; ``is_native_pdf`` is ``True`` only when the native PDF
        parser path was used.

    Raises:
        IndexError: If ``page_num`` is not between 1 and the number of pages.
    """
    with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
        pages = pdf.pages
        # Guard the 1-based -> 0-based conversion: without it page_num == 0
        # would index pages[-1] and silently return the *last* page, while the
        # scanned path below would still be handed the original page_num.
        if not 1 <= page_num <= len(pages):
            raise IndexError(
                f"page_num {page_num} is out of range; PDF has {len(pages)} page(s)"
            )
        page = pages[page_num - 1]
        width, height = page.width, page.height

        if use_textract_only or is_scanned_pdf(page.images, width, height):
            print(f"use_textract_only = {use_textract_only} or Scanned PDF. getting blocks from textract")
            # The scanned path works from the raw bytes; page dimensions are
            # passed along as plain floats.
            blocks = blocks_from_scanned_pdf(pdf_bytes, page_num, dims=(float(width), float(height)))
            is_native_pdf = False
        else:
            print(f"use_textract_only = {use_textract_only} or Native PDF, getting blocks from pdf parser")
            # The native path needs the open page object, so it must run
            # while the PDF is still open.
            blocks = blocks_from_native_pdf(page, page_num, width, height)
            is_native_pdf = True
    return blocks, is_native_pdf