in src/pre_human_task_lambda.py [0:0]
def get_pdf_blocks(pdf_bytes, page_num, use_textract_only):
    """Get the Block objects for one page of a PDF and report its type.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        page_num: 1-based number of the page to extract.
        use_textract_only: When truthy, skip the native parser and always
            take the scanned-PDF path.

    Returns:
        A ``(blocks, is_native_pdf)`` tuple. ``blocks`` is whatever
        ``blocks_from_scanned_pdf`` or ``blocks_from_native_pdf`` returned;
        ``is_native_pdf`` is ``True`` only when the native parser was used.

    Raises:
        IndexError: If ``page_num`` is less than 1, or (raised by the page
            lookup itself) greater than the number of pages.
    """
    # Pages are 1-based. Without this guard, page_num == 0 would index
    # pages[-1] and silently return the *last* page (and negative numbers
    # other pages counted from the end) instead of failing.
    if page_num < 1:
        raise IndexError(f"page_num must be >= 1, got {page_num}")

    with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
        page = pdf.pages[page_num - 1]
        width, height = page.width, page.height
        # NOTE(review): is_scanned_pdf presumably decides from the page's
        # embedded images relative to the page size - confirm in its source.
        is_native_pdf = not (
            use_textract_only or is_scanned_pdf(page.images, width, height)
        )
        if is_native_pdf:
            print(f"use_textract_only = {use_textract_only} or Native PDF, getting blocks from pdf parser")
            # The page object is only used while the PDF is still open.
            blocks = blocks_from_native_pdf(page, page_num, width, height)
        else:
            print(f"use_textract_only = {use_textract_only} or Scanned PDF. getting blocks from textract")
            # The scanned path works from the raw bytes and receives the
            # page dimensions as plain floats.
            blocks = blocks_from_scanned_pdf(
                pdf_bytes, page_num, dims=(float(width), float(height))
            )
    return blocks, is_native_pdf