in src/pre_human_task_lambda.py [0:0]
def blocks_from_scanned_pdf(pdf_bytes, page_number, dims=None):
    """Analyze one page of a scanned PDF and return its converted blocks.

    The page is first turned into a byte string by
    ``resize_and_convert_to_bytes`` and handed to ``analyze_document``.
    Every LINE block in the response is converted with
    ``textract_block_to_block`` and is followed in the output by the
    converted WORD blocks referenced by the line's first relationship.
    Each converted block is given its position in the output list as its
    index; word blocks are additionally given the index of their line.

    Any exception raised while analyzing or converting is printed and an
    empty list is returned instead. Exceptions raised while producing the
    page bytes are not caught here.
    """
    page_image = resize_and_convert_to_bytes(pdf_bytes, page_number, dims=dims)
    print(f"len of png_byte_value = {len(page_image)}")
    try:
        raw_blocks = analyze_document(page_image)['Blocks']

        # Partition the response into the two block types used below.
        raw_lines = []
        raw_words = []
        for raw in raw_blocks:
            block_type = raw['BlockType']
            if block_type == 'LINE':
                raw_lines.append(raw)
            elif block_type == 'WORD':
                raw_words.append(raw)

        print("== Textract blocks ==")
        print(f"number of total textract blocks = {len(raw_blocks)}")
        print(f"number of textract line blocks = {len(raw_lines)}")
        print(f"number of textract word blocks = {len(raw_words)}")

        # Lookup table so a line's child ids resolve to word blocks directly.
        words_by_id = {raw['Id']: raw for raw in raw_words}

        blocks = []
        for raw_line in raw_lines:
            # A block's index is its position in the output list, so the
            # next index is always the current length of that list.
            line_index = len(blocks)
            line_block = textract_block_to_block(page_number, raw_line, line_index)
            blocks.append(line_block)

            if not line_block.Relationships:
                continue

            # Only the first relationship is consulted for the line's words.
            for word_id in line_block.Relationships[0].Ids:
                word_index = len(blocks)
                raw_word = words_by_id[word_id]
                blocks.append(
                    textract_block_to_block(page_number, raw_word, word_index, line_index)
                )

        converted_line_count = sum(1 for b in blocks if b.BlockType == 'LINE')
        converted_word_count = sum(1 for b in blocks if b.BlockType == 'WORD')
        print(" == Blocks after conversion== ")
        print(f"number of after conversion blocks = {len(blocks)}")
        print(f"number of after conversion line blocks = {converted_line_count}")
        print(f"number of after conversion word blocks = {converted_word_count}")
        return blocks
    except Exception as e:
        # Best-effort: a page that cannot be processed yields no blocks.
        print(f"Failed to analyze {page_number} due to {e}")
        return []