def blocks_from_scanned_pdf()

in src/pre_human_task_lambda.py [0:0]


def blocks_from_scanned_pdf(pdf_bytes, page_number, dims=None):
    """Build the converted LINE and WORD blocks for one scanned PDF page.

    The page is passed through ``resize_and_convert_to_bytes`` and the
    resulting bytes are handed to ``analyze_document``. Every LINE block in
    the response is converted with ``textract_block_to_block`` and is
    immediately followed in the output by the converted WORD blocks whose
    ids are listed in the line's first relationship.

    Args:
        pdf_bytes: PDF content, forwarded unchanged to
            ``resize_and_convert_to_bytes``.
        page_number: Page to process; also passed to every
            ``textract_block_to_block`` call and used in the failure message.
        dims: Optional value forwarded as ``dims=`` to
            ``resize_and_convert_to_bytes``.

    Returns:
        The converted blocks, each line followed by its words. Each block is
        converted with its position in this list as its index, and word
        blocks additionally receive the index of their parent line. An empty
        list is returned when analysis or conversion raises an ``Exception``
        (the error is printed, not re-raised).

    Note:
        Errors raised by ``resize_and_convert_to_bytes`` happen outside the
        ``try`` and therefore propagate to the caller.
    """
    image_bytes = resize_and_convert_to_bytes(pdf_bytes, page_number, dims=dims)
    print(f"len of png_byte_value = {len(image_bytes)}")
    try:
        raw_blocks = analyze_document(image_bytes)['Blocks']

        # Split the response into LINE and WORD blocks; other types are ignored.
        raw_lines = []
        raw_words = []
        for raw in raw_blocks:
            kind = raw['BlockType']
            if kind == 'LINE':
                raw_lines.append(raw)
            elif kind == 'WORD':
                raw_words.append(raw)

        print("== Textract blocks ==")
        print(f"number of total textract blocks = {len(raw_blocks)}")
        print(f"number of textract line blocks = {len(raw_lines)}")
        print(f"number of textract word blocks = {len(raw_words)}")

        # Lookup table so a line's child ids resolve to word blocks directly.
        words_by_id = {word['Id']: word for word in raw_words}

        # A block's index is its position in `blocks`, so len(blocks) is
        # always the index of the block about to be converted and appended.
        blocks = []
        for raw_line in raw_lines:
            line_index = len(blocks)
            line_block = textract_block_to_block(page_number, raw_line, line_index)
            blocks.append(line_block)

            if not line_block.Relationships:
                continue

            # NOTE(review): only the first relationship is consulted and its
            # type is not checked — presumably it is the CHILD one; confirm.
            for word_id in line_block.Relationships[0].Ids:
                raw_word = words_by_id[word_id]
                word_block = textract_block_to_block(
                    page_number, raw_word, len(blocks), line_index
                )
                blocks.append(word_block)

        # Both counts are computed before anything is printed.
        line_count = sum(1 for block in blocks if block.BlockType == 'LINE')
        word_count = sum(1 for block in blocks if block.BlockType == 'WORD')

        print(" == Blocks after conversion== ")
        print(f"number of after conversion blocks = {len(blocks)}")
        print(f"number of after conversion line blocks = {line_count}")
        print(f"number of after conversion word blocks = {word_count}")
        return blocks
    except Exception as err:
        # Best-effort: report the failure and fall back to an empty result.
        print(f"Failed to analyze {page_number} due to {err}")
        return []