def blocks_from_native_pdf()

in src/pre_human_task_lambda.py [0:0]


def blocks_from_native_pdf(page, page_num, page_width, page_height):
    """Build the block list for one page of a native (text-based) PDF.

    The page text returned by ``page.extract_text()`` is split into
    non-empty, stripped lines. The word tokens returned by
    ``page.extract_words()`` are then walked in order and aligned against
    the whitespace-separated words of those lines, so that the tokens end
    up grouped per text line. Each group is handed to
    ``plumber_line_to_blocks`` and the blocks it returns are concatenated.

    Args:
        page: Object exposing ``extract_text()`` and ``extract_words()``;
            each extracted word must be subscriptable with a ``'text'``
            key. (Presumably a pdfplumber page -- confirm against caller.)
        page_num: Forwarded unchanged to ``plumber_line_to_blocks``.
        page_width: Forwarded unchanged to ``plumber_line_to_blocks``.
        page_height: Forwarded unchanged to ``plumber_line_to_blocks``.

    Returns:
        A list with the blocks of every token group, in order; empty when
        the page has no non-blank text line or no token could be grouped.
    """
    raw_text = page.extract_text() or ''
    text_lines = []
    for raw_line in raw_text.split('\n'):
        trimmed = raw_line.strip()
        if trimmed:
            text_lines.append(trimmed)
    if not text_lines:
        return []

    tokens = page.extract_words()
    last_line_no = len(text_lines) - 1

    grouped_tokens = []  # finished token groups, one per text line
    current_group = []   # tokens collected for the line being matched

    line_no = 0
    line_words = text_lines[line_no].split()
    word_no = 0
    word_offset = 0   # characters of the current line word already matched
    token_offset = 0  # characters of the current token already matched
    token_no = 0

    while token_no < len(tokens):
        token = tokens[token_no]
        target_word = line_words[word_no]
        word_rest = target_word[word_offset:]
        token_rest = token['text'][token_offset:]
        found_at = word_rest.find(token_rest)
        matched = found_at >= 0

        if matched:
            # The (rest of the) token lies inside the line word,
            # e.g. line word "word__in__line", token "word".
            if word_rest == token['text'][:token_offset]:
                word_offset = len(target_word)
            else:
                word_offset += found_at + len(token_rest)
            current_group.append(token)
            word_finished = word_offset == len(target_word)
        else:
            # The line word is only a piece of the token,
            # e.g. line word "word", token "word__in___line":
            # skip over this line word and retry with the next one.
            token_offset += len(target_word)
            word_finished = True

        if word_finished:
            if word_no < len(line_words) - 1:
                word_no += 1
            elif line_no < last_line_no:
                line_no += 1
                line_words = text_lines[line_no].split()
                word_no = 0
                if current_group:
                    grouped_tokens.append(current_group)
                    current_group = []
            if matched:
                # Only a successful match restarts the in-word offset;
                # the skip path deliberately leaves it untouched.
                word_offset = 0

        if matched:
            token_no += 1
            token_offset = 0

    if current_group:
        grouped_tokens.append(current_group)

    blocks = []
    block_index = -1
    for token_group in grouped_tokens:
        group_blocks, block_index = plumber_line_to_blocks(
            page_num, token_group, block_index, page_width, page_height
        )
        blocks.extend(group_blocks)
    return blocks