in src/pre_human_task_lambda.py [0:0]
def _next_text_word(lines, line_idx, line_words, word_idx):
    """Move the text cursor one word forward.

    Returns ``(line_idx, line_words, word_idx, line_done)``. ``line_done`` is
    True when the word being left was the last one of its text line; the
    cursor then moves to the first word of the next line, or stays where it
    is when there is no further line.
    """
    if word_idx < len(line_words) - 1:
        return line_idx, line_words, word_idx + 1, False
    if line_idx < len(lines) - 1:
        line_idx += 1
        return line_idx, lines[line_idx].split(), 0, True
    return line_idx, line_words, word_idx, True


def _group_tokens_by_text_line(lines, tokens):
    """Group word tokens into lists that follow the given text lines.

    ``lines`` are non-empty, stripped text lines; ``tokens`` are dicts with a
    ``'text'`` key. Tokens are consumed in order while a cursor walks the
    whitespace-separated words of ``lines``; a token and a text word may
    cover each other only partially, so character offsets are kept for both.

    NOTE(review): the collected line is flushed whenever the last word of a
    text line is passed, including on the final text line. The nesting was
    reconstructed from a copy without indentation -- confirm against upstream.
    """
    line_idx = 0
    line_words = lines[line_idx].split()
    word_idx = 0
    word_offset = 0   # characters of the current text word already matched
    token_offset = 0  # characters of the current token covered by skipped text words
    grouped = []
    current = []
    token_idx = 0
    while token_idx < len(tokens):
        token = tokens[token_idx]
        token_text = token['text']
        word = line_words[word_idx]
        found_at = word[word_offset:].find(token_text[token_offset:])
        if found_at > -1:
            # token is a sub-part of the text word,
            # ex. text: "word__in__line", token: "word"
            if word[word_offset:] == token_text[:token_offset]:
                # The rest of the text word equals the token prefix that was
                # already skipped over: count the text word as fully consumed.
                word_offset = len(word)
            else:
                word_offset += found_at + len(token_text[token_offset:])
            current.append(token)
            if word_offset == len(word):
                line_idx, line_words, word_idx, line_done = _next_text_word(
                    lines, line_idx, line_words, word_idx)
                if line_done:
                    grouped.append(current)
                    current = []
                word_offset = 0
            token_idx += 1
            token_offset = 0
        else:
            # text word is a sub-part of the token,
            # ex. text: "word", token: "word__in___line".
            # Skip the text word and retry the same token against the next
            # one; token_offset grows on every retry, so the remaining token
            # text eventually becomes empty and the match above succeeds.
            token_offset += len(word)
            line_idx, line_words, word_idx, line_done = _next_text_word(
                lines, line_idx, line_words, word_idx)
            if line_done and current:
                grouped.append(current)
                current = []
    if current:
        grouped.append(current)
    return grouped


def blocks_from_native_pdf(page, page_num, page_width, page_height):
    """Return a list of blocks from a native PDF.

    Args:
        page: object exposing ``extract_text()`` and ``extract_words()``
            (presumably a pdfplumber page -- confirm against the caller).
            ``extract_words()`` must yield dicts with a ``'text'`` key.
        page_num: forwarded unchanged to ``plumber_line_to_blocks``.
        page_width: forwarded unchanged to ``plumber_line_to_blocks``.
        page_height: forwarded unchanged to ``plumber_line_to_blocks``.

    Returns:
        The concatenation of the block lists produced by
        ``plumber_line_to_blocks`` for each reconstructed line, or an empty
        list when the page text has no non-blank line.
    """
    blocks = []
    text = page.extract_text()
    stripped_lines = (raw_line.strip() for raw_line in (text or '').split('\n'))
    lines = [line for line in stripped_lines if line]
    if not lines:
        return blocks

    # The running block index starts at -1 and is threaded through every
    # call so that each line continues from the index the previous one returned.
    block_index = -1
    for token_line in _group_tokens_by_text_line(lines, page.extract_words()):
        line_and_word_blocks, block_index = plumber_line_to_blocks(
            page_num, token_line, block_index, page_width, page_height)
        blocks.extend(line_and_word_blocks)
    return blocks