in seed/util/preprocess.py [0:0]
def analyze_pdf_page_content(pdf_path, text_length_thres=600):
    """Group the pages of a PDF by the kind of content they hold.

    Each page is classified from two measurements: the number of
    characters in its extracted text and the number of images it
    references.

    * ``'Text'``  -- more than ``text_length_thres`` characters and no images.
    * ``'Image'`` -- at most ``text_length_thres`` characters and at least
      one image.
    * ``'Mixed'`` -- everything else. This is a catch-all: it covers pages
      with long text *and* images, but also pages with short text and no
      images (e.g. nearly empty pages).

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        text_length_thres: Character-count threshold separating "long"
            from "short" page text.

    Returns:
        A plain ``dict`` mapping a content type (``'Text'``, ``'Image'``
        or ``'Mixed'``) to the list of 0-based page numbers of that type,
        in ascending order. Content types with no pages are absent, so an
        empty document yields ``{}``.
    """
    page_analysis = defaultdict(list)

    # Open the document as a context manager so the underlying file is
    # released even if text or image extraction raises part-way through.
    with fitz.open(pdf_path) as document:
        for page_num in range(len(document)):
            page = document.load_page(page_num)
            text_length = len(page.get_text("text"))
            num_images = len(page.get_images(full=True))

            if text_length > text_length_thres and num_images == 0:
                content_type = 'Text'
            elif text_length <= text_length_thres and num_images > 0:
                content_type = 'Image'
            else:
                content_type = 'Mixed'

            page_analysis[content_type].append(page_num)

    # Return a regular dict so callers do not create keys by mere lookup.
    return dict(page_analysis)