in 4-mmrag_tooluse/mmrag_bh.py [0:0]
def process_folder(folder: str, base64_output_folder: str) -> List[Dict[str, str]]:
    """
    Collect per-page base64 image records for every quarter-tagged PDF in a folder.

    Each direct entry of ``folder`` whose name ends in ``.pdf`` (compared
    case-insensitively) is searched for a quarter tag matching
    ``Q[1-4]`` followed by two digits. A PDF that carries such a tag is passed
    to ``pdf_to_base64_images``; every image it returns is handed to
    ``save_base64_image`` under the name ``<pdf stem>_page_<index>.txt``,
    where ``index`` is the zero-based position of the image in that result.
    A PDF without a quarter tag is skipped and a warning is logged.

    Args:
        folder: Directory to scan. Only its direct entries are examined.
        base64_output_folder: Destination folder forwarded unchanged to
            ``save_base64_image``.

    Returns:
        A list with one dict per saved page, holding the keys
        ``quarter_info`` (the matched tag), ``base64_image_path`` (whatever
        ``save_base64_image`` returned -- presumably the written file's path;
        confirm against that helper) and ``original_pdf_path``. Records are
        ordered by PDF filename, then by page index.
    """
    images_data = []
    # Compile once; the same pattern is applied to every filename.
    quarter_regex = re.compile(r'Q[1-4]\d{2}')

    # os.listdir() yields entries in arbitrary order; sorting makes the
    # returned records reproducible across runs and filesystems.
    for file in sorted(os.listdir(folder)):
        # Compare case-insensitively so "REPORT.PDF" is not silently dropped.
        if not file.lower().endswith(".pdf"):
            continue

        match = quarter_regex.search(file)
        if match is None:
            logger.warning(
                f"No quarter information found in filename: {file}")
            continue

        quarter_info = match.group()
        pdf_path = os.path.join(folder, file)
        # The stem is the same for every page of this PDF.
        pdf_stem = os.path.splitext(file)[0]

        for i, base64_image in enumerate(pdf_to_base64_images(pdf_path)):
            base64_filename = f"{pdf_stem}_page_{i}.txt"
            base64_path = save_base64_image(
                base64_image, base64_output_folder, base64_filename)
            images_data.append({
                'quarter_info': quarter_info,
                'base64_image_path': base64_path,
                'original_pdf_path': pdf_path,
            })

    return images_data