in src/pdf-merger/main.py [0:0]
def concatenate_images_into_pdf(files_bucket, files_to_concatenate,
                                output_bucket, output_file):
    """Merge page images from a bucket into one searchable PDF and upload it.

    Each listed image is downloaded, converted by Tesseract into a
    single-page searchable PDF, and appended to the merged document. Pages
    are appended in ascending order of the dict keys.

    Args:
        files_bucket: Bucket name passed to ``storage_client.get_bucket`` to
            locate the source images.
        files_to_concatenate: Dict mapping a sortable page key to an image
            blob name, e.g. ``{1: file1, 2: file2, 3: file3}``. Entries whose
            name is empty after stripping whitespace are skipped.
            NOTE(review): keys are sorted with plain ``sorted``; string keys
            sort lexicographically ("10" before "2") -- confirm callers pass
            integer (or zero-padded) keys.
        output_bucket: Bucket name that receives the merged PDF.
        output_file: Blob name of the merged PDF inside ``output_bucket``.

    Returns:
        A dict with the keys ``"full_file"`` (``gs://`` URL of the upload),
        ``"file"`` (``output_file``) and ``"bucket"`` (``output_bucket``).

    Raises:
        FileNotFoundError: If a listed image is not present in
            ``files_bucket``.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    print(
        f"Images to concatenate: {files_to_concatenate} from bucket: {files_bucket}"
    )

    # Resolved lazily on the first real download so that, as before, no
    # bucket lookup happens when every entry is skipped. It is kept in its
    # own variable instead of rebinding the ``files_bucket`` parameter.
    source_bucket = None

    # Every intermediate file lives in a private temp directory, so nothing
    # is left behind if the download, OCR, merge or upload step raises.
    with tempfile.TemporaryDirectory() as work_dir:
        pdf_merger = PdfFileMerger()
        try:
            print("Sorting files")
            # The images come as a dict { 1:file1, 2:file2, 3:file3 }, so
            # iterate in key order to concatenate pages in the right order.
            for _, file in sorted(files_to_concatenate.items()):
                print(file)
                file = file.strip()
                # Skip entries that do not carry a usable filename.
                if not file:
                    continue

                # Download image
                if source_bucket is None:
                    source_bucket = storage_client.get_bucket(files_bucket)
                blob = source_bucket.get_blob(file)
                if blob is None:
                    raise FileNotFoundError(
                        f"Image {file!r} not found in bucket {files_bucket!r}"
                    )
                image_path = os.path.join(work_dir, f"{uuid.uuid4()}.jpg")
                blob.download_to_filename(image_path)
                print(f"Downloaded file {file}")

                # Make image searchable and add it to the main PDF
                page_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
                    image_path, extension='pdf')
                # The source image is no longer needed once OCR is done;
                # drop it early to keep the temp directory small.
                os.remove(image_path)

                page_pdf_path = os.path.join(work_dir, f"{uuid.uuid4()}.pdf")
                with open(page_pdf_path, 'wb') as page_pdf:
                    page_pdf.write(page_pdf_bytes)
                # The page PDF stays on disk until the merged document has
                # been written: it is removed together with the temp
                # directory instead of right after ``append``.
                pdf_merger.append(page_pdf_path)

            # Write searchable pdf to disk and upload to GCS
            merged_pdf_path = os.path.join(work_dir, f"{uuid.uuid4()}.pdf")
            pdf_merger.write(merged_pdf_path)

            # Upload final concatenated PDF to bucket
            destination_bucket = storage_client.get_bucket(output_bucket)
            out_blob = destination_bucket.blob(output_file)
            out_blob.upload_from_filename(merged_pdf_path)
            print(
                f"Searchable concatenated PDF uploaded to: gs://{output_bucket}/{output_file}"
            )
        finally:
            # Release the merger's file handles before the temp directory
            # (and the page PDFs inside it) is deleted.
            pdf_merger.close()

    # result
    return {
        "full_file": f"gs://{output_bucket}/{output_file}",
        "file": output_file,
        "bucket": output_bucket,
    }