def concatenate_images_into_pdf()

in src/pdf-merger/main.py [0:0]


def concatenate_images_into_pdf(files_bucket, files_to_concatenate,
                                output_bucket, output_file):
    """Merge images stored in a bucket into one searchable PDF and upload it.

    Each image is downloaded to a temporary local file, converted to a
    single searchable PDF page with Tesseract, and appended to a merged
    document. The merged PDF is uploaded to ``output_bucket`` as
    ``output_file``. All local temporary files are removed before returning,
    including when a step fails.

    Args:
        files_bucket: Bucket identifier passed to ``storage_client.get_bucket``
            for the source images.
        files_to_concatenate: Dict mapping a sortable key to an image blob
            name, e.g. ``{1: file1, 2: file2}``. Pages are merged in ascending
            key order. Blob names are stripped of surrounding whitespace and
            blank entries are skipped.
        output_bucket: Bucket identifier passed to
            ``storage_client.get_bucket`` for the destination.
        output_file: Blob name for the merged PDF in ``output_bucket``.

    Returns:
        A dict with ``full_file`` (``gs://`` URI of the uploaded PDF),
        ``file`` (``output_file``) and ``bucket`` (``output_bucket``).

    Raises:
        FileNotFoundError: If a listed image does not exist in the source
            bucket.
    """
    print(
        f"Images to concatenate: {files_to_concatenate} from bucket: {files_bucket}"
    )

    # Resolved lazily on the first real image so that the bucket is looked up
    # at most once, and not at all when there is nothing to download.
    source_bucket = None

    pdf_merger = PdfFileMerger()
    searchable_concatenated_pdf = f"{uuid.uuid4()}.pdf"
    try:
        print("Sorting files")
        # The images come as a dict { 1:file1, 2:file2, 3:file3 }, so iterate
        # the keys in sorted order to concatenate pages in the right order.
        # NOTE(review): if the keys arrive as strings (e.g. from JSON), this
        # sorts lexicographically ("10" before "2") - confirm with the caller.
        for key in sorted(files_to_concatenate):
            file = files_to_concatenate[key]
            print(file)
            file = file.strip()
            # Skip blank entries: they are not valid image filenames.
            if not file:
                continue

            if source_bucket is None:
                source_bucket = storage_client.get_bucket(files_bucket)

            temp_image_name = f"{uuid.uuid4()}.jpg"
            searchable_page_pdf_file = f"{uuid.uuid4()}.pdf"
            try:
                # Download image
                blob = source_bucket.get_blob(file)
                if blob is None:
                    # get_blob returns None for a missing object; fail with a
                    # clear message instead of an AttributeError on None.
                    raise FileNotFoundError(
                        f"File {file} not found in bucket {files_bucket}")
                blob.download_to_filename(temp_image_name)
                print(f"Downloaded file {file}")

                # Make image searchable and add it to the main PDF
                searchable_page_pdf = pytesseract.image_to_pdf_or_hocr(
                    temp_image_name, extension='pdf')
                with open(searchable_page_pdf_file, 'w+b') as f:
                    f.write(searchable_page_pdf)
                pdf_merger.append(searchable_page_pdf_file)
            finally:
                # Cleanup local files even when download/OCR/merge fails.
                for path in (temp_image_name, searchable_page_pdf_file):
                    if os.path.exists(path):
                        os.remove(path)

        # Write searchable pdf to disk
        pdf_merger.write(searchable_concatenated_pdf)

        # Upload final concatenated PDF to bucket
        destination_bucket = storage_client.get_bucket(output_bucket)
        out_blob = destination_bucket.blob(output_file)
        out_blob.upload_from_filename(searchable_concatenated_pdf)
        print(
            f"Searchable concatenated PDF uploaded to: gs://{output_bucket}/{output_file}"
        )
    finally:
        # Release the file handles the merger holds for appended pages, then
        # remove the merged PDF from local disk.
        pdf_merger.close()
        if os.path.exists(searchable_concatenated_pdf):
            os.remove(searchable_concatenated_pdf)

    # result
    return {
        "full_file": f"gs://{output_bucket}/{output_file}",
        "file": output_file,
        "bucket": output_bucket
    }