in src/pdf-splitter/main.py [0:0]
def split_pdf(input_bucket, input_file, output_bucket, output_folder, dpi):
    """Render each page of a PDF stored in GCS to a JPEG and upload the pages.

    The input object is downloaded to a private temporary directory, rendered
    with ``convert_from_path``, and every page is uploaded to
    ``gs://{output_bucket}/{output_folder}/page-NNNN.jpg`` where ``NNNN`` is
    the zero-based page index padded to four digits.

    Relies on ``storage_client`` and ``convert_from_path`` being available at
    module level (they are not defined in this function).

    Args:
        input_bucket: Name of the bucket holding the source PDF.
        input_file: Object name of the source PDF inside ``input_bucket``.
        output_bucket: Name of the bucket that receives the page images.
        output_folder: Object-name prefix under which the pages are stored.
        dpi: Passed through as the second positional argument of
            ``convert_from_path`` (presumably the render resolution --
            confirm against the pdf2image version in use).

    Returns:
        A list of the uploaded object names, in page order.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist in
            ``input_bucket``.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import tempfile

    print(f"Downloading file: gs://{input_bucket}/{input_file}")
    source_blob = storage_client.get_bucket(input_bucket).get_blob(input_file)
    if source_blob is None:
        # get_blob() signals a missing object by returning None; fail with a
        # clear message instead of an AttributeError on None further down.
        raise FileNotFoundError(
            f"gs://{input_bucket}/{input_file} does not exist"
        )

    uploaded_images = []
    # All scratch files live in a private directory that is removed when the
    # block exits, including when download, conversion or upload raises.
    with tempfile.TemporaryDirectory() as workdir:
        downloaded_filename = os.path.join(workdir, "input.pdf")
        source_blob.download_to_filename(downloaded_filename)
        print(f"Input file downloaded from GCS to {downloaded_filename}")

        images = convert_from_path(downloaded_filename, dpi)

        destination = storage_client.get_bucket(output_bucket)
        # The directory is private to this call, so one scratch path can be
        # reused for every page; each save overwrites the previous page.
        page_path = os.path.join(workdir, "page.jpg")
        for page_index, image in enumerate(images):
            image.save(page_path, 'JPEG')

            uploaded_filename = f"{output_folder}/page-{page_index:04d}.jpg"
            destination.blob(uploaded_filename).upload_from_filename(page_path)

            uploaded_images.append(uploaded_filename)
            print(f"Image uploaded to gs://{output_bucket}/{uploaded_filename}")

    return uploaded_images