in community/pdf-annotator-python/main.py [0:0]
def main(args):
    """Annotate a PDF with the form fields detected by the Document AI API.

    A form processor of type ``args.form_processor_type`` is looked up under
    the project/location parent (and created when none is found). The input
    PDF is sent to it, and every returned form field is attached to the
    matching page as a text annotation reading ``"<name>: <value>"``.

    Args:
        args: Namespace-like object providing ``project_id``,
            ``multi_region_location``, ``form_processor_type``, ``input``
            and ``output``. ``project_id`` and ``output`` are filled in on
            the object when they are empty.

    Returns:
        0 on success, 1 when the input file does not exist.
    """
    input_path = os.path.abspath(args.input)

    # Validate the input before touching any Google API so that a bad path
    # never triggers a credential lookup or creates a processor.
    if not os.path.isfile(input_path):
        print(f"could not find file at {input_path}")
        return 1

    if not args.project_id:
        _, project_id = google.auth.default()
        args.project_id = project_id

    parent = f"projects/{args.project_id}/locations/{args.multi_region_location}"
    client = DocumentProcessorServiceClient()

    processor_id = find_processor_id_of_type(client, parent, args.form_processor_type)
    if processor_id is None:
        print(
            f"no form processor found. "
            f'creating new processor of type "{args.form_processor_type}"',
        )
        processor_id = create_processor(client, parent, args.form_processor_type)

    # If an output path is not specified, write next to the input file.
    if not args.output:
        args.output = _default_output_path(input_path)

    # flush=True so the progress message is visible while the call is running.
    print("Calling Document AI API...", end="", flush=True)
    with open(input_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    document = client.process_document(
        request={
            "name": f"{parent}/processors/{processor_id}",
            "raw_document": {
                "content": pdf_bytes,
                "mime_type": "application/pdf",
            },
        }
    ).document
    # Terminate the progress line so later messages start on their own line.
    print()

    # Keep the source PDF open until the annotated copy has been saved: the
    # copied pages still reference it.
    with pikepdf.Pdf.open(input_path) as original_pdf:
        annotated_pdf = pikepdf.Pdf.new()

        for page_num, page_info in enumerate(document.pages):
            annotated_pdf.pages.append(original_pdf.pages[page_num])
            page = annotated_pdf.pages[page_num]

            print(
                f"Found {len(page_info.form_fields)} form fields on page {page_num + 1}:"
            )

            # Max "x" and "y" coordinates in the PDF's own measuring units,
            # which are what annotation rectangles must be expressed in.
            # NOTE(review): this treats trimbox[2]/[3] as the page width and
            # height, which only holds when the box starts at (0, 0).
            page_max_x = float(page.trimbox[2])
            page_max_y = float(page.trimbox[3])

            page_annotations = []
            for field in page_info.form_fields:
                annotation, annotation_text = _field_annotation(
                    field, document.text, page_max_x, page_max_y
                )
                page_annotations.append(annotation)
                print(f"adding annotation: {annotation_text}")

            # Add all the annotations for this page
            page.Annots = annotated_pdf.make_indirect(pikepdf.Array(page_annotations))

        print(f"Saving annotated PDF to {args.output}.")
        annotated_pdf.save(
            args.output,
            min_version=original_pdf.pdf_version,
            # Disable annotation modification
            encryption=pikepdf.Encryption(
                owner="", user="", allow=pikepdf.Permissions(modify_annotation=False)
            ),
        )

    print("Done.")
    return 0


def _default_output_path(input_path):
    """Return ``<input without .pdf>_annotated.pdf`` for the given input path."""
    # str.rstrip(".pdf") would strip any trailing '.', 'p', 'd' or 'f'
    # characters (e.g. "app.pdf" -> "a"); only drop a real ".pdf" extension.
    root, ext = os.path.splitext(input_path)
    if ext.lower() != ".pdf":
        root = input_path
    return f"{root}_annotated.pdf"


def _field_annotation(field, document_text, page_max_x, page_max_y):
    """Build the PDF text annotation for one form field and return it with its text."""
    # Scale the first two normalized vertices of the field name by the page
    # maxima; the y value is subtracted from the page maximum to flip the axis.
    vertices = field.field_name.bounding_poly.normalized_vertices
    coord1 = vertices[0]
    coord2 = vertices[1]
    rect = [
        coord1.x * page_max_x,
        page_max_y - coord1.y * page_max_y,
        coord2.x * page_max_x,
        page_max_y - coord2.y * page_max_y,
    ]

    # Extract the parsed name and value of the field as returned by the API
    name = layout_to_text(field.field_name, document_text)
    value = layout_to_text(field.field_value, document_text)
    annotation_text = f"{name}: {value}"

    annotation = pikepdf.Dictionary(
        Type=pikepdf.Name.Annot,
        Subtype=pikepdf.Name.Text,
        Rect=rect,
        Name=pikepdf.Name.Note,
        Contents=pikepdf.String(annotation_text),
        Open=False,
    )
    return annotation, annotation_text