in vision/snippets/detect/beta_snippets.py [0:0]
def detect_batch_annotate_files_uri(gcs_uri):
    """Detects document features in a PDF/TIFF/GIF file.

    Sends one batch request that runs DOCUMENT_TEXT_DETECTION on the first
    two pages and the last page of the file, then prints the confidence of
    every block, paragraph, word and symbol found on those pages.

    While your PDF file may have several pages,
    this API can process up to 5 pages only.

    Args:
        gcs_uri: The path to the file in Google Cloud Storage (gs://...)

    Raises:
        RuntimeError: If the service reports an error for the file instead
            of returning page annotations.
    """
    from google.cloud import vision_v1p4beta1 as vision

    client = vision.ImageAnnotatorClient()

    # Other supported mime_types: 'image/tiff' or 'image/gif'
    mime_type = "application/pdf"
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_uri), mime_type=mime_type
    )
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # Annotate the first two pages and the last one (max 5 pages)
    # First page starts at 1, and not 0. Last page is -1.
    pages = [1, 2, -1]

    request = vision.AnnotateFileRequest(
        input_config=input_config, features=[feature], pages=pages
    )

    response = client.batch_annotate_files(requests=[request])

    # Only one request is sent, so at most one file response is expected;
    # iterating avoids an IndexError should the batch come back empty.
    for file_response in response.responses:
        # Per the Vision API reference, a failed file request carries an
        # error status and no page responses. Surface it rather than
        # silently printing nothing.
        if file_response.error.message:
            raise RuntimeError(
                f"{file_response.error.message}\n"
                "For more info on error messages, check: "
                "https://cloud.google.com/apis/design/errors"
            )

        for image_response in file_response.responses:
            for page in image_response.full_text_annotation.pages:
                for block in page.blocks:
                    print(f"\nBlock confidence: {block.confidence}\n")

                    for par in block.paragraphs:
                        print(f"\tParagraph confidence: {par.confidence}")

                        for word in par.words:
                            # A word is delivered as its individual symbols;
                            # join them to recover the readable text.
                            word_text = "".join(
                                symbol.text for symbol in word.symbols
                            )
                            print(
                                f"\t\tWord text: {word_text} "
                                f"(confidence: {word.confidence})"
                            )

                            for symbol in word.symbols:
                                print(
                                    f"\t\t\tSymbol: {symbol.text} "
                                    f"(confidence: {symbol.confidence})"
                                )