gemini/use-cases/applying-llms-to-data/gemini-and-documentai-for-entity-extraction/entity_processor.py (40 lines of code) (raw):

from abc import ABC, abstractmethod
import json
import mimetypes
from typing import Dict

from google.cloud import documentai
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part

# The only MIME type ModelBasedEntityExtractor accepts for its input file.
_PDF_MIME_TYPE = "application/pdf"
_FENCE = "```"


def _strip_code_fence(text: str) -> str:
    """Return *text* without an enclosing Markdown code fence, if it has one.

    Only a fence that wraps the whole payload is removed: an opening line
    that starts with three backticks (with or without a language tag such as
    ``json``) and a closing three-backtick marker at the end. Text that is
    not fenced is returned with surrounding whitespace stripped.
    """
    stripped = text.strip()
    if not stripped.startswith(_FENCE):
        return stripped
    # Drop the entire opening fence line, whatever language tag it carries.
    _, _, body = stripped.partition("\n")
    body = body.rstrip()
    if body.endswith(_FENCE):
        body = body[: -len(_FENCE)]
    return body.strip()


class EntityExtractor(ABC):
    """Abstract Base Class for entity extraction."""

    @abstractmethod
    def extract_entities(self) -> Dict:
        """Extract entities and return them as a dictionary."""


class DocumentAIEntityExtractor(EntityExtractor):
    """Entity extractor that reads entities from a Document AI document."""

    def __init__(self, document: documentai.Document) -> None:
        """Store the already-processed document to read entities from.

        Args:
            document: Document whose ``entities`` will be read.
        """
        self.document = document

    def extract_entities(self) -> Dict:
        """Map each entity's ``type_`` to its ``mention_text``.

        Returns:
            A dictionary keyed by entity type. When several entities share
            the same type, the last one in ``document.entities`` wins.
        """
        return {
            entity.type_: entity.mention_text for entity in self.document.entities
        }


class ModelBasedEntityExtractor(EntityExtractor):
    """Entity extractor that prompts a Gemini model with a PDF file."""

    def __init__(self, model_version: str, prompt: str, file_path: str) -> None:
        """Validate the input file and configure the generative model.

        Args:
            model_version: Model name passed to ``GenerativeModel``.
            prompt: Instruction text sent to the model alongside the PDF.
            file_path: Location of the PDF; it is later handed to
                ``Part.from_uri``, so presumably a Cloud Storage URI
                (TODO confirm against callers).

        Raises:
            ValueError: If the MIME type guessed from ``file_path`` is not
                ``application/pdf`` (including when it cannot be guessed).
        """
        # Validate first so that bad input fails before any model object
        # is constructed.
        mime_type = mimetypes.guess_type(file_path)[0]
        if mime_type != _PDF_MIME_TYPE:
            raise ValueError("Only PDF files are supported, aborting")

        self.config = GenerationConfig(
            temperature=0.0,
            top_p=0.8,
            top_k=32,
            candidate_count=1,
            max_output_tokens=2048,
            response_mime_type="application/json",
        )
        self.model = GenerativeModel(model_version, generation_config=self.config)
        self.prompt = prompt
        self.file_path = file_path

    def extract_entities(self) -> Dict:
        """Send the PDF and prompt to the model and parse its JSON reply.

        Returns:
            The value decoded from the model's JSON response.

        Raises:
            json.JSONDecodeError: If the response text is not valid JSON
                once any enclosing Markdown code fence has been removed.
        """
        pdf_file = Part.from_uri(self.file_path, mime_type=_PDF_MIME_TYPE)
        response = self.model.generate_content([pdf_file, self.prompt])
        # The reply may still arrive wrapped in a Markdown fence even though
        # a JSON response MIME type was requested, so unwrap it first.
        return json.loads(_strip_code_fence(response.text))