core/drive.py

# Copyright 2025 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Processors for extracting content from Google Drive files.

## Example usage

### Authentication

Auth is handled either by "pre-authorizing" in Colab, e.g.:

```py
from google.colab import auth as colab_auth

colab_auth.authenticate_user()
```

Or, you will need to provide credentials directly to the processor in the
constructor, e.g.:

```py
from google.oauth2 import credentials as oauth2_credentials

creds = oauth2_credentials.Credentials(
    token=YOUR_ACCESS_TOKEN,
    scopes=['https://www.googleapis.com/auth/drive'],
)
p_slides = drive.Slides(creds=creds)
```

### PDF Handling

In the examples below, we pass the PDF bytes directly to the `GenaiModel`,
since the `GenaiModel` will handle the PDF conversion.

If you require more control over the PDF conversion, you could make use of the
`PDFExtract` processor.

## Docs Example

```py
from genai_processors.core import drive

USER_PROMPT = "Describe the document in detail."

p_docs = drive.Docs()
# Or, if you are using credentials:
# p_docs = drive.Docs(creds=creds)

p_preamble = preamble.Preamble(
    content=f'''You are an expert in reviewing Google Docs documents.

You have been provided with a document, and must use it to answer the question.

User question: {USER_PROMPT}'''
)
p_genai = genai_model.GenaiModel(
    model_name='gemini-2.5-flash-preview-05-20',
    api_key=GOOGLE_API_KEY
)

pipeline = (
    p_docs + p_preamble + p_genai
)

req = drive.DocsRequest(doc_id='YOUR-DOC-ID')
req_part = processor.ProcessorPart.from_dataclass(dataclass=req)
input_stream = processor.stream_content([req_part])

async for content_part in pipeline(input_stream):
    print(content_part.text)
```

## Sheets Example

```py
from genai_processors.core import drive

USER_PROMPT = "Describe the spreadsheet in detail."

p_sheets = drive.Sheets()
# Or, if you are using credentials:
# p_sheets = drive.Sheets(creds=creds)

p_preamble = preamble.Preamble(
    content=f'''You are an expert in reviewing Google Sheets spreadsheets.

You have been provided with a spreadsheet, and must use it to answer the
question.

User question: {USER_PROMPT}'''
)
p_genai = genai_model.GenaiModel(
    model_name='gemini-2.5-flash-preview-05-20',
    api_key=GOOGLE_API_KEY
)

pipeline = (
    p_sheets + p_preamble + p_genai
)

req = drive.SheetsRequest(spreadsheet_id='YOUR-SPREADSHEET-ID')
req_part = processor.ProcessorPart.from_dataclass(dataclass=req)
input_stream = processor.stream_content([req_part])

async for content_part in pipeline(input_stream):
    print(content_part.text)
```

## Slides Example

```py
from genai_processors.core import drive

USER_PROMPT = "Describe the presentation in detail."

p_slides = drive.Slides()
# Or, if you are using credentials:
# p_slides = drive.Slides(creds=creds)

p_preamble = preamble.Preamble(
    content=f'''You are an expert in reviewing Google Slides presentations.

You have been provided with slides, and must use them to answer the question.

User question: {USER_PROMPT}'''
)
p_genai = genai_model.GenaiModel(
    model_name='gemini-2.5-flash-preview-05-20',
    api_key=GOOGLE_API_KEY
)

pipeline = (
    p_slides + p_preamble + p_genai
)

req = drive.SlidesRequest(presentation_id='YOUR-PRESENTATION-ID')
req_part = processor.ProcessorPart.from_dataclass(dataclass=req)
input_stream = processor.stream_content([req_part])

async for content_part in pipeline(input_stream):
    print(content_part.text)
```
"""

import csv
import dataclasses
import io
from typing import Any, AsyncIterable

import dataclasses_json
from genai_processors import content_api
from genai_processors import processor
from google.auth import credentials as google_credentials
from googleapiclient import discovery
import pdfrw


# Shared helper functions


def get_drive_pdf(
    file_id: str,
    creds: google_credentials.Credentials | None = None,
) -> bytes:
    """Fetches the requested file as PDF bytes.

    Builds a Drive v3 client and exports the file with the
    `application/pdf` MIME type. The request is executed synchronously.

    Args:
      file_id: The ID of the file to fetch.
      creds: The credentials to use for the request. If not provided, the
        application default credentials will be used.

    Returns:
      The requested file as PDF bytes.
    """
    service = discovery.build('drive', 'v3', credentials=creds)
    return (
        service.files()
        .export(fileId=file_id, mimeType='application/pdf')
        .execute()
    )


def _sheet_to_csv(sheet: dict[str, Any]) -> str:
    """Renders the first grid of a worksheet response as CSV text.

    The first row that has values is used as the header. Every following row
    is truncated or padded with empty strings so that it has exactly as many
    cells as the header. Rows without a `values` entry are dropped.

    Args:
      sheet: One entry of the `sheets` list of a spreadsheet response fetched
        with grid data included.

    Returns:
      The CSV text, using a bare newline as the line terminator.

    Raises:
      KeyError: If the sheet has no `data` or no `rowData` (e.g. an empty
        worksheet).
      IndexError: If the sheet has no grid, or no row contains any values.
    """
    table = [
        [cell.get('formattedValue', '') for cell in row['values']]
        for row in sheet['data'][0]['rowData']
        if 'values' in row
    ]
    header = table[0]
    width = len(header)
    # Slicing trims long rows; a non-positive pad count adds nothing.
    body = [row[:width] + [''] * (width - len(row)) for row in table[1:]]

    output = io.StringIO()
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(header)
    writer.writerows(body)
    return output.getvalue()


# Google Docs


@dataclasses_json.dataclass_json
@dataclasses.dataclass
class DocsRequest:
    """Request for fetching Google Docs data.

    Attributes:
      doc_id: The ID of the document to fetch. (e.g. for a doc with URL
        https://docs.google.com/document/d/foo, the ID is "foo")
    """

    doc_id: str


class Docs(processor.PartProcessor):
    """Processor for extracting PDF content from Google Docs."""

    def __init__(
        self,
        creds: google_credentials.Credentials | None = None,
    ) -> None:
        """Initializes the processor.

        Args:
          creds: The credentials to use for Drive requests. If not provided,
            the application default credentials will be used.
        """
        self._creds = creds

    def match(self, part: content_api.ProcessorPart) -> bool:
        """Returns True if the part carries a serialized `DocsRequest`."""
        return part.mimetype == 'application/json; type=DocsRequest'

    async def call(
        self, part: content_api.ProcessorPart
    ) -> AsyncIterable[content_api.ProcessorPart]:
        """Yields a text label followed by the document exported as PDF.

        Args:
          part: A part holding a `DocsRequest`.

        Yields:
          A 'Document:' text part, then one `application/pdf` part with the
          document bytes.
        """
        google_docs_request = part.get_dataclass(DocsRequest)
        doc_pdf = get_drive_pdf(
            file_id=google_docs_request.doc_id,
            creds=self._creds,
        )
        yield content_api.ProcessorPart('Document:\n\n')
        yield content_api.ProcessorPart(doc_pdf, mimetype='application/pdf')


# Google Sheets


@dataclasses_json.dataclass_json
@dataclasses.dataclass
class SheetsRequest:
    """Request for fetching Google Sheets data.

    Attributes:
      spreadsheet_id: The ID of the spreadsheet to fetch. (e.g. for a
        spreadsheet with URL https://docs.google.com/spreadsheets/d/foo, the
        ID is "foo")
      ranges: Cell ranges to fetch. If not specified, the entire worksheet is
        fetched. For additional details on how to specify the ranges, see the
        `ranges` parameter in
        https://developers.google.com/workspace/sheets/api/reference/rest/v4/spreadsheets/get
      worksheet_names: The names of the worksheets to fetch. If not provided,
        all worksheets will be fetched.
    """

    spreadsheet_id: str
    ranges: list[str] | None = None
    worksheet_names: list[str] | None = None


class Sheets(processor.PartProcessor):
    """Processor for extracting CSV content from Google Sheets."""

    def __init__(
        self,
        creds: google_credentials.Credentials | None = None,
    ) -> None:
        """Initializes the processor.

        Args:
          creds: The credentials to use for Sheets requests. If not provided,
            the application default credentials will be used.
        """
        self._creds = creds

    def _fetch_sheet_data(
        self, sheets_request: SheetsRequest
    ) -> list[dict[str, Any]]:
        """Returns a list of dicts representing sheet data.

        One request is issued per entry in `sheets_request.ranges`; when no
        ranges are given a single request fetches the whole spreadsheet.

        The response dicts contain sheets data with the fields described
        here:
        https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/sheets

        Args:
          sheets_request: The SheetsRequest to fetch.

        Returns:
          A list of response dicts for the given spreadsheet or range, in the
          same order as the requested ranges.
        """
        service = discovery.build('sheets', 'v4', credentials=self._creds)
        spreadsheet_id = sheets_request.spreadsheet_id
        ranges = sheets_request.ranges
        return [
            service.spreadsheets()
            .get(
                spreadsheetId=spreadsheet_id,
                includeGridData=True,
                ranges=r,
            )
            .execute()
            # `[None]` stands for "no range filter": one unrestricted fetch.
            for r in ranges or [None]
        ]

    def match(self, part: content_api.ProcessorPart) -> bool:
        """Returns True if the part carries a serialized `SheetsRequest`."""
        return part.mimetype == 'application/json; type=SheetsRequest'

    async def call(
        self, part: content_api.ProcessorPart
    ) -> AsyncIterable[content_api.ProcessorPart]:
        """Yields each requested worksheet as a label plus a CSV part.

        Worksheets whose title is not listed in `worksheet_names` (when that
        filter is set) are skipped. A worksheet that cannot be converted,
        for instance because it is empty, produces a single
        'Failed to parse sheet data.' text part instead of aborting the
        stream.

        Args:
          part: A part holding a `SheetsRequest`.

        Yields:
          For every converted worksheet, a 'Sheet <range or title>:' text
          part followed by a `text/csv` part.
        """
        google_sheets_request = part.get_dataclass(SheetsRequest)
        spreadsheet_data = self._fetch_sheet_data(google_sheets_request)
        sheet_ranges = google_sheets_request.ranges
        worksheet_names = google_sheets_request.worksheet_names

        for i, res in enumerate(spreadsheet_data):
            for sheet in res['sheets']:
                # Only the parsing is guarded, so that errors raised while
                # the yielded parts are consumed are not reported as bad
                # sheet data.
                try:
                    title = sheet['properties']['title']
                    if (
                        worksheet_names is not None
                        and title not in worksheet_names
                    ):
                        continue
                    csv_text = _sheet_to_csv(sheet)
                except (ValueError, TypeError, IndexError, KeyError):
                    yield content_api.ProcessorPart(
                        'Failed to parse sheet data.'
                    )
                    continue

                # Responses are in the same order as the requested ranges.
                range_or_title = sheet_ranges[i] if sheet_ranges else title
                yield content_api.ProcessorPart(
                    f'Sheet {range_or_title}:\n\n'
                )
                yield content_api.ProcessorPart(
                    csv_text, mimetype='text/csv'
                )


# Google Slides


@dataclasses_json.dataclass_json
@dataclasses.dataclass
class SlidesRequest:
    """Request for fetching Google Slides data.

    Attributes:
      presentation_id: The ID of the presentation to fetch. (e.g. for a
        presentation with URL https://docs.google.com/presentation/d/foo, the
        ID is "foo")
      slide_numbers: The slide numbers to fetch, indexed from 1. If not
        provided, all slides will be fetched.
    """

    presentation_id: str
    slide_numbers: list[int] | None = None


class Slides(processor.PartProcessor):
    """Processor for extracting PDF content from Google Slides."""

    def __init__(
        self,
        creds: google_credentials.Credentials | None = None,
    ) -> None:
        """Initializes the processor.

        Args:
          creds: The credentials to use for Drive requests. If not provided,
            the application default credentials will be used.
        """
        self._creds = creds

    def _get_presentation_pdf_by_slide(
        self,
        presentation_id: str,
        slide_numbers: list[int] | None = None,
    ) -> dict[int, bytes]:
        """Fetches the presentation as a dict of slide numbers to PDF bytes.

        The presentation is exported as one PDF and then split into
        single-page PDFs, one per slide.

        Args:
          presentation_id: The ID of the presentation to fetch.
          slide_numbers: (Optional) A list of slide numbers to fetch (indexed
            from 1). If not provided or empty, all slides will be fetched.
            Numbers beyond the last slide are ignored.

        Returns:
          A dictionary of slide numbers to PDF bytes, in presentation order.
        """
        pdf_bytes = get_drive_pdf(
            file_id=presentation_id,
            creds=self._creds,
        )
        read_pdf = pdfrw.PdfReader(io.BytesIO(pdf_bytes))
        # Build the lookup once; None means "no filter".
        wanted = set(slide_numbers) if slide_numbers else None

        slide_dict: dict[int, bytes] = {}
        for page_index in range(read_pdf.numPages):
            slide_num = page_index + 1
            if wanted is not None and slide_num not in wanted:
                continue
            slide_writer = pdfrw.PdfWriter()
            slide_writer.addPage(read_pdf.getPage(page_index))
            slide_bytes_stream = io.BytesIO()
            slide_writer.write(slide_bytes_stream)
            slide_dict[slide_num] = slide_bytes_stream.getvalue()
        return slide_dict

    def match(self, part: content_api.ProcessorPart) -> bool:
        """Returns True if the part carries a serialized `SlidesRequest`."""
        return part.mimetype == 'application/json; type=SlidesRequest'

    async def call(
        self, part: content_api.ProcessorPart
    ) -> AsyncIterable[content_api.ProcessorPart]:
        """Yields each requested slide as a label plus a single-page PDF.

        Args:
          part: A part holding a `SlidesRequest`.

        Yields:
          For every selected slide, a 'Slide <n>:' text part followed by an
          `application/pdf` part with that slide's bytes.
        """
        google_slides_request = part.get_dataclass(SlidesRequest)
        presentation_pdfs = self._get_presentation_pdf_by_slide(
            presentation_id=google_slides_request.presentation_id,
            slide_numbers=google_slides_request.slide_numbers,
        )
        for slide_num, pdf_bytes in presentation_pdfs.items():
            yield content_api.ProcessorPart(f'Slide {slide_num}:\n\n')
            yield content_api.ProcessorPart(
                pdf_bytes, mimetype='application/pdf'
            )

core/drive.py (159 lines of code) (raw):