bots/sdlc-slackbot/sdlc_slackbot/gdoc.py (75 lines of code) (raw):

from __future__ import print_function import os.path import re from logging import getLogger from google.auth.transport.requests import Request from google.oauth2.credentials import Credentials from google_auth_oauthlib.flow import InstalledAppFlow from googleapiclient.discovery import build from googleapiclient.errors import HttpError # If modifying these scopes, delete the file token.json. SCOPES = ["https://www.googleapis.com/auth/documents.readonly"] logger = getLogger(__name__) def read_paragraph_element(element): """Returns the text in the given ParagraphElement. Args: element: a ParagraphElement from a Google Doc. """ text_run = element.get("textRun") if not text_run: return "" return text_run.get("content") def read_structural_elements(elements): """Recurses through a list of Structural Elements to read a document's text where text may be in nested elements. Args: elements: a list of Structural Elements. """ text = "" for value in elements: if "paragraph" in value: elements = value.get("paragraph").get("elements") for elem in elements: text += read_paragraph_element(elem) elif "table" in value: # The text in table cells are in nested Structural Elements and tables may be # nested. table = value.get("table") for row in table.get("tableRows"): cells = row.get("tableCells") for cell in cells: text += read_structural_elements(cell.get("content")) elif "tableOfContents" in value: # The text in the TOC is also in a Structural Element. toc = value.get("tableOfContents") text += read_structural_elements(toc.get("content")) return text def gdoc_creds(): creds = None # The file token.json stores the user's access and refresh tokens, and is # created automatically when the authorization flow completes for the first # time. creds_path = "./bots/sdlc-slackbot/sdlc_slackbot/" if os.path.exists(creds_path + "token.json"): creds = Credentials.from_authorized_user_file(creds_path + "token.json", SCOPES) # If there are no (valid) credentials available, let the user log in. if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) else: flow = InstalledAppFlow.from_client_secrets_file( creds_path + "credentials.json", SCOPES ) creds = flow.run_local_server(port=0) # Save the credentials for the next run with open(creds_path + "token.json", "w") as token: token.write(creds.to_json()) return creds def gdoc_get(gdoc_url): # https://docs.google.com/document/d/<ID>/edit result = None logger.info(gdoc_url) if not gdoc_url.startswith("https://docs.google.com/document") and not gdoc_url.startswith( "docs.google.com/document" ): logger.error("Invalid google doc url") return result # This regex captures the ID after "/d/" and before an optional "/edit", "/" or the end of the string. pattern = r"/d/([^/]+)" match = re.search(pattern, gdoc_url) if match: document_id = match.group(1) logger.info(document_id) else: logger.error("No ID found in the URL") return result creds = gdoc_creds() try: service = build("docs", "v1", credentials=creds) # Retrieve the documents contents from the Docs service. document = service.documents().get(documentId=document_id).execute() logger.info("The title of the document is: {}".format(document.get("title"))) doc_content = document.get("body").get("content") result = read_structural_elements(doc_content) except HttpError as err: logger.error(err) return result