webhook/utils.py (38 lines of code) (raw):

# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from google.cloud import logging import datetime import re ABSTRACT_LENGTH = 150 * 8 # Abstract recommended max word length * avg 8 letters long CONCLUSION_LENGTH = 200 * 8 # Conclusion max word length * avg 8 letters long ABSTRACT_H1 = "abstract" CONCLUSION_H1 = "conclusion" CONTENT_ERROR_MESSAGE = """ Uploaded PDF doesn't contain an abstract or conclusion paragraph. The document summarization pipeline will attempt a best effort at summarizing the PDF. Your results might vary in quality. For best results, use a single-column, academic paper that contains both a labeled 'Abstract' and 'Conclusion' section. """ def coerce_datetime_zulu(input_datetime: datetime.datetime): """Force datetime into specific format. Args: input_datetime (datetime.datetime): the datetime to coerce """ regex = re.compile(r"(.*)(Z$)") regex_match = regex.search(input_datetime) if regex_match: assert input_datetime.startswith(regex_match.group(1)) assert input_datetime.endswith(regex_match.group(2)) return datetime.datetime.fromisoformat(f"{input_datetime[:-1]}+00:00") raise RuntimeError( "The input datetime is not in the expected format. " 'Please check format of the input datetime. Expected "Z" at the end' ) def truncate_complete_text(complete_text: str, logger_name: str) -> str: """Extracts the abstract and conclusion from an academic paper. Uses a heuristics to approximate the extent of the abstract and conclusion. For abstract: assumes beginning after the string `abstract` and extends for 6-7 sentences For conclusion: assumes beginning after the string `conclusion` and extends for 7-9 sentences #56 : Improve this function Args: complete_text (str): the complete text of the academic paper Returns str: the truncated paper """ complete_text = complete_text.lower() abstract_start = complete_text.find(ABSTRACT_H1) # If no "Abstract" heading found, produce the entire text if abstract_start == -1: abstract_start = 0 log_content_error(logger_name=logger_name) conclusion_start = complete_text.find(CONCLUSION_H1) # If no "Conclusion" heading found, produce the last little bit # of the text if conclusion_start == -1: conclusion_start = len(complete_text) - (CONCLUSION_LENGTH) log_content_error(logger_name=logger_name) abstract = complete_text[abstract_start:ABSTRACT_LENGTH] conclusion = complete_text[conclusion_start:] if len(conclusion) > CONCLUSION_LENGTH: conclusion = conclusion[:CONCLUSION_LENGTH] return f""" Abstract: {abstract} Conclusion: {conclusion} """ def log_content_error(logger_name: str): logging_client = logging.Client() logger = logging_client.logger(logger_name) logger.log(CONTENT_ERROR_MESSAGE, severity="WARNING")