webhook/bigquery.py:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import datetime
from typing import Mapping, Sequence

from google.cloud import bigquery


def write_summarization_to_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    bucket: str,
    filename: str,
    complete_text: str,
    complete_text_uri: str,
    summary: str,
    summary_uri: str,
    timestamp: datetime,
) -> Sequence[Mapping]:
"""Updates the BigQuery table with the document summarization
Original sample is here:
https://cloud.google.com/bigquery/docs/samples/bigquery-table-insert-rows-explicit-none-insert-ids
Args:
project_id (str): the Google Cloud project ID
dataset_id (str): the name of the BigQuery dataset
table_id (str): the name of the BigQuery table
bucket (str): the name of the bucket with the PDF
filename (str): path of PDF relative to bucket root
complete_text (str): the complete text of the PDF
complete_text_uri (str): the Storage URI of the complete TXT document
summary (str): the text summary of the document
summary_uri (str): the Storage URI of the summary TXT document
timestamp (datetime): when the processing occurred
"""
if (project_id == "") or (dataset_id == "") or (table_id == ""):
return [ValueError("project_id, dataset_id, or table_id is missing")]
if (
(bucket == "")
and (filename == "")
and (complete_text == "")
and (summary_uri == "")
and (summary == "")
and (complete_text_uri == "")
and (timestamp is None)
):
return [ValueError("no row data provided for updating table")]
    client = bigquery.Client()
    table_name = f"{project_id}.{dataset_id}.{table_id}"

    # Each row is a JSON-serializable mapping of column name to value.
    rows_to_insert = [
        {
            "bucket": bucket,
            "filename": filename,
            "extracted_text": complete_text,
            "summary_uri": summary_uri,
            "summary": summary,
            "complete_text_uri": complete_text_uri,
            "timestamp": timestamp.isoformat(),
        }
    ]
    # Stream the row into the table. BigQuery generates a UUID insert ID per
    # row for best-effort deduplication; an empty list means the insert succeeded.
    errors = client.insert_rows_json(
        table_name, rows_to_insert, row_ids=bigquery.AutoRowIDs.GENERATE_UUID
    )
    return errors
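

# A minimal usage sketch, not part of the original sample. The project, dataset,
# table, bucket, and file names below are hypothetical placeholders; the target
# table is assumed to already exist with columns matching the row mapping above.
if __name__ == "__main__":
    insert_errors = write_summarization_to_table(
        project_id="my-project",
        dataset_id="summaries",
        table_id="summarizations",
        bucket="my-bucket",
        filename="docs/report.pdf",
        complete_text="...full extracted text of the PDF...",
        complete_text_uri="gs://my-bucket/docs/report.txt",
        summary="A short summary of the report.",
        summary_uri="gs://my-bucket/docs/report_summary.txt",
        timestamp=datetime.now(),
    )
    if insert_errors:
        print(f"BigQuery reported errors: {insert_errors}")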