In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# News summarization with PaLM API

## Overview

This notebook illustrates how to use Vertex AI PaLM text models for news summarization. You will discover the most popular Google Search terms and summarize news articles related to those terms. A system like that could be beneficial in a variety of business situations, including marketing, political analysis, and more.

Trending search terms are retrieved from the [Google Trends dataset](https://console.cloud.google.com/marketplace/product/bigquery-public-datasets/google-search-trends) and news articles from [the GDELT database](https://www.gdeltproject.org/).
The Google Trends dataset contains the top 25 overall and top 25 rising queries from Google Trends in the past 30 days. The dataset is hosted on Google BigQuery as part of the Google Cloud Datasets initiative.

The GDELT Project, which is supported by [Google Jigsaw](https://jigsaw.google.com/), monitors the world's broadcast, print, and web news from nearly every corner of every country in over 100 languages. The GDELT database is free to use and accessible via a variety of interfaces, including Google BigQuery and the REST API. In this notebook, we will be using the REST API.

The notebook is structured as follows:
- You will begin by installing the necessary packages and configuring your GCP environment.
- You will query the Google Trends dataset to retrieve the top search terms.
- You will query the GDELT API to retrieve news related to the top search terms.
- Finally, you will summarize these news articles.



## Install pre-requisites

Install the following Python packages.

In [None]:
! pip install -U google-cloud-aiplatform
! pip install -U python-dateutil
! pip install -U newspaper3k

---

#### ⚠️ Do not forget to RESTART THE RUNTIME before continuing.

---

## Configure Google Cloud environment settings

Set the following constants to reflect your GCP environment.
- `PROJECT_ID`: Your Google Cloud Project ID.
- `REGION`: The region to use for Vertex AI

In [None]:
# Google Cloud project used by both the Vertex AI SDK and the BigQuery client.
# Replace the placeholder with your own project ID before running the notebook.
PROJECT_ID = '<YOUR PROJECT ID HERE>'
# Region passed to vertexai.init().
REGION = 'us-central1'

Initialize the SDK and import some modules.

In [None]:
import logging
import os
import requests
import vertexai

from newspaper import Article
from newspaper import ArticleException

from dateutil.parser import parse as parse_date
from datetime import date, timedelta, datetime
from google.cloud import bigquery
from vertexai.preview.language_models import TextGenerationModel
from typing import Any, Dict, List


# Surface INFO-level log records in the notebook output.
logging.basicConfig(level=logging.INFO)

# Bind the Vertex AI SDK to the project and region configured above.
vertexai.init(location=REGION, project=PROJECT_ID)

# Shared handles used by the cells below: a BigQuery client for the Google
# Trends lookup and the PaLM text model used for summarization.
bq_client = bigquery.Client(project=PROJECT_ID)
llm = TextGenerationModel.from_pretrained("text-bison@001")

### Google Trends lookup tool

Returns top (rank 1) search term(s) for a given date.

In [None]:
class GoogleTrends:
    """Look up the top Google Search term for a given date.

    Queries the `bigquery-public-data.google_trends.top_terms` table through
    the supplied BigQuery client. The input to `run` is a dict (JSON-like
    object) with an optional `date` field.
    """

    # Requests for today, a future date, or a date this many days old (or
    # older) are rejected without querying; see `_parse_date`.
    MAX_AGE_DAYS = 30

    def __init__(
            self,
            project_id: str,
            bq_client: Any):
        self.project_id = project_id
        self.bq_client = bq_client


    def run(self, json_params: Dict):
        """Return the words of the top-ranked search term for the requested date.

        Args:
            json_params: dict that may contain a `date` string.

        Returns:
            The first term of the rank-ordered query result, split on single
            spaces. An empty list is returned when the date is rejected by
            `_parse_date` or when the query yields no rows.
        """
        refresh_date = self._parse_date(json_params)
        if not refresh_date:
            return []

        df = self._query_top_terms(refresh_date)
        # An in-range date can still have no rows; indexing row 0 of an empty
        # frame would raise, so report "no terms" instead.
        if df.empty:
            return []

        # First row, first column: the `term` of the best-ranked entry.
        return df.iloc[0, 0].split(' ')


    def _query_top_terms(self, date: str):
        """Retrieve the distinct (term, rank) pairs for one refresh date.

        Args:
            date: refresh date formatted as YYYY-MM-DD. Callers in this class
                always pass the `strftime` output of `_parse_date`, never raw
                user text.

        Returns:
            A DataFrame with columns `term` and `rank`, ordered by ascending rank.
        """
        query = f"""
            SELECT term, rank FROM `bigquery-public-data.google_trends.top_terms`
            WHERE refresh_date = '{date}'
            GROUP BY 1,2
            ORDER by rank ASC
        """
        query_job = self.bq_client.query(
            query,
            location="US",
        )

        return query_job.to_dataframe()


    def _parse_date(self, json_params: Dict):
        """Normalise the requested date to YYYY-MM-DD, or reject it.

        Args:
            json_params: dict that may contain a `date` string in any format
                `dateutil` can parse.

        Returns:
            The date as a 'YYYY-MM-DD' string when it lies strictly between
            `MAX_AGE_DAYS` days ago and today; otherwise an empty string. A
            missing or unparsable date falls back to today and is therefore
            rejected as well.
        """
        # Evaluate "today" once so every comparison uses the same reference day.
        today = date.today()
        dt = today

        if 'date' in json_params:
            try:
                dt = parse_date(json_params['date']).date()
            except (ValueError, OverflowError, TypeError):
                # Unparsable or non-string input: keep the fallback of today.
                dt = today

        if dt >= today or dt <= today - timedelta(days=self.MAX_AGE_DAYS):
            return ""

        return dt.strftime('%Y-%m-%d')

In [None]:
# Google Trends dataset in BigQuery only stores data from the past month.
# Change to a valid date: GoogleTrends._parse_date rejects today, future dates,
# and dates 30 or more days old, in which case run() returns an empty list
# without querying BigQuery.

google_trends_tool = GoogleTrends(project_id=PROJECT_ID, bq_client=bq_client)
google_trends_tool.run({'date':'11-24-2023'})

### GDELT Retriever

The `GDELT Retriever` obtains information about articles that match a set of keywords. The class takes a JSON object as input, with the following format:

```
{
    "date": "05-16-2023",
    "keywords": ["Real", "Madrid"],
    "tone": "positive"
}
```

The script will search for articles published between the dates `date - time_window_days` and `date + time_window_days`, where `time_window_days` is a configurable parameter. The `tone` field can be either `positive`, `negative`, or `unknown`. Please refer to the [GDELT documentation](https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/#:~:text=theme%3ATERROR-,Tone,-.%20Allows%20you%20to) for more information on the tone setting. We define `positive` as a tone value greater than 5 and `negative` as a tone value less than -5.

Besides the time_window, there are other configuration parameters that control the results, including the maximum number of returned records (`max_records`) and the maximum distance between the keywords in an article (`n_near_words`).

The script returns a text that is a compilation of article titles and their content.

Let's start by defining the retriever.

In [None]:
class GDELTRetriever:
    """Retrieve news articles that match a set of keywords via the GDELT DOC API.

    The retriever asks GDELT for a list of matching articles, downloads and
    parses each one with `newspaper`, and returns the articles that yielded
    text as plain dicts.
    """

    # Maps the `tone` constructor argument to the GDELT query operator. Any
    # other value (for example 'unknown') applies no tone filter.
    _TONE_FILTERS = {'positive': 'tone>5', 'negative': 'tone<-5'}

    def __init__(self, max_records: int = 10, tone: str = 'positive'):
        """Configure the retriever.

        Args:
            max_records: maximum number of articles requested from GDELT.
            tone: 'positive' or 'negative' to filter by tone; any other value
                disables the tone filter.
        """
        self.gdelt_api_url: str = 'https://api.gdeltproject.org/api/v2/doc/doc'
        self.mode: str = 'ArtList'
        self.format: str = 'json'
        self.max_records: int = max_records
        self.n_near_words: int = 50
        self.source_country: str = 'US'
        self.source_lang: str = 'english'
        self.time_window_days: int = 3
        # Seconds to wait for the GDELT API before giving up, so a stalled
        # connection cannot hang the notebook indefinitely.
        self.request_timeout_s: int = 30
        # Always defined, so building a query never hits a missing attribute.
        self.tone: str = self._TONE_FILTERS.get(tone, '')


    def _build_query(self, keywords: List[str]) -> str:
        """Assemble the GDELT `query` string for the given keywords."""
        parts = [
            f'near{self.n_near_words}:"{" ".join(keywords)}"',
            f'sourcecountry:{self.source_country}',
            f'sourcelang:{self.source_lang}',
            self.tone,
        ]
        # Skip empty parts (no tone filter) to avoid a dangling separator.
        return ' '.join(part for part in parts if part)


    def _get_articles_info(
            self,
            keywords: List[str],
            startdatetime: datetime,
            enddatetime: datetime) -> Dict:
        """Call the GDELT API and return its decoded JSON response.

        Args:
            keywords: words that must appear near each other in an article.
            startdatetime: start of the search window.
            enddatetime: end of the search window.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        params = {'query': self._build_query(keywords),
                  'format': self.format,
                  'mode': self.mode,
                  'maxrecords': str(self.max_records),
                  'startdatetime': startdatetime.strftime('%Y%m%d%H%M%S'),
                  'enddatetime': enddatetime.strftime('%Y%m%d%H%M%S')}

        response = requests.get(
            self.gdelt_api_url,
            params=params,
            timeout=self.request_timeout_s)
        response.raise_for_status()
        return response.json()


    def _parse_article(self, url: str) -> "Article":
        """Download and parse one article.

        On an `ArticleException` the failure is logged and a placeholder
        `Article` that was never downloaded is returned; `_get_documents`
        only keeps articles whose `text` is non-empty.
        """
        article = Article(url)

        try:
            article.download()
            article.parse()
        except ArticleException as err:
            logging.warning('Could not download or parse %s: %s', url, err)
            return Article("www.google.com")
        else:
            return article


    def _get_documents(self, articles: Dict) -> List[Dict]:
        """Convert a GDELT article list into documents with full text.

        Articles without parsed text are dropped, and only the first article
        for each title is kept.

        Args:
            articles: decoded GDELT response; a response without an
                `articles` key is treated as "no results".

        Returns:
            A list of dicts with keys `page_content`, `title`, `url`,
            `domain` and `date`.
        """
        documents = []
        seen_titles = set()

        for article in articles.get('articles', []):
            title = article['title']
            # Check the title first so duplicates are not downloaded again.
            if title in seen_titles:
                continue

            parsed_article = self._parse_article(article['url'])
            if not (parsed_article and parsed_article.text):
                continue

            seen_titles.add(title)
            documents.append({
                'page_content': parsed_article.text,
                'title': title,
                'url': article['url'],
                'domain': article['domain'],
                'date': article['seendate']
            })
        return documents


    def get_relevant_documents(self, query: Dict) -> List[Dict]:
        """Return parsed articles published around a date that match keywords.

        Args:
            query: dict with `keywords` (list of words) and `date` (a string
                parsable by `dateutil`).

        Returns:
            The documents produced by `_get_documents` for articles found
            between midnight `time_window_days` days before and midnight
            `time_window_days` days after the given date.
        """
        keywords = query['keywords']
        event_date = parse_date(query['date'])
        window = timedelta(days=self.time_window_days)
        midnight = datetime.min.time()
        startdatetime = datetime.combine(event_date - window, midnight)
        enddatetime = datetime.combine(event_date + window, midnight)

        articles = self._get_articles_info(keywords, startdatetime, enddatetime)
        return self._get_documents(articles)

In [None]:
retriever = GDELTRetriever(max_records=10)
documents = retriever.get_relevant_documents(query={'keywords': ['Cowboys'], 'date': '11-24-2023'})

# The search can return no parsable articles, so only preview the first
# document when there is one instead of failing on documents[0].
if documents:
    print('=> Sample article')
    print(documents[0]['page_content'][:200] + '...\n')
print('=> Total number of documents: ' + str(len(documents)))

#### Summarize news articles

This is a function that processes each document in a list using a configured LLM and a prompt. The script returns the list of responses together with original documents.

In [None]:
def summarize_news_article(document: Dict, llm):
    """Ask the LLM for a one-sentence summary of a single article.

    The article text is read from `document['page_content']`, embedded in the
    prompt below and sent to `llm.predict`. The text of the response is stored
    under `document['summary']`.

    Args:
        document: article dict; it is modified in place.
        llm: object exposing `predict(prompt)` whose result has a `.text`.

    Returns:
        The same `document` dict, now carrying the `summary` key.
    """
    article_prompt = f"""Write a one sentence summary of the following article delimited by triple backticks:

    ```{document['page_content']}```
    """
    response = llm.predict(article_prompt)
    document['summary'] = response.text
    return document


def summarize_documents(documents: List[Dict], llm) -> List[Dict]:
    """Summarize every document in a list with the given LLM.

    Args:
        documents: list of article dicts, each passed to
            `summarize_news_article` in order.
        llm: model handle forwarded to `summarize_news_article`.

    Returns:
        A new list holding the result of `summarize_news_article` for each
        input document, in the same order. An empty input gives an empty list.
    """
    return [summarize_news_article(document, llm) for document in documents]

In [None]:
# Summarize only the first retrieved article; the returned dict is that same
# document with a 'summary' key added.
summarize_news_article(documents[0], llm)

In [None]:
# Summarize every retrieved article (one LLM call per document, including the
# first one again) and display the resulting list.
summarize_documents(documents=documents, llm=llm)