In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sentiment Analysis for large scale data using LLM (Gemini)

<table align="left">
<td>
<a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/sentiment_analysis/sentiment_analysis_movie_reviews.ipynb">
<img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
View on GitHub
</a>
</td>
<td>
<a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/generative_ai/sentiment_analysis/sentiment_analysis_movie_reviews.ipynb">
<img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
</a>
</td>
</table>

## Overview

This notebook shows how to perform sentiment analysis on large-scale data using an LLM.
The dataset used is a public dataset from BigQuery Public Datasets.

#### **Steps**
Using Spark:
1) This notebook reads data from the BigQuery public dataset **bigquery-public-data.imdb.reviews**
2) It calls [Vertex AI Gemini API](https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/api-quickstart#try_text_prompts) to find the sentiment of each review (positive vs negative)
3) We compare the predicted sentiment with the dataset label side by side
4) We trim whitespace from the predicted values and the labels, then compute the accuracy

#### Related content

- [Text Prompt](https://cloud.google.com/vertex-ai/docs/generative-ai/text/text-prompts)
- [Content Classification](https://cloud.google.com/vertex-ai/docs/generative-ai/text/text-prompts#content-classification)

In [None]:
import time

import google.auth
import google.auth.transport.requests
import requests
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, StringIndexerModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

In [None]:
# When using Dataproc Serverless, installed packages are automatically available on all nodes
!pip install --upgrade google-cloud-aiplatform google-cloud-vision
# When using a Dataproc cluster, you will need to install these packages during cluster creation: https://cloud.google.com/dataproc/docs/tutorials/python-configuration

#### Get credentials to authenticate with Google APIs


In [None]:
# Transport object used to perform the token refresh request.
auth_req = google.auth.transport.requests.Request()

# Resolve the default credentials and the project they belong to,
# then refresh them so a valid access token is available.
credentials, project_id = google.auth.default()
credentials.refresh(auth_req)

### Create Spark Session for the notebook

In [None]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Sentimental Analysis using Dataproc and Vertex LLM")
    .getOrCreate()
)

### Read data from Bigquery Public Dataset 

In [None]:
# Load the IMDB reviews table through the Spark BigQuery data source.
movie_reviews = (
    spark.read
    .format("bigquery")
    .option("table", "bigquery-public-data.imdb.reviews")
    .load()
)

|                                                                                              review|split|   label| movie_id|reviewer_rating|                           movie_url|title|
|----------------------------------------------------------------------------------------------------|-----|--------|---------|---------------|------------------------------------|-----|
|I had to see this on the British Airways plane. It was terribly bad acting and a dumb story. Not ...| test|Negative|tt0158887|              2|http://www.imdb.com/title/tt0158887/| null|
|This is a family movie that was broadcast on my local ITV station at 1.00 am a couple of nights a...| test|Negative|tt0158887|              4|http://www.imdb.com/title/tt0158887/| null|
|I would like to comment on how the girls are chosen. why is that their are always more white wome...| test|Negative|tt0391576|              2|http://www.imdb.com/title/tt0391576/| null|
|Tyra & the rest of the modeling world needs to know that real women like myself and my daughter d...| test|Negative|tt0391576|              3|http://www.imdb.com/title/tt0391576/| null|

### Get Positive Reviews from Dataset

In [None]:
# Keep only the columns needed downstream and take 100 reviews labelled "Positive".
positive_movie_reviews = (
    movie_reviews
    .select("review", "reviewer_rating", "movie_id", "label")
    .filter(col("label") == "Positive")
    .limit(100)
)

### Get Negative Reviews from Dataset

In [None]:
# Keep only the columns needed downstream and take 100 reviews labelled "Negative".
negative_movie_reviews = (
    movie_reviews
    .select("review", "reviewer_rating", "movie_id", "label")
    .filter(col("label") == "Negative")
    .limit(100)
)

### Mix positive and negative 
We take the union of the positive and negative reviews to get a balanced, mixed set of reviews. For the purpose of this notebook, each class has 100 rows.

In [None]:
# Stack the positive sample on top of the negative sample (both have the same four columns).
movie_reviews_mixed = positive_movie_reviews.union(negative_movie_reviews)

|              review|reviewer_rating| movie_id|   label|
|--------------------|---------------|---------|--------|
|This movie is ama...|             10|tt0187123|Positive|
|THE HAND OF DEATH...|             10|tt0187123|Positive|
|The Hand of Death...|              7|tt0187123|Positive|
|Just as a reminde...|             10|tt0163955|Positive|
|Like an earlier c...|              9|tt0163955|Positive|

### Final count is 200 as can be seen below

In [None]:
# Number of rows in the combined sample (each input was limited to 100 rows).
movie_reviews_mixed.count()

### Creating a UDF to get predictions from Gemini Model
The function below takes the prompt containing the text whose sentiment is to be predicted and returns the model's response.

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part , HarmCategory, HarmBlockThreshold

# Initialise the Vertex AI SDK with the project resolved from the default
# credentials and the us-central1 location.
vertexai.init(project=project_id, location="us-central1")

def gemini_predict(prompt):
    """Return the text generated by the Gemini model for ``prompt``.

    The response is streamed and the text of every chunk is concatenated.
    Reading the stream is best-effort: if it fails part-way (for instance
    because a chunk carries no text), the failure is logged and the text
    collected so far is returned, which may be an empty string.

    Args:
        prompt: Prompt text sent to ``generate_content``.

    Returns:
        str: The concatenated text of the streamed response chunks.
    """
    # Imported locally so the function carries its own dependency wherever it runs.
    import logging

    gemini_pro_model = GenerativeModel("gemini-1.0-pro")
    config = {"max_output_tokens": 2048, "temperature": 0.4, "top_p": 1, "top_k": 32}
    safety_config = {
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    }

    prediction = gemini_pro_model.generate_content(
        [prompt],
        generation_config=config,
        safety_settings=safety_config,
        stream=True,
    )

    text_responses = []
    try:
        for response in prediction:
            text_responses.append(response.text)
    except Exception as error:
        # Deliberately best-effort: keep the partial text instead of failing the
        # caller, but do not swallow KeyboardInterrupt/SystemExit and leave a trace.
        logging.getLogger("gemini_predict").warning(
            "Gemini response stream ended early: %s", error
        )
    return "".join(text_responses)

In [None]:
def find_sentiment_zero_shot(text):
    """Classify ``text`` as Negative or Positive with a zero-shot prompt.

    Args:
        text: The review text to classify.

    Returns:
        The raw string returned by ``gemini_predict`` for the prompt.
    """
    prompt = f"""For the given text below, provide the sentiment classification from the two classes mentioned below:
    The two classes are: Negative, Positive.
    Always choose between one of them (the most appropriate one).
    Text: {text}
    Sentiment:"""

    return gemini_predict(prompt)


# The model is sampled with a non-zero temperature, so repeated calls may give
# different answers. Marking the UDF as non-deterministic stops Spark from
# assuming it can freely re-evaluate or duplicate the call.
find_sentiment_zero_shot_udf = udf(find_sentiment_zero_shot).asNondeterministic()

In [None]:
# Show the column names and types of the combined sample before adding predictions.
movie_reviews_mixed.printSchema()

### Get prediction from the LLM using the UDF on the movie reviews

In [None]:
# Add a column holding the model's answer for each review text.
movie_review_sentiment_pred = movie_reviews_mixed.withColumn(
    "predicted_sentiment",
    find_sentiment_zero_shot_udf(movie_reviews_mixed["review"]),
)

In [None]:
# Trim whitespaces from the predicted and actual labels so they can be compared exactly.
# The result is cached here, before any action runs on it: every row costs one
# LLM call and the answers are not repeatable, so all later cells must reuse the
# same predictions instead of triggering new calls.
trimmed_movie_review_sentiment_pred = (
    movie_review_sentiment_pred
    .withColumn("predicted_sentiment", trim(col("predicted_sentiment")))
    .withColumn("label", trim(col("label")))
    .cache()
)

### Let's check the predicted values and do a quick comparison of the predicted sentiment vs. the actual label

In [None]:
# Print up to 200 rows, truncating each cell to 100 characters.
trimmed_movie_review_sentiment_pred.select("predicted_sentiment", "label").show(
    n=200, truncate=100
)

In [None]:
# Ask Spark to keep the prediction DataFrame around for the evaluation cells below.
# NOTE(review): this cell runs after the show() cell above, so that display was
# computed before caching was requested — confirm whether that is intended.
trimmed_movie_review_sentiment_pred.cache()

### Evaluation

Let's index the classes Negative and Positive as numeric values (0 and 1) so that the predictions can be compared with the labels.

In [None]:
inputs = ["predicted_sentiment", "label"]
outputs = ["predicted_sentiment_indexed", "label_indexed"]
prediction_col, label_col = inputs
prediction_indexed_col, label_indexed_col = outputs

# Learn the class -> index mapping from the ground-truth labels only.
# Indexing both columns in one StringIndexer assigns indices per column by
# frequency, so the same class could receive different indices in the two columns.
stringIndexer = StringIndexer(
    inputCol=label_col,
    outputCol=label_indexed_col,
    stringOrderType="alphabetAsc",
)
indexer = stringIndexer.fit(trimmed_movie_review_sentiment_pred)

# Apply the very same mapping to the predictions. Any model output that is not
# one of the known labels is kept under an extra index, so it can never be
# counted as a match.
prediction_indexer = StringIndexerModel.from_labels(
    indexer.labels,
    inputCol=prediction_col,
    outputCol=prediction_indexed_col,
    handleInvalid="keep",
)

movie_review_sentiment_pred_indexed = prediction_indexer.transform(
    indexer.transform(trimmed_movie_review_sentiment_pred)
)

And use the BinaryClassificationEvaluator to output our Area Under the ROC curve (AUC-ROC)

In [None]:
# Evaluate the indexed predictions against the indexed labels using AUC-ROC.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="predicted_sentiment_indexed",
    labelCol="label_indexed",
    metricName="areaUnderROC",
)

area_under_roc = evaluator.evaluate(movie_review_sentiment_pred_indexed)

# The evaluator returns a fraction between 0 and 1; scale it to match the "(%)" label.
print("area_under_roc (%): ", area_under_roc * 100)

Without any prior training or prompt chaining, using only a zero-shot prompt, the model has been able to predict sentiments properly with 94% AUC-ROC.

#### Count the number of unsuccessful predictions

In [None]:
# Flag each row with 1 when the indexed prediction equals the indexed label, else 0.
match_predictions_df = movie_review_sentiment_pred_indexed.withColumn(
    "if_match",
    when(col("predicted_sentiment_indexed") == col("label_indexed"), 1).otherwise(0),
)

In [None]:
# Count the rows whose prediction did not match the label.
match_predictions_df.filter(col("if_match") == 0).count()

#### Percentage Accuracy  

* Total rows = 200
* Mislabeled rows = 10
* Accuracy = (True positives + True negatives) / (True positives + True negatives + False positives + False negatives)
* Percentage accuracy = 190 / 200 = 95%

#### Check the mismatch predictions
Find the mismatched rows and show them.

In [None]:
# Keep only the mismatched rows, with the columns needed to inspect them.
mismatch_df = (
    match_predictions_df
    .filter(col("if_match") == 0)
    .select("predicted_sentiment", "label", "review")
)

In [None]:
# Display the mismatched rows using show()'s default row limit and truncation.
mismatch_df.show()