colab-enterprise/gen-ai-demo/Customer-Reviews-Detect-Themes-GenAI.ipynb (378 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## <img src=\"https://lh3.googleusercontent.com/mUTbNK32c_DTSNrhqETT5aQJYFKok2HB1G2nk2MZHvG5bSs0v_lmDm_ArW7rgd6SDGHXo0Ak2uFFU96X6Xd0GQ=w160-h128\" width=\"45\" valign=\"top\" alt=\"BigQuery\"> Detecting Themes within Customer Reviews using GenAI\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### License"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"##################################################################################\n",
"# Copyright 2024 Google LLC\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"# \n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"# \n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"###################################################################################"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Notebook Overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- This notebook will use GenAI to detect the prevailing theme of each customer review. The themes can then be summarized for each location within the web application.\n",
" - Example Themes:\n",
" - \"Good value\"\n",
" - \"Long Wait Time\"\n",
"\n",
"- Notebook Logic:\n",
" 1. Gather the reviews that currently have not had their theme extracted.\n",
"\n",
" 2. Create our LLM prompt:\n",
" - The prompt will contain the review text.\n",
" - The prompt will contain the list of themes that we want the LLM to determine matches.\n",
" - The prompt will ask the LLM to provide an explanation of its thought process.\n",
" - The prompt will provide example JSON results so the LLM knows the output format to generate the results.\n",
"\n",
" 3. Execute our LLM prompt using BigFrames\n",
"\n",
" 4. Parse the JSON from the LLM and construct an UPDATE SQL statement so we can update the customer review table.\n",
" - We check for valid JSON in case the LLM created incomplete or invalid JSON\n",
" \n",
" 5. Execute the SQL statement against BigQuery."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DszuLZoo9A7k"
},
"source": [
"## Initialize Python"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bhKxJadjWa1R"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"import bigframes.pandas as bf\n",
"from bigframes.ml.llm import GeminiTextGenerator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MSC6-rboip3h"
},
"outputs": [],
"source": [
"# BigQuery client (used below to read unprocessed reviews and to run the\n",
"# UPDATE statements via RunQuery).\n",
"from google.cloud import bigquery\n",
"client = bigquery.Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lpU1P_fAXviJ"
},
"outputs": [],
"source": [
"# NOTE: the ${...} values are template placeholders substituted at\n",
"# deployment time; do not edit them by hand.\n",
"PROJECT_ID = \"${project_id}\"\n",
"REGION = \"us\"\n",
"DATASET_ID = \"${bigquery_data_beans_curated_dataset}\"\n",
"CONNECTION_NAME = \"vertex-ai\"\n",
"\n",
"# Connection for BigFrames\n",
"connection = f\"{PROJECT_ID}.{REGION}.{CONNECTION_NAME}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qMijiOqUZmAp"
},
"outputs": [],
"source": [
"# Point the BigFrames session at this notebook's project and region.\n",
"# bf.reset_session() # if you need to change the region\n",
"bf.options.bigquery.project = PROJECT_ID\n",
"bf.options.bigquery.location = REGION"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zy7lRW09Ws1J"
},
"outputs": [],
"source": [
"# Create the Gemini model handle used by LLM(); it runs through the\n",
"# BigQuery connection built above (PROJECT_ID.REGION.CONNECTION_NAME).\n",
"session = bf.get_global_session()\n",
"\n",
"gemini_model = GeminiTextGenerator(session=session, connection_name=connection)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6YeExbVqf4ZE"
},
"source": [
"## Supporting Functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QtSLYNa_exfc"
},
"outputs": [],
"source": [
"def PrettyPrintJson(json_string):\n",
"    \"\"\"Parses json_string, prints it with 2-space indentation, and returns\n",
"    it re-serialized as a compact JSON string.\n",
"\n",
"    Raises json.JSONDecodeError when json_string is not valid JSON; the\n",
"    caller (LLM) relies on this to detect malformed model output.\n",
"    \"\"\"\n",
"    parsed = json.loads(json_string)\n",
"    print(json.dumps(parsed, indent=2))\n",
"    return json.dumps(parsed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sthZI1h2fe8H"
},
"outputs": [],
"source": [
"def LLM(prompt, isOutputJson, max_output_tokens=1024, temperature=0, top_p=0, top_k=1):\n",
"    \"\"\"Runs a prompt through the Gemini model and returns the cleaned output.\n",
"\n",
"    Args:\n",
"      prompt: Full prompt text to send to the model.\n",
"      isOutputJson: When True, the output is validated and re-serialized as JSON.\n",
"      max_output_tokens, temperature, top_p, top_k: Generation settings.\n",
"\n",
"    Returns:\n",
"      The cleaned model output with single quotes escaped for SQL, or None\n",
"      when the output could not be processed (e.g. invalid JSON).\n",
"    \"\"\"\n",
"    print()\n",
"    print(\"Prompt: \", prompt)\n",
"    print()\n",
"    df_prompt = pd.DataFrame(\n",
"        {\n",
"            \"prompt\": [prompt],\n",
"        })\n",
"    bf_df_prompt = bf.read_pandas(df_prompt)\n",
"    prediction = gemini_model.predict(bf_df_prompt,\n",
"                                      max_output_tokens=max_output_tokens,\n",
"                                      temperature=temperature,  # 0 to 1 (1 random)\n",
"                                      top_p=top_p,  # 0 to 1 (1 random)\n",
"                                      top_k=top_k,  # (1 to 40 random)\n",
"                                      ).to_pandas()\n",
"    result = None  # pre-initialized so the except block can always print it\n",
"    try:\n",
"        result = prediction['ml_generate_text_llm_result'][0]\n",
"\n",
"        # Remove common LLM output mistakes (markdown code fences).\n",
"        for fence in [\"```json\\n\", \"```JSON\\n\", \"```json\", \"```JSON\",\n",
"                      \"```sql\\n\", \"```SQL\\n\", \"```sql\", \"```SQL\", \"```\"]:\n",
"            result = result.replace(fence, \"\")\n",
"\n",
"        if isOutputJson:\n",
"            # PrettyPrintJson raises if the LLM returned incomplete/invalid JSON.\n",
"            json_string = PrettyPrintJson(result)\n",
"            json_string = json_string.replace(\"'\", \"\\\\'\")\n",
"            return json_string.strip()\n",
"        else:\n",
"            result = result.replace(\"'\", \"\\\\'\")\n",
"            return result.strip()\n",
"\n",
"    except Exception as error:\n",
"        # Was a bare 'except:' that re-read prediction[...] (re-raising the\n",
"        # original error) and could hit an unbound 'result'. Falls through\n",
"        # to return None so the caller retries.\n",
"        print(\"Error: \", error)\n",
"        print(\"Error (result): \", result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "DFqXl0cefaOe"
},
"outputs": [],
"source": [
"# Runs a query against BigQuery and waits for it to complete\n",
"def RunQuery(sql):\n",
"    \"\"\"Executes sql as an interactive BigQuery job and polls until DONE.\n",
"\n",
"    Returns:\n",
"      True when the job finished without an error result, False otherwise.\n",
"    \"\"\"\n",
"    import time\n",
"\n",
"    job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n",
"    query_job = client.query(sql, job_config=job_config)\n",
"\n",
"    # Check on the progress by getting the job's updated state.\n",
"    query_job = client.get_job(query_job.job_id, location=query_job.location)\n",
"    print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n",
"\n",
"    while query_job.state != \"DONE\":\n",
"        time.sleep(2)\n",
"        query_job = client.get_job(query_job.job_id, location=query_job.location)\n",
"        print(\"Job {} is currently in state {} with error result of {}\".format(query_job.job_id, query_job.state, query_job.error_result))\n",
"\n",
"    # 'is None' is the idiomatic None check (PEP 8); replaces the original\n",
"    # 'if query_job.error_result == None: return True / else: return False'.\n",
"    return query_job.error_result is None"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "mOtL7RgiZox9"
},
"source": [
"## Detect Customer Themes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "y7r3Kzovd-y2"
},
"outputs": [],
"source": [
"# Gather the unprocessed customer reviews\n",
"sql = \"\"\"SELECT customer_review_id,\n",
"                review_text\n",
"           FROM `${project_id}.${bigquery_data_beans_curated_dataset}.customer_review`\n",
"          WHERE llm_detected_theme IS NULL\n",
"          ORDER BY customer_review_id\"\"\"\n",
"\n",
"df_process = client.query(sql).to_dataframe()\n",
"\n",
"# Cap retries per review so a persistently failing LLM call cannot hang\n",
"# the notebook in an infinite loop (the original retried forever).\n",
"MAX_ATTEMPTS = 5\n",
"\n",
"# Loop through the results\n",
"for row in df_process.itertuples():\n",
"    customer_review_id = row.customer_review_id\n",
"    review_text = row.review_text\n",
"\n",
"    llm_valid_execution = False\n",
"    attempts = 0\n",
"    while not llm_valid_execution and attempts < MAX_ATTEMPTS:\n",
"        attempts += 1\n",
"        try:\n",
"            prompt = \"\"\"\n",
"            Classify the below customer review as one or more of the below themes.\n",
"            - Return the results the below json format.\n",
"            - Include an explaination for selecting each theme.\n",
"            - Do not include double quotes in the explaination.\n",
"            - Do not include any special characters, double quotes or \"```json\" in the json output.\n",
"\n",
"            Themes\n",
"            - \"Bad Service\"\n",
"            - \"Long Wait Time\"\n",
"            - \"Slow Service\"\n",
"            - \"Dirty\"\n",
"            - \"Overpriced\"\n",
"            - \"Overcrowded\"\n",
"            - \"Noisy Location\"\n",
"            - \"Lack of Allergan Information\"\n",
"            - \"Inconsistent Quality\"\n",
"            - \"Lack of Seating\"\n",
"            - \"No Flavor\"\n",
"            - \"Too weak\"\n",
"            - \"Too strong\"\n",
"            - \"Too bitter\"\n",
"            - \"Limited Menu\"\n",
"            - \"Good Service\"\n",
"            - \"Short Wait Time\"\n",
"            - \"Fast Service\"\n",
"            - \"Clean\"\n",
"            - \"Good value\"\n",
"            - \"Cozy Seating Areas\"\n",
"            - \"Quite Location\"\n",
"            - \"Variety of Milk Alternatives\"\n",
"            - \"Consistent Quality\"\n",
"            - \"Lots of places to sit\"\n",
"            - \"Lots of Flavor\"\n",
"            - \"Good Taste\"\n",
"            - \"Good Selection\"\n",
"\n",
"            JSON format: [{ \"theme\" : \"value\", \"explaination\" : \"llm explaination\" }]\n",
"            Sample JSON Response: [{ \"theme\" : \"Fast Service\", \"explaination\" : \"The customer got their order fast.\" }]\n",
"            Sample JSON Response: [{ \"theme\" : \"Overpriced\", \"explaination\" : \"The customer said it was too expensive.\" }]\n",
"\n",
"            Review:\"\"\"\n",
"            prompt = prompt + review_text\n",
"\n",
"            json_result = LLM(prompt, True, max_output_tokens=1024, temperature=0, top_p=0, top_k=1)\n",
"            print(f\"json_result: {json_result}\")\n",
"\n",
"            if json_result is None:\n",
"                # LLM() could not produce valid JSON; retry (up to the cap).\n",
"                llm_valid_execution = False\n",
"            else:\n",
"                sql = f\"\"\"UPDATE `${project_id}.${bigquery_data_beans_curated_dataset}.customer_review`\n",
"                             SET llm_detected_theme = JSON'{json_result}'\n",
"                           WHERE customer_review_id = {customer_review_id}\n",
"                       \"\"\"\n",
"\n",
"                print(f\"sql: {sql}\")\n",
"\n",
"                llm_valid_execution = RunQuery(sql)\n",
"        except Exception as error:\n",
"            print(\"An error occurred:\", error)\n",
"\n",
"    if not llm_valid_execution:\n",
"        print(f\"Giving up on customer_review_id {customer_review_id} after {MAX_ATTEMPTS} attempts.\")"
]
},
],
"metadata": {
"colab": {
"collapsed_sections": [
"DszuLZoo9A7k",
"6YeExbVqf4ZE"
],
"name": "BigQuery table",
"private_outputs": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}