colab-enterprise/rideshare_llm_step_01_customer_sentiment_analysis.ipynb (294 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "hmBT4GGX5Xd2" }, "source": [ "# Determine the sentiment of customer review\n", "1. Reset the data (optional)\n", "2. Create a LLM prompt for the LLM model to determine the sentiment\n", "3. Score the data in batches\n", "4. Extract the text from the scored Json result" ] }, { "cell_type": "markdown", "metadata": { "id": "InTiZalrwUjD" }, "source": [ "## Generate the prompts (Enriched Zone: customer_review)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "I2Vm5bES1wcM" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "-- OPTIONAL: Reset all the fields to null\n", "-- If you need to reset you data back to fresh data run the stored procedure: CALL `${project_id}.${bigquery_rideshare_llm_curated_dataset}.sp_reset_demo`();\n", "/*\n", "UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n", " SET llm_sentiment_prompt = NULL,\n", " raw_sentiment_json = NULL,\n", " review_sentiment = NULL\n", " WHERE TRUE;\n", "*/" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pL3k7_jIwfMn" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "-- Create the LLM prompt to determine the sentiment of each review\n", "UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n", " SET llm_sentiment_prompt = CONCAT('For the given review classify the sentiment as Positive, Neutral or Negative.','\\n','Review: ',customer_review_text)\n", " WHERE llm_sentiment_prompt IS NULL;" ] }, { "cell_type": "markdown", "metadata": { "id": "say5TbngwlmY" }, "source": [ "## Score all items in batches\n", "- Find all records that have not been scored\n", "- Score in a batch (we can do up to 10,000)\n", "- The LLM temperature, max_output_tokens, top_p and top_k parameters have been set (locked for a deterministic value)\n", "- Repeat until done" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Ge8WWnQmyMGY" }, "outputs": [], "source": [ "from google.cloud import bigquery\n", "import pandas as pd\n", "\n", "client = bigquery.Client()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7xXZQtU20iWI" }, "outputs": [], "source": [ "# Process in batches\n", "batch_size = 100\n", "\n", "# Set the parameters so we are more deterministic and less creative/random responses\n", "llm_temperature = 1\n", "llm_max_output_tokens = 50\n", "llm_top_p = 0\n", "llm_top_k = 1\n", "\n", "update_sql=\"\"\"\n", "UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n", " SET raw_sentiment_json = child.ml_generate_text_result\n", " FROM (SELECT *\n", " FROM ML.GENERATE_TEXT(MODEL`${project_id}.${bigquery_rideshare_llm_enriched_dataset}.gemini_model`,\n", " (SELECT trip_id,\n", " llm_sentiment_prompt AS prompt\n", " FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n", " WHERE (raw_sentiment_json IS NULL\n", " OR\n", " JSON_VALUE(raw_sentiment_json, '$.candidates[0].content.parts[0].text') IS NULL\n", " )\n", " AND customer_review_text IS NOT NULL\n", " AND llm_sentiment_prompt IS NOT NULL\n", " AND review_sentiment IS NULL\n", " LIMIT {batch_size}),\n", " STRUCT(\n", " {llm_temperature} AS temperature,\n", " {llm_max_output_tokens} AS max_output_tokens,\n", " {llm_top_p} AS top_p,\n", " {llm_top_k} AS top_k\n", " ))\n", " ) AS child\n", "WHERE customer_review.trip_id = child.trip_id;\n", " \"\"\".format(batch_size = batch_size,\n", " llm_temperature = llm_temperature,\n", " llm_max_output_tokens = llm_max_output_tokens,\n", " llm_top_p = llm_top_p,\n", " llm_top_k = llm_top_k)\n", "\n", "print(\"SQL: {update_sql}\".format(update_sql=update_sql))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "w_rkIbY7wpwg" }, "outputs": [], "source": [ "# Score while records remain\n", "# Score in groups of batch_size records (we can do up to 10,000 at a time)\n", "import time\n", "\n", "done = False\n", "original_record_count = 0\n", "displayed_first_sql = False\n", "\n", "while done == False:\n", " # Get the count of records to score\n", " sql = \"\"\"\n", " SELECT COUNT(*) AS cnt\n", " FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n", " WHERE (raw_sentiment_json IS NULL\n", " OR\n", " JSON_VALUE(raw_sentiment_json, '$.candidates[0].content.parts[0].text') IS NULL\n", " )\n", " AND customer_review_text IS NOT NULL\n", " AND llm_sentiment_prompt IS NOT NULL\n", " AND review_sentiment IS NULL\n", " \"\"\"\n", "\n", " df_record_count = client.query(sql).to_dataframe()\n", " cnt = df_record_count['cnt'].head(1).item()\n", " if displayed_first_sql == False:\n", " original_record_count = cnt\n", " displayed_first_sql = True\n", "\n", " print(\"Remaining records to process: \", cnt, \" out of\", original_record_count, \" batch_size: \", batch_size)\n", "\n", " if cnt == 0:\n", " done = True\n", " else:\n", " # https://github.com/googleapis/python-bigquery/tree/master/samples\n", " job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n", " query_job = client.query(update_sql, job_config=job_config)\n", "\n", " # Check on the progress by getting the job's updated state.\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n", "\n", " while query_job.state != \"DONE\":\n", " time.sleep(5)\n", " query_job = client.get_job(\n", " query_job.job_id, location=query_job.location\n", " )\n", " print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n" ] }, { "cell_type": "markdown", "metadata": { "id": "snavNXz63Rd2" }, "source": [ "## Parse the LLM results to get a rating\n", "- Rating should be Positive, Neutral or Negative" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "z--dotG43ryD" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "SELECT trip_id, llm_sentiment_prompt, raw_sentiment_json\n", " FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n", " WHERE llm_sentiment_prompt IS NOT NULL\n", " AND raw_sentiment_json IS NOT NULL\n", " AND review_sentiment IS NULL\n", "LIMIT 20;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JE43nGB93XOC" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n", " SET review_sentiment = JSON_VALUE(raw_sentiment_json, '$.candidates[0].content.parts[0].text')\n", " WHERE review_sentiment IS NULL\n", " AND raw_sentiment_json IS NOT NULL\n", ";" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QrTAZu9235YU" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "SELECT trip_id, llm_sentiment_prompt, raw_sentiment_json, review_sentiment\n", " FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n", " WHERE review_sentiment IS NOT NULL\n", " AND raw_sentiment_json IS NOT NULL\n", "LIMIT 20;" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HVfYUdGoZVZ3" }, "outputs": [], "source": [ "%%bigquery\n", "\n", "SELECT review_sentiment, COUNT(*) AS cnt\n", " FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n", " WHERE review_sentiment IS NOT NULL\n", " AND raw_sentiment_json IS NOT NULL\n", "GROUP BY 1;\n" ] } ], "metadata": { "colab": { "name": "BigQuery table", "private_outputs": true, "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }