colab-enterprise/rideshare_llm_step_02_driver_themes.ipynb (290 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "YUsBkE4F_iHn"
},
"source": [
"# Generate the Driver Profile (Enriched Zone: driver) using LLMs\n",
"- Create a Qualitative data analysis for Driver Habits\n",
" - We have found some patterns that we want to dive deeper into:\n",
" - trunk space\n",
" - driving speed\n",
" - hours worked\n",
" - preferred pickup locations\n",
" - average trip distance\n",
" - crossing state lines\n",
" - speaks spanish\n",
" - vechicle cleanliness\n",
" - vechile temperature\n",
" - maximum passengers\n",
" - conversation with customer\n",
" - music playing\n",
" - distracted driver\n",
" - target pay"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AzafP3BhPEFr"
},
"source": [
"## Score all items in batches\n",
"- Find all records that have not been scored\n",
"- Score in a batch (we can do up to 10,000)\n",
"- The LLM temperature, max_output_tokens, top_p and top_k parameters have been set (locked for a deterministic value)\n",
"- Repeat until done"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vEOtezvfl-SG"
},
"outputs": [],
"source": [
"from google.cloud import bigquery\n",
"import pandas as pd\n",
"\n",
"client = bigquery.Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "P0pnkbK2njYZ"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- OPTIONAL: Reset all the fields to null\n",
"-- If you need to reset you data back to fresh data run the stored procedure: CALL `${project_id}.${bigquery_rideshare_llm_curated_dataset}.sp_reset_demo`();\n",
"/*\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n",
" SET extracted_driver_theme_json = NULL,\n",
" extracted_driver_theme = NULL\n",
" WHERE TRUE;\n",
"*/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NItw0Av6FS4Q"
},
"outputs": [],
"source": [
"# need quotes around each category; otherwise, we get part of the category text\n",
"prompt = \"\"\"\n",
"Classify the text as one of the following categories:\n",
"- \"trunk space small\"\n",
"- \"trunk space large\"\n",
"- \"driving too fast\"\n",
"- \"driving too slow\"\n",
"- \"driver speaks spanish\"\n",
"- \"driver does not speak spanish\"\n",
"- \"clean car\"\n",
"- \"dirty car\"\n",
"- \"car too hot\"\n",
"- \"car too cold\"\n",
"- \"driver likes conversation\"\n",
"- \"driver likes no conversation\"\n",
"- \"driver likes music\"\n",
"- \"driver likes no music\"\n",
"- \"distracted driver\"\n",
"\n",
"Text:\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "uubVBImVl9Y_"
},
"outputs": [],
"source": [
"# Proces in batches\n",
"batch_size = 100\n",
"\n",
"# Set the parameters so we are more deterministic and less creative/random responses\n",
"llm_temperature = 1\n",
"llm_max_output_tokens = 50\n",
"llm_top_p = 0\n",
"llm_top_k = 1\n",
"\n",
"update_sql=\"\"\"\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n",
" SET extracted_driver_theme_json = child.ml_generate_text_result\n",
" FROM (SELECT *\n",
" FROM ML.GENERATE_TEXT(MODEL`${project_id}.${bigquery_rideshare_llm_enriched_dataset}.gemini_model`,\n",
" (SELECT trip_id,\n",
" customer_id,\n",
" CONCAT(\\\"\\\"\\\"{prompt}\\\"\\\"\\\",customer_review_text) AS prompt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n",
" WHERE (extracted_driver_theme_json IS NULL\n",
" OR\n",
" JSON_VALUE(extracted_driver_theme_json, '$.candidates[0].content.parts[0].text') IS NULL\n",
" )\n",
" AND customer_review_text IS NOT NULL\n",
" LIMIT {batch_size}),\n",
" STRUCT(\n",
" {llm_temperature} AS temperature,\n",
" {llm_max_output_tokens} AS max_output_tokens,\n",
" {llm_top_p} AS top_p,\n",
" {llm_top_k} AS top_k\n",
" ))\n",
" ) AS child\n",
"WHERE customer_review.trip_id = child.trip_id\n",
" AND customer_review.customer_id = child.customer_id;\n",
" \"\"\".format(batch_size = batch_size,\n",
" prompt = prompt,\n",
" llm_temperature = llm_temperature,\n",
" llm_max_output_tokens = llm_max_output_tokens,\n",
" llm_top_p = llm_top_p,\n",
" llm_top_k = llm_top_k)\n",
"\n",
"print(\"SQL: {update_sql}\".format(update_sql=update_sql))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yNJYtvg1mIPY"
},
"outputs": [],
"source": [
"# Score while records remain\n",
"# score in groups of batch_size records (we can do up to 10,000 at a time)\n",
"import time\n",
"\n",
"done = False\n",
"original_record_count = 0\n",
"displayed_first_sql = False\n",
"\n",
"while done == False:\n",
" # Get the count of records to score\n",
" sql = \"\"\"\n",
" SELECT COUNT(*) AS cnt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n",
" WHERE (extracted_driver_theme_json IS NULL\n",
" OR\n",
" JSON_VALUE(extracted_driver_theme_json, '$.candidates[0].content.parts[0].text') IS NULL\n",
" )\n",
" AND customer_review_text IS NOT NULL;\n",
" \"\"\"\n",
"\n",
" df_record_count = client.query(sql).to_dataframe()\n",
" cnt = df_record_count['cnt'].head(1).item()\n",
" if displayed_first_sql == False:\n",
" original_record_count = cnt\n",
" displayed_first_sql = True\n",
"\n",
" print(\"Remaining records to process: \", cnt, \" out of\", original_record_count, \" batch_size: \", batch_size)\n",
"\n",
" if cnt == 0:\n",
" done = True\n",
" else:\n",
" # https://github.com/googleapis/python-bigquery/tree/master/samples\n",
" job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n",
" query_job = client.query(update_sql, job_config=job_config)\n",
"\n",
" # Check on the progress by getting the job's updated state.\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n",
"\n",
" while query_job.state != \"DONE\":\n",
" time.sleep(5)\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ymDrniXkoTrx"
},
"source": [
"## Parse the LLM JSON results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eLhmYOSkoUAB"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n",
" SET extracted_driver_theme = JSON_VALUE(extracted_driver_theme_json, '$.candidates[0].content.parts[0].text')\n",
" WHERE extracted_driver_theme_json IS NOT NULL\n",
" AND extracted_driver_theme IS NULL;"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2Ip7NyPVoT6g"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"SELECT trip_id, customer_review_text, extracted_driver_theme\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n",
" WHERE extracted_driver_theme_json IS NOT NULL\n",
" AND extracted_driver_theme IS NOT NULL\n",
"LIMIT 20;"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9hFB30qKdHH1"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"SELECT extracted_driver_theme, count(*) AS cnt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review`\n",
" WHERE extracted_driver_theme_json IS NOT NULL\n",
" AND extracted_driver_theme IS NOT NULL\n",
"GROUP BY 1;\n"
]
}
],
"metadata": {
"colab": {
"name": "BigQuery table",
"private_outputs": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}