colab-enterprise/rideshare_llm_step_04_driver_summary.ipynb (666 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "HhgOK3uTr6M-"
},
"source": [
"# Create Driver Summary (Preferences)\n",
"- This notebook take about 5 to 10 minutes to execute\n",
"- We will create 2 summaries\n",
" - The driver preferences based upon what themes they mention in their reviews\n",
" - A summary of all their reviews for a consolidated overall review"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4xV3PMBh8Jjd"
},
"source": [
"## Create Summary Prompt and run through LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eGtjlAFZr3pO"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- The table is populated by: CALL ${project_id}.${bigquery_rideshare_llm_enriched_dataset}.sp_step_02_extract_driver_attributes()\n",
"SELECT *\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute`\n",
"ORDER BY driver_id,\n",
" driver_attribute_grouping,\n",
" extracted_driver_attribute,\n",
" rank_order\n",
"LIMIT 100;"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aL5QLba1M9Ia"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- Distinct list of driver attributes\n",
"\n",
"SELECT DISTINCT extracted_driver_attribute\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YwWMtN19SAcF"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- Insert safe driver if they do not have any \"unsafe attributed\"\n",
"INSERT INTO `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute`\n",
"(driver_id, extracted_driver_attribute, driver_attribute_grouping, cnt, rank_order)\n",
"SELECT driver_id, 'safe driver', 'safe-driver', 1, 1\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" WHERE driver.include_in_llm_processing = TRUE\n",
" AND EXISTS (SELECT *\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute` AS driver_attribute\n",
" WHERE driver.driver_id = driver_attribute.driver_id)\n",
" AND NOT EXISTS (SELECT *\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute` AS driver_attribute\n",
" WHERE driver.driver_id = driver_attribute.driver_id\n",
" AND TRIM(extracted_driver_attribute) = 'distracted driver')\n",
" AND NOT EXISTS (SELECT *\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute` AS driver_attribute\n",
" WHERE driver.driver_id = driver_attribute.driver_id\n",
" AND TRIM(extracted_driver_attribute) = 'driving too fast')\n",
" AND NOT EXISTS (SELECT *\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute` AS child\n",
" WHERE driver.driver_id = child.driver_id\n",
" AND child.extracted_driver_attribute = 'safe driver')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xlwTohJKyGsS"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- Sample prompt aggregation of attributes\n",
"SELECT driver_id,\n",
" STRING_AGG(\n",
" CASE WHEN TRIM(extracted_driver_attribute) = 'driver likes music' THEN 'Attribute: the driver likes the radio on\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver likes no music' THEN 'Attribute: the driver likes the radio off\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'trunk space large' THEN 'Attribute: the driver has a large amount of trunk space\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'trunk space small' THEN 'Attribute: the driver has a small trunk\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'Category: trunk space small' THEN 'Attribute: the driver has a small trunk\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver likes conversation' THEN 'Attribute: the driver likes to have a conversation with the passengers\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver likes no conversation' THEN 'Attribute: the driver does not like to talk to the passengers\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driving too fast' THEN 'Attribute: the driver tends to drive too fast\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driving too slow' THEN 'Attribute: the driver drives too slow\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'clean car' THEN 'Attribute: the driver keeps their car clean on the inside and out\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'dirty car' THEN 'Attribute: the driver has a dirty car\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'car too hot' THEN 'Attribute: the driver has a keeps their car too warm inside\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'car too cold' THEN 'Attribute: the driver keeps their car too cold\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver speaks spanish' THEN 'Attribute: the driver is bilingual\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'distracted driver' THEN 'Attribute: the driver is a distracted driver\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'safe driver' THEN 'Attribute: the driver is a safe driver\\n'\n",
" ELSE ''\n",
" END,'') AS driver_attribute_agg\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute`\n",
"GROUP BY driver_id\n",
"LIMIT 100;"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jNzpvp1qWk6F"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- OPTIONAL: Reset all the fields to null\n",
"-- If you need to reset you data back to fresh data run the stored procedure: CALL `${project_id}.${bigquery_rideshare_llm_curated_dataset}.sp_reset_demo`();\n",
"\n",
"/*\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" SET driver_attribute_llm_summary_prompt = NULL,\n",
" llm_summary_driver_attribute_json = NULL,\n",
" llm_summary_driver_attribute = NULL\n",
" WHERE TRUE;\n",
"*/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sGmYo5mO1FIA"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- Create the LLM prompt\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" SET driver_attribute_llm_summary_prompt =\n",
" CONCAT('Write a 100 to 500 word summary for the following attributes.\\n',\n",
" '1. Randomly sort the attributes\\n',\n",
" '2. Select the first 3 to 5 attributes\\n',\n",
" '3. Write the summary in present tense for only the first 3 to 5 attributes\\n',\n",
" '4. The driver\\'s name is ', driver.driver_name, '\\n',\n",
" '5. Use sentences with varying lengths\\n',\n",
" '6. Write 2 to 5 sentences',\n",
" driver_attribute_agg),\n",
" llm_summary_driver_attribute_json = NULL,\n",
" llm_summary_driver_attribute = NULL\n",
" FROM\n",
" (SELECT driver_id,\n",
" STRING_AGG(\n",
" CASE WHEN TRIM(extracted_driver_attribute) = 'driver likes music' THEN 'Attribute: the driver likes the radio on\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver likes no music' THEN 'Attribute: the driver likes the radio off\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'trunk space large' THEN 'Attribute: the driver has a large amount of trunk space\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'trunk space small' THEN 'Attribute: the driver has a small trunk\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'Category: trunk space small' THEN 'Attribute: the driver has a small trunk\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver likes conversation' THEN 'Attribute: the driver likes to have a conversation with the passengers\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver likes no conversation' THEN 'Attribute: the driver does not like to talk to the passengers\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driving too fast' THEN 'Attribute: the driver tends to drive too fast\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driving too slow' THEN 'Attribute: the driver drives too slow\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'clean car' THEN 'Attribute: the driver keeps their car clean on the inside and out\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'dirty car' THEN 'Attribute: the driver has a dirty car\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'car too hot' THEN 'Attribute: the driver has a keeps their car too warm inside\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'car too cold' THEN 'Attribute: the driver keeps their car too cold\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'driver speaks spanish' THEN 'Attribute: the driver is bilingual\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'distracted driver' THEN 'Attribute: the driver is a distracted driver\\n'\n",
" WHEN TRIM(extracted_driver_attribute) = 'safe driver' THEN 'Attribute: the driver is a safe driver\\n'\n",
" ELSE ''\n",
" END,'') AS driver_attribute_agg\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute`\n",
" GROUP BY driver_id\n",
" ) AS driver_attribute\n",
"WHERE driver.driver_id = driver_attribute.driver_id;\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rfyk4FRh12do"
},
"outputs": [],
"source": [
"%%bigquery\n",
"SELECT driver_attribute_llm_summary_prompt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" WHERE driver_attribute_llm_summary_prompt IS NOT NULL\n",
" LIMIT 10;"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UrQsqrtj2x69"
},
"source": [
"## Run the LLM to generate a Driver Summary on Discovered Attributes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xEnr3nBp3y0Z"
},
"outputs": [],
"source": [
"from google.cloud import bigquery\n",
"import pandas as pd\n",
"\n",
"client = bigquery.Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B8uFxM_220li"
},
"outputs": [],
"source": [
"# Process in batches\n",
"batch_size = 100\n",
"\n",
"# Set the parameters so we are more deterministic and less creative/random responses\n",
"llm_temperature = 1\n",
"llm_max_output_tokens = 1024\n",
"llm_top_p = 1\n",
"llm_top_k = 40\n",
"\n",
"update_sql=\"\"\"\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" SET llm_summary_driver_attribute_json = child.ml_generate_text_result\n",
" FROM (SELECT *\n",
" FROM ML.GENERATE_TEXT(MODEL`${project_id}.${bigquery_rideshare_llm_enriched_dataset}.gemini_model`,\n",
" (SELECT driver_id,\n",
" driver_attribute_llm_summary_prompt AS prompt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver`\n",
" WHERE (llm_summary_driver_attribute_json IS NULL\n",
" OR\n",
" JSON_VALUE(llm_summary_driver_attribute_json, '$.candidates[0].content.parts[0].text') IS NULL\n",
" )\n",
" AND include_in_llm_processing = TRUE\n",
" AND driver_attribute_llm_summary_prompt IS NOT NULL\n",
" LIMIT {batch_size}),\n",
" STRUCT(\n",
" {llm_temperature} AS temperature,\n",
" {llm_max_output_tokens} AS max_output_tokens,\n",
" {llm_top_p} AS top_p,\n",
" {llm_top_k} AS top_k\n",
" ))\n",
" ) AS child\n",
"WHERE driver.driver_id = child.driver_id\n",
" \"\"\".format(batch_size = batch_size,\n",
" llm_temperature = llm_temperature,\n",
" llm_max_output_tokens = llm_max_output_tokens,\n",
" llm_top_p = llm_top_p,\n",
" llm_top_k = llm_top_k)\n",
"\n",
"print(\"SQL: {update_sql}\".format(update_sql=update_sql))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1gFA-Koc3nLx"
},
"outputs": [],
"source": [
"# Score while records remain\n",
"# score in groups of batch_size records (we can do up to 10,000 at a time)\n",
"import time\n",
"\n",
"done = False\n",
"displayed_first_sql = False\n",
"original_record_count = 0\n",
"\n",
"while done == False:\n",
" # Get the count of records to score\n",
" sql = \"\"\"\n",
" SELECT COUNT(*) AS cnt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver`\n",
" WHERE (llm_summary_driver_attribute_json IS NULL\n",
" OR\n",
" JSON_VALUE(llm_summary_driver_attribute_json, '$.candidates[0].content.parts[0].text') IS NULL\n",
" )\n",
" AND include_in_llm_processing = TRUE\n",
" AND driver_attribute_llm_summary_prompt IS NOT NULL;\n",
" \"\"\"\n",
"\n",
" df_record_count = client.query(sql).to_dataframe()\n",
" cnt = df_record_count['cnt'].head(1).item()\n",
" if displayed_first_sql == False:\n",
" original_record_count = cnt\n",
" displayed_first_sql = True\n",
"\n",
" print(\"Remaining records to process: \", cnt, \" out of\", original_record_count, \" batch_size: \", batch_size)\n",
"\n",
" if cnt == 0:\n",
" done = True\n",
" else:\n",
" # https://github.com/googleapis/python-bigquery/tree/master/samples\n",
" job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n",
" query_job = client.query(update_sql, job_config=job_config)\n",
"\n",
" # Check on the progress by getting the job's updated state.\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n",
"\n",
" while query_job.state != \"DONE\":\n",
" time.sleep(5)\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Lw4oA-qy8gzQ"
},
"source": [
"## Parse the LLM JSON results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nFM-JNqg39up"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" SET llm_summary_driver_attribute = JSON_VALUE(llm_summary_driver_attribute_json, '$.candidates[0].content.parts[0].text')\n",
" WHERE llm_summary_driver_attribute_json IS NOT NULL\n",
" AND llm_summary_driver_attribute IS NULL;"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_-YX3rPU39g5"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"SELECT driver_id, driver_attribute_llm_summary_prompt, llm_summary_driver_attribute\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver`\n",
" WHERE llm_summary_driver_attribute_json IS NOT NULL\n",
" AND llm_summary_driver_attribute IS NOT NULL\n",
"LIMIT 20;"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "H98uiO97jhPP"
},
"source": [
"# Create Driver Summary baesd upon Customer Reviews"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Cgu2Gx-vlJJv"
},
"source": [
"## Create Summary Prompt and run through LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "biJGzUwwVg9R"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- OPTIONAL: Reset all the fields to null\n",
"-- If you need to reset you data back to fresh data run the stored procedure: CALL `${project_id}.${bigquery_rideshare_llm_curated_dataset}.sp_reset_demo`();\n",
"\n",
"/*\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" SET driver_review_summary_llm_summary_prompt = NULL,\n",
" llm_summary_driver_review_summary_json = NULL,\n",
" llm_summary_driver_review_summary = NULL\n",
" WHERE TRUE;\n",
"*/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3YvdDQ2ojuRn"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"-- Create the LLM prompt (latest 50 customer reviews per driver)\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" SET driver_review_summary_llm_summary_prompt =\n",
" CONCAT('Write a 100 to 600 word summary for the following customer reviews.\\n',\n",
" '1. Write the summary in present tense.\\n',\n",
" '2. Write the summary from the customers prespective.\\n',\n",
" '3. Do not repeat the same subject in the summary.\\n',\n",
" '4. The reviews are for the driver ', driver_name, ' at a rideshare company.\\n',\n",
" '5. The reviews are written by different customers.\\n',\n",
" '6. Write 3 to 6 sentences.\\n',\n",
" customer_review_agg)\n",
" FROM (SELECT customer_review.driver_id,\n",
" STRING_AGG(CONCAT(\"Review: \",customer_review_text),'\\n') AS customer_review_agg\n",
" FROM (SELECT driver_id,\n",
" customer_review_text\n",
" FROM (SELECT trip.driver_id,\n",
" trip.pickup_time,\n",
" customer_review.customer_review_text,\n",
" ROW_NUMBER() OVER (PARTITION BY trip.driver_id ORDER BY trip.pickup_time) AS row_nbr\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review\n",
" INNER JOIN `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.trip` AS trip\n",
" ON customer_review.trip_id = trip.trip_id\n",
" ) AS reviews\n",
" WHERE row_nbr < 50 -- top 50 most recent\n",
" ) AS customer_review\n",
" GROUP BY driver_id) AS top_reviews\n",
"WHERE driver.driver_id = top_reviews.driver_id;\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "iTKx8_VFjuGv"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"SELECT driver_review_summary_llm_summary_prompt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver`\n",
"WHERE driver_review_summary_llm_summary_prompt IS NOT NULL\n",
"LIMIT 5;\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lMas79holRYH"
},
"source": [
"## Run the LLM to generate a driver summary of all customer reviews"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QtBFZxaDlWmp"
},
"outputs": [],
"source": [
"# Process in batches\n",
"batch_size = 10\n",
"\n",
"# Set the parameters for more creative\n",
"llm_temperature = 1\n",
"llm_max_output_tokens = 1024\n",
"llm_top_p = 1\n",
"llm_top_k = 40\n",
"\n",
"update_sql=\"\"\"\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver\n",
" SET llm_summary_driver_review_summary_json = child.ml_generate_text_result\n",
" FROM (SELECT *\n",
" FROM ML.GENERATE_TEXT(MODEL`${project_id}.${bigquery_rideshare_llm_enriched_dataset}.gemini_model`,\n",
" (SELECT driver_id,\n",
" driver_review_summary_llm_summary_prompt AS prompt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver`\n",
" WHERE (llm_summary_driver_review_summary_json IS NULL\n",
" OR\n",
" JSON_VALUE(llm_summary_driver_review_summary_json, '$.candidates[0].content.parts[0].text') IS NULL\n",
" )\n",
" AND include_in_llm_processing = TRUE\n",
" AND driver_review_summary_llm_summary_prompt IS NOT NULL\n",
" LIMIT {batch_size}),\n",
" STRUCT(\n",
" {llm_temperature} AS temperature,\n",
" {llm_max_output_tokens} AS max_output_tokens,\n",
" {llm_top_p} AS top_p,\n",
" {llm_top_k} AS top_k\n",
" ))\n",
" ) AS child\n",
"WHERE driver.driver_id = child.driver_id\n",
" \"\"\".format(batch_size = batch_size,\n",
" llm_temperature = llm_temperature,\n",
" llm_max_output_tokens = llm_max_output_tokens,\n",
" llm_top_p = llm_top_p,\n",
" llm_top_k = llm_top_k)\n",
"\n",
"print(\"SQL: {update_sql}\".format(update_sql=update_sql))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B7ajlaR1lXCw"
},
"outputs": [],
"source": [
"# Score while records remain\n",
"# score in groups of batch_size records (we can do up to 10,000 at a time)\n",
"import time\n",
"\n",
"done = False\n",
"displayed_first_sql = False\n",
"original_record_count = 0\n",
"\n",
"while done == False:\n",
" # Get the count of records to score\n",
" sql = \"\"\"\n",
" SELECT COUNT(*) AS cnt\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver`\n",
" WHERE (llm_summary_driver_review_summary_json IS NULL\n",
" OR\n",
" JSON_VALUE(llm_summary_driver_review_summary_json, '$.candidates[0].content.parts[0].text') IS NULL\n",
" )\n",
" AND include_in_llm_processing = TRUE\n",
" AND driver_review_summary_llm_summary_prompt IS NOT NULL;\n",
" \"\"\"\n",
"\n",
" df_record_count = client.query(sql).to_dataframe()\n",
" cnt = df_record_count['cnt'].head(1).item()\n",
" if displayed_first_sql == False:\n",
" original_record_count = cnt\n",
" displayed_first_sql = True\n",
"\n",
" print(\"Remaining records to process: \", cnt, \" out of\", original_record_count, \" batch_size: \", batch_size)\n",
"\n",
"\n",
" if cnt == 0:\n",
" done = True\n",
" else:\n",
" # https://github.com/googleapis/python-bigquery/tree/master/samples\n",
" job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)\n",
" query_job = client.query(update_sql, job_config=job_config)\n",
"\n",
" # Check on the progress by getting the job's updated state.\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n",
"\n",
" while query_job.state != \"DONE\":\n",
" time.sleep(5)\n",
" query_job = client.get_job(\n",
" query_job.job_id, location=query_job.location\n",
" )\n",
" print(\"Job {} is currently in state {}\".format(query_job.job_id, query_job.state))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "VI9lWCDklf3_"
},
"source": [
"## Parse the LLM JSON results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6N_CCkPNlduv"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` driver\n",
" SET llm_summary_driver_review_summary = JSON_VALUE(llm_summary_driver_review_summary_json, '$.candidates[0].content.parts[0].text')\n",
" WHERE llm_summary_driver_review_summary_json IS NOT NULL\n",
" AND llm_summary_driver_review_summary IS NULL;"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2ULcMnEelW4f"
},
"outputs": [],
"source": [
"%%bigquery\n",
"\n",
"SELECT driver_id, driver_review_summary_llm_summary_prompt, llm_summary_driver_review_summary_json, llm_summary_driver_review_summary\n",
" FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver`\n",
" WHERE llm_summary_driver_review_summary_json IS NOT NULL\n",
" AND llm_summary_driver_review_summary IS NOT NULL\n",
"LIMIT 20;\n"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [
"4xV3PMBh8Jjd",
"UrQsqrtj2x69",
"Lw4oA-qy8gzQ",
"Cgu2Gx-vlJJv"
],
"name": "BigQuery table",
"private_outputs": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}