sql-scripts/rideshare_llm_curated/sp_step_00_initialize.sql (393 lines of code) (raw):
/*##################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
Use Cases:
-
Description:
- Initialize the Curated zone of the AI Lakehouse
Reference:
-
Clean up / Reset script:
- n/a
*/
------------------------------------------------------------------------------------------------------------
-- Create link to the LLM
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE MODEL `${project_id}.${bigquery_rideshare_llm_curated_dataset}.gemini_model`
REMOTE WITH CONNECTION `${project_id}.us.vertex-ai`
OPTIONS (endpoint = 'gemini-2.0-flash');
------------------------------------------------------------------------------------------------------------
-- Create link to the LLM (embeddings)
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE MODEL `${project_id}.${bigquery_rideshare_llm_curated_dataset}.llm_embedding_model`
REMOTE WITH CONNECTION `${project_id}.us.vertex-ai`
OPTIONS(endpoint = 'text-embedding-005');
------------------------------------------------------------------------------------------------------------
-- Location Table
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.location`
CLUSTER BY location_id
OPTIONS (description='This is the zone table that contains information about each pickup location or dropoff location.')
AS
SELECT *
FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.location`;
-- 5 Operations per 10 seconds rate limit: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#alter_schema_set_options_statement
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.location`
ALTER COLUMN location_id SET OPTIONS (description='Pickup Location Id or Dropoff Location Id - Primary Key'),
ALTER COLUMN borough SET OPTIONS (description='Borough'),
ALTER COLUMN zone SET OPTIONS (description='Taxi Zone or Neighborhood'),
ALTER COLUMN service_zone SET OPTIONS (description='Service Zone or Service Area'),
ALTER COLUMN latitude SET OPTIONS (description='Latitude Geolocation'),
ALTER COLUMN longitude SET OPTIONS (description='Longitude Geolocation');
------------------------------------------------------------------------------------------------------------
-- Payment Type
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.payment_type`
CLUSTER BY payment_type_id
OPTIONS (description='This is the payment type table that contains the different payment type decriptions.')
AS
SELECT *
FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.payment_type`;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.payment_type`
ALTER COLUMN payment_type_id SET OPTIONS (description='Payment Type Id - Primary Key'),
ALTER COLUMN payment_type_description SET OPTIONS (description='Payment Type Description - How the customer paid for the rideshare trip.');
------------------------------------------------------------------------------------------------------------
-- Trip table
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.trip`
CLUSTER BY trip_id, driver_id, customer_id
OPTIONS (description='This is the trip table that hold every rideshare trip. It contains the trip, driver and customer intersection of data.')
AS
SELECT *
FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.trip`;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.trip`
ALTER COLUMN trip_id SET OPTIONS (description='Trip Id - Primary Key'),
ALTER COLUMN driver_id SET OPTIONS (description='Driver Id - Foreign Key to Driver Table'),
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Foreign Key to Customer Table'),
ALTER COLUMN pickup_time SET OPTIONS (description='Pickup Time - The time the customer was picked up by the driver.'),
ALTER COLUMN dropoff_time SET OPTIONS (description='Dropoff Time - The time the customer was dropped off by the driver.'),
ALTER COLUMN pickup_location_id SET OPTIONS (description='Pickup Location Id - Foreign Key to the Location table. The name of the place where the customer was picked up by the driver for their trip.'),
ALTER COLUMN dropoff_location_id SET OPTIONS (description='Dropoff Location Id - Foreign Key to the Location table. The name of the place where the customer was dropped off by the driver for their trip..'),
ALTER COLUMN payment_type_id SET OPTIONS (description='Payment Type Id - Foreign Key to the Payment Type Table.'),
ALTER COLUMN passenger_count SET OPTIONS (description='Passenger Count - The number of passengers in the car during the trip.'),
ALTER COLUMN trip_distance SET OPTIONS (description='Trip Distance - The distance of the trip in miles.'),
ALTER COLUMN fare_amount SET OPTIONS (description='Fare Amount - The amount charged for the trip excluding tip amount.'),
ALTER COLUMN tip_amount SET OPTIONS (description='Tip Amount - The amount the customer tipped the driver.'),
ALTER COLUMN total_amount SET OPTIONS (description='Total Amount - The total cost of the trip which is the fare amount added with the tip amount.');
------------------------------------------------------------------------------------------------------------
-- Customer
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer`
CLUSTER BY customer_id
OPTIONS (description='This is the customer table that holds each customers name, their LLM review summaries and their quantitative data.')
AS
SELECT customer.customer_id,
customer.customer_name,
customer.customer_since_date,
customer.include_in_llm_processing,
customer.llm_summary_customer_attribute AS customer_attribute_summary,
customer.llm_summary_customer_review_summary AS customer_review_summary,
customer.llm_customer_quantitative_analysis AS customer_quantitative_analysis,
COUNT(*) AS total_trip_count,
AVG(passenger_count) AS avg_passenger_count,
AVG(trip_distance) AS avg_trip_distance,
AVG(fare_amount) AS avg_fare_amount,
AVG(tip_amount) AS avg_tip_amount,
AVG(total_amount) AS avg_total_amount,
SUM(total_amount) AS sum_total_amount
FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer` AS customer
INNER JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.trip` AS trip
ON customer.customer_id = trip.customer_id
GROUP BY 1,2,3,4,5,6,7;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer`
--Quantitive analysys embeddings
ADD COLUMN IF NOT EXISTS llm_customer_quantitative_analysis_embedding ARRAY<FLOAT64>,
--List of prefered drivers based on semantic matching
ADD COLUMN IF NOT EXISTS prefered_drivers ARRAY<INT64>;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer`
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Primary Key'),
ALTER COLUMN customer_name SET OPTIONS (description='Customer Name - The name of the customer.'),
ALTER COLUMN customer_since_date SET OPTIONS (description='Customer Since Date - The first time the customer used the rideshare service. Their inception date.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN customer_attribute_summary SET OPTIONS (description='Customer Attribute Summary - The LLM summary of the customer preferences.'),
ALTER COLUMN customer_review_summary SET OPTIONS (description='Customer Review Summary - The LLM summary of all the customer reviews. This summary spans drivers.'),
ALTER COLUMN customer_quantitative_analysis SET OPTIONS (description='Customer Quantitative Analysis - The LLM summary of the quantitative data analysis.'),
ALTER COLUMN total_trip_count SET OPTIONS (description='Total Trip Count - The number of trips taken by this customer.'),
ALTER COLUMN avg_passenger_count SET OPTIONS (description='Avg Passenger Count - The average number of passengers for all the customer trips.'),
ALTER COLUMN avg_trip_distance SET OPTIONS (description='Avg Trip Distance Count - The average trip distance for all the customer trips.'),
ALTER COLUMN avg_fare_amount SET OPTIONS (description='Avg Fare Amount - The average fare amount for all the customer trips.'),
ALTER COLUMN avg_tip_amount SET OPTIONS (description='Avg Tip Amount - The average tip amount for all the customer trips.'),
ALTER COLUMN avg_total_amount SET OPTIONS (description='Avg Total Amount - The average total amount for all the customer trips.'),
ALTER COLUMN sum_total_amount SET OPTIONS (description='Sum Total Amount - The total amount the customer has spent using our service.'),
ALTER COLUMN llm_customer_quantitative_analysis_embedding SET OPTIONS (description='Embeddings of the customer quantitative analysis.'),
ALTER COLUMN prefered_drivers SET OPTIONS (description='List of prefered drivers based on semantic matching.');
------------------------------------------------------------------------------------------------------------
-- Driver.');
------------------------------------------------------------------------------------------------------------
-- Customer Reviews
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review`
CLUSTER BY customer_id
OPTIONS (description='This is the customer review table that holds each customer review they have provided for their trips.')
AS
SELECT customer.customer_id,
customer.customer_name,
customer.include_in_llm_processing AS customer_include_in_llm_processing,
customer_review.trip_id,
customer_review.review_date,
driver.driver_id,
driver.driver_name,
driver.include_in_llm_processing AS driver_include_in_llm_processing,
customer_review.customer_review_text,
customer_review.review_sentiment,
customer_review.extracted_driver_theme,
customer_review.extracted_customer_theme
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer` AS customer
INNER JOIN `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review
ON customer.customer_id = customer_review.customer_id
INNER JOIN `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver
ON customer_review.driver_id = driver.driver_id;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review`
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Composite Primary Key (along with Trip Id). Foreign Key to the Customer table.'),
ALTER COLUMN customer_name SET OPTIONS (description='Customer Name - The name of the customer.'),
ALTER COLUMN customer_include_in_llm_processing SET OPTIONS (description='Customer Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN trip_id SET OPTIONS (description='Trip Id - Composite Primary Key (along with Customer Id). Foreign Key to the Trip table.'),
ALTER COLUMN review_date SET OPTIONS (description='Review Data - The date the customer wrote their review of the trip.'),
ALTER COLUMN driver_id SET OPTIONS (description='Driver Id - The Foreign Key to the driver table. This is the driver the review is being created by the customer.'),
ALTER COLUMN driver_name SET OPTIONS (description='Driver Name - The name of the driver.'),
ALTER COLUMN driver_include_in_llm_processing SET OPTIONS (description='Driver Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN customer_review_text SET OPTIONS (description='Customer Review Text - The review text written by the customer for the trip/driver.'),
ALTER COLUMN review_sentiment SET OPTIONS (description='Review Sentiment - The sentiment of the review text which is: Positive, Neutral or Negative.'),
ALTER COLUMN extracted_driver_theme SET OPTIONS (description='Extract Driver Theme - The attributes the LLM extraced about the driver per the customer review text.'),
ALTER COLUMN extracted_customer_theme SET OPTIONS (description='Extract Customer Theme - The attributes the LLM extraced about the customer per the customer review text.');
------------------------------------------------------------------------------------------------------------
-- Customer Reviews (Summary)
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE MATERIALIZED VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review_summary`
CLUSTER BY customer_id
OPTIONS (enable_refresh = true, refresh_interval_minutes = 30, description='This is the customer review summary that counts the sentiment for each customer review.')
AS
SELECT customer.customer_id,
customer.customer_name,
customer.include_in_llm_processing,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%positive%'
OR
LOWER(customer_review.review_sentiment) LIKE '%neutral%'
OR
LOWER(customer_review.review_sentiment) LIKE '%negative%') AS total_review_count,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%positive%') AS total_review_count_postive,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%neutral%') AS total_review_count_neutral,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%negative%') AS total_review_count_negative,
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer` AS customer
INNER JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review` AS customer_review
ON customer.customer_id = customer_review.customer_id
GROUP BY 1, 2, 3;
/* - Not available for Materialized Views
ALTER MATERIALIZED VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review_summary`
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Primary Key'),
ALTER COLUMN customer_name SET OPTIONS (description='Customer Name - The name of the customer'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN total_review_count SET OPTIONS (description='Total Review Count - The total number of reviews created by this customer'),
ALTER COLUMN total_review_count_postive SET OPTIONS (description='Total Review Count Positive - The number of reviews with a positive sentiment analysis.'),
ALTER COLUMN total_review_count_neutral SET OPTIONS (description='Total Review Count Neutral - The number of reviews with a neutral sentiment analysis.'),
ALTER COLUMN total_review_count_negative SET OPTIONS (description='Total Review Count Negative - The number of reviews with a negative sentiment analysis.');
*/
------------------------------------------------------------------------------------------------------------
-- Customer Table (Preferences)
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_preference`
CLUSTER BY customer_id
OPTIONS (description='This is the customer preference table that holds the distinct list of customer preferences extracted by the LLM.')
AS
SELECT DISTINCT
customer.customer_id,
customer.customer_name,
customer.include_in_llm_processing,
CASE WHEN TRIM(LOWER(extracted_customer_attribute)) = 'speaks spanish' THEN 'Bilingual'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'prefers english' THEN 'Speaks English'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'likes a clean car' THEN 'Likes a clean car'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'prefers the radio on' THEN 'Likes the radio on'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'prefers the radio off' THEN 'Likes the radio off'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'travels with large luggage' THEN 'Likes a large amount of trunck space'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'travels with small luggage' THEN 'Typically travels without luggage'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'likes a warm vehicle inside' THEN 'Likes the car to be warm inside'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'likes a cooler vehicle inside' THEN 'Likes the car to be cool inside'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'likes their driver to drive fast' THEN 'Likes a driver that drives quickly'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'likes their driver to drive slow' THEN 'Likes a driver that drives slowly'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'likes to have a conversation' THEN 'Likes a driver who likes conversation'
WHEN TRIM(LOWER(extracted_customer_attribute)) = 'perfers a driver that does not talk' THEN 'Likes a quiet driver'
ELSE 'Other'
END AS preference
FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_attribute` AS customer_attribute
INNER JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer` AS customer
ON customer_attribute.customer_id = customer.customer_id
AND customer_attribute.rank_order = 1;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_preference`
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Foreign key to the customer table.'),
ALTER COLUMN customer_name SET OPTIONS (description='Customer Name - The name of the customer.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN preference SET OPTIONS (description='Preference - The preference detected by the LLM for the customer.');
------------------------------------------------------------------------------------------------------------
-- Looker "Customer" View to keep Looker Simple
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_customer`
OPTIONS (description='This is the reporting table used by looker for customer information.')
AS
SELECT customer.customer_id,
customer.customer_name,
customer.customer_since_date,
customer.include_in_llm_processing,
customer.customer_attribute_summary,
customer.customer_review_summary,
customer.customer_quantitative_analysis,
customer.total_trip_count,
customer.avg_passenger_count,
customer.avg_trip_distance,
customer.avg_fare_amount,
customer.avg_tip_amount,
customer.avg_total_amount,
customer.sum_total_amount,
customer_review_summary.total_review_count,
customer_review_summary.total_review_count_postive,
customer_review_summary.total_review_count_neutral,
customer_review_summary.total_review_count_negative,
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer` AS customer
LEFT JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review_summary` AS customer_review_summary
ON customer.customer_id = customer_review_summary.customer_id;
ALTER VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_customer`
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Primary Key'),
ALTER COLUMN customer_name SET OPTIONS (description='Customer Name - The name of the customer.'),
ALTER COLUMN customer_since_date SET OPTIONS (description='Customer Since Date - The first time the customer used the rideshare service. Their inception date.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN customer_attribute_summary SET OPTIONS (description='Customer Attribute Summary - The LLM summary of the customer preferences.'),
ALTER COLUMN customer_review_summary SET OPTIONS (description='Customer Review Summary - The LLM summary of all the customer reviews. This summary spans drivers.'),
ALTER COLUMN customer_quantitative_analysis SET OPTIONS (description='Customer Quantitative Analysis - The LLM summary of the quantitative data analysis.'),
ALTER COLUMN total_trip_count SET OPTIONS (description='Total Trip Count - The number of trips taken by this customer.'),
ALTER COLUMN avg_passenger_count SET OPTIONS (description='Avg Passenger Count - The average number of passengers for all the customer trips.'),
ALTER COLUMN avg_trip_distance SET OPTIONS (description='Avg Trip Distance Count - The average trip distance for all the customer trips.'),
ALTER COLUMN avg_fare_amount SET OPTIONS (description='Avg Fare Amount - The average fare amount for all the customer trips.'),
ALTER COLUMN avg_tip_amount SET OPTIONS (description='Avg Tip Amount - The average tip amount for all the customer trips.'),
ALTER COLUMN avg_total_amount SET OPTIONS (description='Avg Total Amount - The average total amount for all the customer trips.'),
ALTER COLUMN sum_total_amount SET OPTIONS (description='Sum Total Amount - The total amount the customer has spent using our service.'),
ALTER COLUMN total_review_count SET OPTIONS (description='Total Review Count - The total number of reviews created by this customer'),
ALTER COLUMN total_review_count_postive SET OPTIONS (description='Total Review Count Positive - The number of reviews with a positive sentiment analysis.'),
ALTER COLUMN total_review_count_neutral SET OPTIONS (description='Total Review Count Neutral - The number of reviews with a neutral sentiment analysis.'),
ALTER COLUMN total_review_count_negative SET OPTIONS (description='Total Review Count Negative - The number of reviews with a negative sentiment analysis.');
------------------------------------------------------------------------------------------------------------
-- Looker "Customer Preferences" View to keep Looker Simple
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_customer_preference`
OPTIONS (description='This is the reporting table used by looker for customer preferences.')
AS
SELECT *
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_preference`;
ALTER VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_customer_preference`
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Foreign key to the customer table.'),
ALTER COLUMN customer_name SET OPTIONS (description='Customer Name - The name of the customer.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN preference SET OPTIONS (description='Preference - The preference detected by the LLM for the customer.');
------------------------------------------------------------------------------------------------------------
-- Looker "Customer Reviews" View to keep Looker Simple
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_customer_review`
OPTIONS (description='This is the reporting table used by looker for customer reviews.')
AS
SELECT *
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review`;
ALTER VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_customer_review`
ALTER COLUMN customer_id SET OPTIONS (description='Customer Id - Composite Primary Key (along with Trip Id). Foreign Key to the Customer table.'),
ALTER COLUMN customer_name SET OPTIONS (description='Customer Name - The name of the customer.'),
ALTER COLUMN customer_include_in_llm_processing SET OPTIONS (description='Customer Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN trip_id SET OPTIONS (description='Trip Id - Composite Primary Key (along with Customer Id). Foreign Key to the Trip table.'),
ALTER COLUMN review_date SET OPTIONS (description='Review Data - The date the customer wrote their review of the trip.'),
ALTER COLUMN driver_id SET OPTIONS (description='Driver Id - The Foreign Key to the driver table. This is the driver the review is being created by the customer.'),
ALTER COLUMN driver_name SET OPTIONS (description='Driver Name - The name of the driver.'),
ALTER COLUMN driver_include_in_llm_processing SET OPTIONS (description='Driver Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN customer_review_text SET OPTIONS (description='Customer Review Text - The review text written by the customer for the trip/driver.'),
ALTER COLUMN review_sentiment SET OPTIONS (description='Review Sentiment - The sentiment of the review text which is: Positive, Neutral or Negative.'),
ALTER COLUMN extracted_driver_theme SET OPTIONS (description='Extract Driver Theme - The attributes the LLM extraced about the driver per the customer review text.'),
ALTER COLUMN extracted_customer_theme SET OPTIONS (description='Extract Customer Theme - The attributes the LLM extraced about the customer per the customer review text.');
------------------------------------------------------------------------------------------------------------
-- Driver Table w/Averages
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver`
CLUSTER BY driver_id
OPTIONS (description='This is driver table that holds information about each driver. It constains the LLM summaries and the quanitative analysis.')
AS
SELECT driver.driver_id,
driver.driver_name,
driver.driver_since_date,
driver.include_in_llm_processing,
driver.llm_summary_driver_attribute AS driver_attribute_summary,
driver.llm_summary_driver_review_summary AS driver_review_summary,
driver.llm_driver_quantitative_analysis AS driver_quantitative_analysis,
COUNT(*) AS total_trip_count,
AVG(passenger_count) AS avg_passenger_count,
AVG(trip_distance) AS avg_trip_distance,
AVG(fare_amount) AS avg_fare_amount,
AVG(tip_amount) AS avg_tip_amount,
AVG(total_amount) AS avg_total_amount,
SUM(total_amount) AS sum_total_amount
FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver
INNER JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.trip` AS trip
ON driver.driver_id = trip.driver_id
GROUP BY 1,2,3,4,5,6,7;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver`
ADD COLUMN IF NOT EXISTS llm_driver_quantitative_analysis_embedding ARRAY<FLOAT64>;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver`
ALTER COLUMN driver_id SET OPTIONS (description='Driver Id - Primary Key'),
ALTER COLUMN driver_name SET OPTIONS (description='Driver Name - The name of the driver.'),
ALTER COLUMN driver_since_date SET OPTIONS (description='Driver Since Date - The first time the driver made a trip. The inception date.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN driver_attribute_summary SET OPTIONS (description='Driver Attribute Summary - The LLM summary of the driver preferences.'),
ALTER COLUMN driver_review_summary SET OPTIONS (description='Driver Review Summary - The LLM summary of all the customer reviews. This summary spans customers.'),
ALTER COLUMN driver_quantitative_analysis SET OPTIONS (description='Driver Quantitative Analysis - The LLM summary of the quantitative data analysis.'),
ALTER COLUMN total_trip_count SET OPTIONS (description='Total Trip Count - The number of trips taken by this customer.'),
ALTER COLUMN avg_passenger_count SET OPTIONS (description='Avg Passenger Count - The average number of passengers for all the customer trips.'),
ALTER COLUMN avg_trip_distance SET OPTIONS (description='Avg Trip Distance Count - The average trip distance for all the customer trips.'),
ALTER COLUMN avg_fare_amount SET OPTIONS (description='Avg Fare Amount - The average fare amount for all the customer trips.'),
ALTER COLUMN avg_tip_amount SET OPTIONS (description='Avg Tip Amount - The average tip amount for all the customer trips.'),
ALTER COLUMN avg_total_amount SET OPTIONS (description='Avg Total Amount - The average total amount for all the customer trips.'),
ALTER COLUMN sum_total_amount SET OPTIONS (description='Sum Total Amount - The total amount the customer has spent using our service.'),
ALTER COLUMN llm_driver_quantitative_analysis_embedding SET OPTIONS (description='Embeddings of the driver quantitative analysis.');
------------------------------------------------------------------------------------------------------------
-- Driver Attributes (some data cleaning from LLM)'),
------------------------------------------------------------------------------------------------------------
-- Driver Reviews (some data cleaning from LLM)
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE MATERIALIZED VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver_review_summary`
CLUSTER BY driver_id
OPTIONS (enable_refresh = true, refresh_interval_minutes = 30, description='This is the driver review summary table that holds the counts of driver sentiments based on the customer reviews of the driver.')
AS
SELECT driver.driver_id,
driver.driver_name,
driver.include_in_llm_processing,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%positive%'
OR
LOWER(customer_review.review_sentiment) LIKE '%neutral%'
OR
LOWER(customer_review.review_sentiment) LIKE '%negative%') AS total_review_count,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%positive%') AS total_review_count_postive,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%neutral%') AS total_review_count_neutral,
COUNTIF(LOWER(customer_review.review_sentiment) LIKE '%negative%') AS total_review_count_negative,
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver` AS driver
INNER JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.customer_review` AS customer_review
ON driver.driver_id = customer_review.driver_id
GROUP BY 1, 2, 3;
/* - Not available for Materialized Views
ALTER MATERIALIZED VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver_review_summary`
ALTER COLUMN TTT SET OPTIONS (description='TTT'),
ALTER COLUMN TTT SET OPTIONS (description='TTT');
*/
------------------------------------------------------------------------------------------------------------
-- Driver Table (Preferences)
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver_preference`
CLUSTER BY driver_id
OPTIONS (description='This is the driver preference table which holds the preferences of each driver extracted by the LLM.')
AS
SELECT DISTINCT
driver.driver_id,
driver.driver_name,
driver.include_in_llm_processing,
CASE WHEN TRIM(LOWER(extracted_driver_attribute)) = 'safe driver' THEN 'Safe Driver'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driver likes music' THEN 'Radio On'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driver likes no music' THEN 'Radio Off'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'trunk space large' THEN 'Large amount of truck space'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'trunk space small' THEN 'Limited amount of truck space'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driver likes conversation' THEN 'Likes conversation with customer(s)'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driver likes no conversation' THEN 'Likes prefers not talking with customer(s)'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driving too fast' THEN 'Drives fast'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driving too slow' THEN 'Drives slow'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'clean car' THEN 'Keeps a clean car'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'dirty car' THEN 'Keeps a ditry car'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'car too hot' THEN 'Keeps car temperature warm'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'car too cold' THEN 'Keeps car temperature cool'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driver speaks spanish' THEN 'Bilingual'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'driver does not speak spanish' THEN 'Speaks English'
WHEN TRIM(LOWER(extracted_driver_attribute)) = 'distracted driver' THEN 'Unsafe Distracted Driver'
ELSE 'Other'
END AS preference
FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_attribute` AS driver_attribute
INNER JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver` AS driver
ON driver_attribute.driver_id = driver.driver_id
AND driver_attribute.rank_order = 1;
ALTER TABLE `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver_preference`
ALTER COLUMN driver_id SET OPTIONS (description='Driver Id - Foreign key to the driver table.'),
ALTER COLUMN driver_name SET OPTIONS (description='Driver Name - The name of the driver.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN preference SET OPTIONS (description='Preference - The preference detected by the LLM for the driver.');
------------------------------------------------------------------------------------------------------------
-- Looker "Driver" View to keep Looker Simple
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_driver`
OPTIONS (description='This is the reporting table used by looker for drivers.')
AS
SELECT driver.driver_id,
driver.driver_name,
driver.driver_since_date,
driver.include_in_llm_processing,
driver.driver_attribute_summary,
driver.driver_review_summary,
driver.driver_quantitative_analysis,
driver.total_trip_count,
driver.avg_passenger_count,
driver.avg_trip_distance,
driver.avg_fare_amount,
driver.avg_tip_amount,
driver.avg_total_amount,
driver.sum_total_amount,
driver_review_summary.total_review_count,
driver_review_summary.total_review_count_postive,
driver_review_summary.total_review_count_neutral,
driver_review_summary.total_review_count_negative,
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver` AS driver
LEFT JOIN `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver_review_summary` AS driver_review_summary
ON driver.driver_id = driver_review_summary.driver_id;
ALTER VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_driver`
ALTER COLUMN driver_id SET OPTIONS (description='Driver Id - Primary Key'),
ALTER COLUMN driver_name SET OPTIONS (description='Driver Name - The name of the driver.'),
ALTER COLUMN driver_since_date SET OPTIONS (description='Driver Since Date - The first time the driver made a trip. The inception date.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN driver_attribute_summary SET OPTIONS (description='Driver Attribute Summary - The LLM summary of the driver preferences.'),
ALTER COLUMN driver_review_summary SET OPTIONS (description='Driver Review Summary - The LLM summary of all the customer reviews. This summary spans customers.'),
ALTER COLUMN driver_quantitative_analysis SET OPTIONS (description='Driver Quantitative Analysis - The LLM summary of the quantitative data analysis.'),
ALTER COLUMN total_trip_count SET OPTIONS (description='Total Trip Count - The number of trips taken by this customer.'),
ALTER COLUMN avg_passenger_count SET OPTIONS (description='Avg Passenger Count - The average number of passengers for all the customer trips.'),
ALTER COLUMN avg_trip_distance SET OPTIONS (description='Avg Trip Distance Count - The average trip distance for all the customer trips.'),
ALTER COLUMN avg_fare_amount SET OPTIONS (description='Avg Fare Amount - The average fare amount for all the customer trips.'),
ALTER COLUMN avg_tip_amount SET OPTIONS (description='Avg Tip Amount - The average tip amount for all the customer trips.'),
ALTER COLUMN avg_total_amount SET OPTIONS (description='Avg Total Amount - The average total amount for all the customer trips.'),
ALTER COLUMN sum_total_amount SET OPTIONS (description='Sum Total Amount - The total amount the customer has spent using our service.'),
ALTER COLUMN total_review_count SET OPTIONS (description='Total Review Count - The total number of reviews for this driver'),
ALTER COLUMN total_review_count_postive SET OPTIONS (description='Total Review Count Positive - The number of reviews, for this driver, with a positive sentiment analysis.'),
ALTER COLUMN total_review_count_neutral SET OPTIONS (description='Total Review Count Neutral - The number of reviews, for this driver, with a neutral sentiment analysis.'),
ALTER COLUMN total_review_count_negative SET OPTIONS (description='Total Review Count Negative - The number of reviews, for this driver, with a negative sentiment analysis.');
------------------------------------------------------------------------------------------------------------
-- Looker "Driver Preferences" View to keep Looker Simple
------------------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_driver_preference`
OPTIONS (description='This is the reporting table used by looker for driver preferences.')
AS
SELECT *
FROM `${project_id}.${bigquery_rideshare_llm_curated_dataset}.driver_preference`;
ALTER VIEW `${project_id}.${bigquery_rideshare_llm_curated_dataset}.looker_driver_preference`
ALTER COLUMN driver_id SET OPTIONS (description='Driver Id - Foreign key to the driver table.'),
ALTER COLUMN driver_name SET OPTIONS (description='Driver Name - The name of the driver.'),
ALTER COLUMN include_in_llm_processing SET OPTIONS (description='Include in LLM Processing - For the demo we only process a subset of all the data.'),
ALTER COLUMN preference SET OPTIONS (description='Preference - The preference detected by the LLM for the driver.');