sql-scripts/rideshare_llm_enriched/sp_step_00_initialize.sql (54 lines of code) (raw):

/*################################################################################## # Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################################*/ /* Use Cases: - Description: - Initialize the "enriched" zone of the AI Lakehouse Reference: - Clean up / Reset script: - n/a */ ------------------------------------------------------------------------------------------------------------ -- Create link to the LLM ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE MODEL `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.gemini_model` REMOTE WITH CONNECTION `${project_id}.us.vertex-ai` OPTIONS (endpoint = 'gemini-2.0-flash'); ------------------------------------------------------------------------------------------------------------ -- Create link to the STT model ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE MODEL `${project_id}.${bigquery_rideshare_llm_raw_dataset}.cloud_ai_stt_v2` REMOTE WITH CONNECTION `${project_id}.us.biglake-connection` OPTIONS ( REMOTE_SERVICE_TYPE = 'CLOUD_AI_SPEECH_TO_TEXT_V2' ); ------------------------------------------------------------------------------------------------------------ -- Location Table ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.location` CLUSTER BY location_id AS SELECT * FROM `${project_id}.${bigquery_rideshare_llm_raw_dataset}.location`; ------------------------------------------------------------------------------------------------------------ -- Payment Type ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.payment_type` CLUSTER BY payment_type_id AS SELECT * FROM `${project_id}.${bigquery_rideshare_llm_raw_dataset}.payment_type`; ------------------------------------------------------------------------------------------------------------ -- Trip table ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.trip` CLUSTER BY trip_id AS SELECT * FROM `${project_id}.${bigquery_rideshare_llm_raw_dataset}.trip`; ------------------------------------------------------------------------------------------------------------ -- Customer ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer` CLUSTER BY customer_id AS SELECT * FROM `${project_id}.${bigquery_rideshare_llm_raw_dataset}.customer`; ------------------------------------------------------------------------------------------------------------ -- Driver ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` CLUSTER BY driver_id AS SELECT * FROM `${project_id}.${bigquery_rideshare_llm_raw_dataset}.driver`; ------------------------------------------------------------------------------------------------------------ -- Customer Reviews ------------------------------------------------------------------------------------------------------------ CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` CLUSTER BY customer_id, trip_id, driver_id AS SELECT * FROM `${project_id}.${bigquery_rideshare_llm_raw_dataset}.customer_review`; ------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------ -- Bring in Pre-Scored LLM data -- This way the demo is pre-seeded with data and you do not need to process ALL the data yourself ------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------ -- Driver LLM fields (Add fields) ------------------------------------------------------------------------------------------------------------ ALTER TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` ADD COLUMN IF NOT EXISTS driver_attribute_llm_summary_prompt STRING, ADD COLUMN IF NOT EXISTS llm_summary_driver_attribute_json JSON, ADD COLUMN IF NOT EXISTS llm_summary_driver_attribute STRING, ADD COLUMN IF NOT EXISTS driver_review_summary_llm_summary_prompt STRING, ADD COLUMN IF NOT EXISTS llm_summary_driver_review_summary_json JSON, ADD COLUMN IF NOT EXISTS llm_summary_driver_review_summary STRING, ADD COLUMN IF NOT EXISTS driver_quantitative_analysis_prompt STRING, ADD COLUMN IF NOT EXISTS llm_driver_quantitative_analysis_json JSON, ADD COLUMN IF NOT EXISTS llm_driver_quantitative_analysis STRING ; -- Driver LLM fields (Load the data) LOAD DATA OVERWRITE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.sidecar_driver` CLUSTER BY driver_id FROM FILES ( format = 'PARQUET', uris = ['gs://${gcs_rideshare_lakehouse_raw_bucket}/rideshare_llm_export/enriched_zone/sidecar_driver/*.parquet'] ); -- Driver LLM fields (Update the table) UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver` AS driver SET driver_attribute_llm_summary_prompt = sidecar_driver.driver_attribute_llm_summary_prompt, llm_summary_driver_attribute_json = TO_JSON(sidecar_driver.llm_summary_driver_attribute_json), llm_summary_driver_attribute = sidecar_driver.llm_summary_driver_attribute, driver_review_summary_llm_summary_prompt = sidecar_driver.driver_review_summary_llm_summary_prompt, llm_summary_driver_review_summary_json = TO_JSON(sidecar_driver.llm_summary_driver_review_summary_json), llm_summary_driver_review_summary = sidecar_driver.llm_summary_driver_review_summary, driver_quantitative_analysis_prompt = sidecar_driver.driver_quantitative_analysis_prompt, llm_driver_quantitative_analysis_json = TO_JSON(sidecar_driver.llm_driver_quantitative_analysis_json), llm_driver_quantitative_analysis = sidecar_driver.llm_driver_quantitative_analysis FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.sidecar_driver` AS sidecar_driver WHERE driver.driver_id = sidecar_driver.driver_id; ------------------------------------------------------------------------------------------------------------ -- Customer LLM fields (Add fields) ------------------------------------------------------------------------------------------------------------ ALTER TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer` ADD COLUMN IF NOT EXISTS customer_attribute_llm_summary_prompt STRING, ADD COLUMN IF NOT EXISTS llm_summary_customer_attribute_json JSON, ADD COLUMN IF NOT EXISTS llm_summary_customer_attribute STRING, ADD COLUMN IF NOT EXISTS customer_review_summary_llm_summary_prompt STRING, ADD COLUMN IF NOT EXISTS llm_summary_customer_review_summary_json JSON, ADD COLUMN IF NOT EXISTS llm_summary_customer_review_summary STRING, ADD COLUMN IF NOT EXISTS customer_quantitative_analysis_prompt STRING, ADD COLUMN IF NOT EXISTS llm_customer_quantitative_analysis_json JSON, ADD COLUMN IF NOT EXISTS llm_customer_quantitative_analysis STRING ; -- Customer LLM fields (Load the data) LOAD DATA OVERWRITE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.sidecar_customer` CLUSTER BY customer_id FROM FILES ( format = 'PARQUET', uris = ['gs://${gcs_rideshare_lakehouse_raw_bucket}/rideshare_llm_export/enriched_zone/sidecar_customer/*.parquet'] ); -- Customer LLM fields (Update the table) UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer` AS customer SET customer_attribute_llm_summary_prompt = sidecar_customer.customer_attribute_llm_summary_prompt, llm_summary_customer_attribute_json = TO_JSON(sidecar_customer.llm_summary_customer_attribute_json), llm_summary_customer_attribute = sidecar_customer.llm_summary_customer_attribute, customer_review_summary_llm_summary_prompt = sidecar_customer.customer_review_summary_llm_summary_prompt, llm_summary_customer_review_summary_json = TO_JSON(sidecar_customer.llm_summary_customer_review_summary_json), llm_summary_customer_review_summary = sidecar_customer.llm_summary_customer_review_summary, customer_quantitative_analysis_prompt = sidecar_customer.customer_quantitative_analysis_prompt, llm_customer_quantitative_analysis_json = TO_JSON(sidecar_customer.llm_customer_quantitative_analysis_json), llm_customer_quantitative_analysis = sidecar_customer.llm_customer_quantitative_analysis FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.sidecar_customer` AS sidecar_customer WHERE customer.customer_id = sidecar_customer.customer_id; ------------------------------------------------------------------------------------------------------------ -- Customer Review LLM fields (Add fields) ------------------------------------------------------------------------------------------------------------ ALTER TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` ADD COLUMN IF NOT EXISTS llm_sentiment_prompt STRING, ADD COLUMN IF NOT EXISTS raw_sentiment_json JSON, ADD COLUMN IF NOT EXISTS review_sentiment STRING, ADD COLUMN IF NOT EXISTS extracted_driver_theme_json JSON, ADD COLUMN IF NOT EXISTS extracted_driver_theme STRING, ADD COLUMN IF NOT EXISTS extracted_customer_theme_json JSON, ADD COLUMN IF NOT EXISTS extracted_customer_theme STRING ; -- Customer Review LLM fields (Load the data) LOAD DATA OVERWRITE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.sidecar_customer_review` CLUSTER BY customer_id, trip_id, driver_id FROM FILES ( format = 'PARQUET', uris = ['gs://${gcs_rideshare_lakehouse_raw_bucket}/rideshare_llm_export/enriched_zone/sidecar_customer_review/*.parquet'] ); -- Customer Review LLM fields (Update the table) UPDATE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_review` AS customer_review SET llm_sentiment_prompt = sidecar_customer_review.llm_sentiment_prompt, raw_sentiment_json = TO_JSON(sidecar_customer_review.raw_sentiment_json), review_sentiment = sidecar_customer_review.review_sentiment, extracted_driver_theme_json = TO_JSON(sidecar_customer_review.extracted_driver_theme_json), extracted_driver_theme = sidecar_customer_review.extracted_driver_theme, extracted_customer_theme_json = TO_JSON(sidecar_customer_review.extracted_customer_theme_json), extracted_customer_theme = sidecar_customer_review.extracted_customer_theme FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.sidecar_customer_review` AS sidecar_customer_review WHERE customer_review.customer_id = sidecar_customer_review.customer_id AND customer_review.trip_id = sidecar_customer_review.trip_id AND customer_review.driver_id = sidecar_customer_review.driver_id;