sql-scripts/rideshare_lakehouse_curated/sp_model_training.sql (184 lines of code) (raw):

/*################################################################################## # Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################################*/ /* Use Cases: - Trains the model to predict the high value rides Description: - Trains the model with all the data - Uses the unstructed data for the model (processed images) - Uses shared weather data for the model (Analytics Hub) Show: - Training Data - Analytics Hub - Scoring data w/explainable AI - Vertex AI (Model Registry) References: - Notes: - is holiday timeframe (need a list of holidays) - images of people with packages (need images based upon day of year and pickup location) - surge pricing - is rush hour -- short or long trip - show the probablity of being high values - Predict the dropoff location - Predict the total amount - Predict the tip amount Clean up / Reset script: DROP TABLE IF EXISTS `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_model_training_data` DROP MODEL IF EXISTS `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.model_predict_high_value` DROP VIEW IF EXISTS `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.analytics_hub_weather_data` */ IF LOWER("${bigquery_region}") = "us" THEN -- NOTE: In 2024 you need to change this to ghcnd_2023 CREATE OR REPLACE VIEW `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.analytics_hub_weather_data` AS SELECT * FROM `${project_id}.ghcn_daily.ghcnd_2022`; ELSE -- NOTE: Analytics hub does not have this data in other regions (this keeps things from breaking) CREATE OR REPLACE VIEW `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.analytics_hub_weather_data` AS SELECT CAST (NULL AS STRING) AS id, CAST (NULL AS DATE) AS date, CAST (NULL AS STRING) AS element, CAST (NULL AS FLOAT64) AS value, CAST (NULL AS STRING) AS mflag, CAST (NULL AS STRING) AS qflag, CAST (NULL AS STRING) AS sflag, CAST (NULL AS STRING) AS time, CAST (NULL AS STRING) AS source_url, CAST (NULL AS TIMESTAMP) AS etl_timestamp; END IF; -- Train for the same period "last year" CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_model_training_data` AS WITH RideshareData AS ( SELECT rideshare_trip_id, CAST(pickup_location_id AS STRING) AS pickup_location_id, pickup_location_id AS location_id, CAST(pickup_datetime AS DATE) AS pickup_date, CAST(EXTRACT(YEAR FROM pickup_datetime) AS STRING) AS pickup_year, CAST(EXTRACT(MONTH FROM pickup_datetime) AS STRING) AS pickup_month, CAST(EXTRACT(DAY FROM pickup_datetime) AS STRING) AS pickup_day, CAST(EXTRACT(DAYOFWEEK FROM pickup_datetime) AS STRING) AS pickup_day_of_week, CAST(EXTRACT(HOUR FROM pickup_datetime) AS STRING) AS pickup_hour, ride_distance, is_airport, fare_amount, tip_amount, TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS ride_duration, bigquery_rideshare_zone.borough FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_trip` AS bigquery_rideshare_trip INNER JOIN `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_zone` AS bigquery_rideshare_zone ON bigquery_rideshare_trip.pickup_location_id = bigquery_rideshare_zone.location_id AND pickup_datetime BETWEEN CAST(DATETIME_SUB(CURRENT_DATETIME('America/New_York'),INTERVAL 24 MONTH) AS TIMESTAMP) AND CAST(DATETIME_SUB(CURRENT_DATETIME('America/New_York'),INTERVAL 23 MONTH) AS TIMESTAMP) AND ride_distance < 100 ) , WeatherRainData AS ( SELECT CASE WHEN id = 'USC00305679' THEN 'EWR' WHEN id = 'USW00094728' THEN 'Manhattan' WHEN id = 'US1NYQN0029' THEN 'Queens' WHEN id = 'USC00300961' THEN 'Bronx' WHEN id = 'USC00300958' THEN 'Brooklyn' WHEN id = 'US1NYRC0016' THEN 'Staten Island' END AS borough, date, element, MAX(value) AS value FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.analytics_hub_weather_data` WHERE date BETWEEN CAST(DATETIME_SUB(CURRENT_DATETIME('America/New_York'),INTERVAL 24 MONTH) AS DATE) AND CAST(DATETIME_SUB(CURRENT_DATETIME('America/New_York'),INTERVAL 23 MONTH) AS DATE) AND element = 'PRCP' AND id IN ('USC00305679','USW00094728','US1NYQN0029','USC00300961','USC00300958','US1NYRC0016') GROUP BY 1, 2, 3 ) , WeatherSnowData AS ( SELECT CASE WHEN id = 'USC00305679' THEN 'EWR' WHEN id = 'USW00094728' THEN 'Manhattan' WHEN id = 'US1NYQN0029' THEN 'Queens' WHEN id = 'USC00300961' THEN 'Bronx' WHEN id = 'USC00300958' THEN 'Brooklyn' WHEN id = 'US1NYRC0016' THEN 'Staten Island' END AS borough, date, element, MAX(value) AS value FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.analytics_hub_weather_data` WHERE date BETWEEN CAST(DATETIME_SUB(CURRENT_DATETIME('America/New_York'),INTERVAL 24 MONTH) AS DATE) AND CAST(DATETIME_SUB(CURRENT_DATETIME('America/New_York'),INTERVAL 23 MONTH) AS DATE) AND element = 'SNOW' AND id IN ('USC00305679','USW00094728','US1NYQN0029','USC00300961','USC00300958','US1NYRC0016') GROUP BY 1, 2, 3 ) , PeopleTraveling AS ( -- NOTE: The dates are not used since we require images for every day/hour of the year, which is a lot of images SELECT location_id, COUNT(1) AS cnt, AVG(score) as avg_score FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_images_ml_detection` WHERE name IN ('Luggage and bags','Rolling','Baggage','Suitcase') GROUP BY location_id ) , People AS ( -- NOTE: The dates are not used since we require images for every day/hour of the year, which is a lot of images SELECT location_id, COUNT(1) AS cnt, AVG(score) as avg_score FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_images_ml_detection` WHERE name IN ('Pedestrian','Person') GROUP BY location_id ) SELECT rideshare_trip_id, RideshareData.pickup_location_id AS location_id, RideshareData.pickup_year, RideshareData.pickup_month, RideshareData.pickup_day, RideshareData.pickup_day_of_week, RideshareData.pickup_hour, CASE WHEN RideshareData.ride_distance < 2 THEN "short" WHEN RideshareData.ride_distance < 4 THEN "medium" ELSE "long" END AS ride_distance, CASE WHEN IFNULL(WeatherRainData.value,0) > 0 THEN TRUE ELSE FALSE END AS is_raining, CASE WHEN IFNULL(WeatherSnowData.value,0) > 0 THEN TRUE ELSE FALSE END AS is_snowing, CAST(IFNULL(PeopleTraveling.cnt,0) AS STRING) AS people_traveling_cnt, CAST(IFNULL(People.cnt,0) AS STRING) AS people_cnt, CASE WHEN ride_distance > 5 AND fare_amount> 25 AND tip_amount > (fare_amount * .30) THEN 1 WHEN ride_distance > 4 AND fare_amount> 20 AND tip_amount > (fare_amount * .25) THEN 1 WHEN ride_distance > 3 AND fare_amount> 15 AND tip_amount > (fare_amount * .20) THEN 1 WHEN ride_distance > 2 AND fare_amount> 10 AND tip_amount > (fare_amount * .15) THEN 1 ELSE 0 END AS is_high_value_ride, CAST(NULL AS NUMERIC) AS predicted_is_high_value_ride FROM RideshareData LEFT JOIN WeatherRainData ON RideshareData.borough = WeatherRainData.borough AND RideshareData.pickup_date = WeatherRainData.date LEFT JOIN WeatherSnowData ON RideshareData.borough = WeatherSnowData.borough AND RideshareData.pickup_date = WeatherSnowData.date LEFT JOIN PeopleTraveling ON RideshareData.location_id = PeopleTraveling.location_id LEFT JOIN People ON RideshareData.location_id = People.location_id; SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_model_training_data` WHERE is_high_value_ride > 0 LIMIT 100; -- Train a model to predict high value rides CREATE OR REPLACE MODEL `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.model_predict_high_value` OPTIONS( MODEL_TYPE = 'linear_reg', INPUT_LABEL_COLS = ['is_high_value_ride'], MODEL_REGISTRY = "vertex_ai" ) AS SELECT * EXCEPT(rideshare_trip_id, predicted_is_high_value_ride) FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_model_training_data`; -- View a prediction with Explainable AI EXECUTE IMMEDIATE """ SELECT * FROM ML.EXPLAIN_PREDICT (MODEL `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.model_predict_high_value`, (SELECT '138' AS location_id, '2022' AS pickup_year, '3' AS pickup_month, '6' AS pickup_day, '1' AS pickup_day_of_week, '23' AS pickup_hour, 'short' AS ride_distance, FALSE AS is_raining, FALSE AS is_snowing, '0' AS people_traveling_cnt, '0' AS people_cnt )); """; -- Score all the results EXECUTE IMMEDIATE """ UPDATE `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_model_training_data` AS bigquery_model_training_data SET predicted_is_high_value_ride = CAST(ScoredData.predicted_is_high_value_ride AS NUMERIC) FROM (SELECT * FROM ML.PREDICT (MODEL `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.model_predict_high_value`, (SELECT rideshare_trip_id, -- for matching to source data location_id, pickup_year, pickup_month, pickup_day, pickup_day_of_week, pickup_hour, ride_distance, is_raining, is_snowing, people_traveling_cnt, people_cnt FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_model_training_data` ))) AS ScoredData WHERE ScoredData.rideshare_trip_id = bigquery_model_training_data.rideshare_trip_id; """; -- See the best scored data SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_model_training_data` ORDER BY predicted_is_high_value_ride DESC LIMIT 500;