sql-scripts/rideshare_llm_enriched/sp_step_01_quantitative_analysis.sql (201 lines of code) (raw):

/*################################################################################## # Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################################*/ /* Use Cases: - Description: - Perform the quantitative analysis on our data Reference: - Clean up / Reset script: - n/a */ --------------------------------------------------------------------------------------------------------------------- -- Customer: what days of the week and hours of the day do they use our service --------------------------------------------------------------------------------------------------------------------- CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.customer_quantitative_analysis` CLUSTER BY customer_id AS WITH data AS ( SELECT customer_id, COUNT(*) AS total_number_of_trips, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 1) AS dayofweek_1, -- Sunday COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 2) AS dayofweek_2, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 3) AS dayofweek_3, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 4) AS dayofweek_4, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 5) AS dayofweek_5, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 6) AS dayofweek_6, -- We could check for Friday nights COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 7) AS dayofweek_7, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 0) AS hourofday_0, -- Midnight COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 1) AS hourofday_1, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 2) AS hourofday_2, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 3) AS hourofday_3, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 4) AS hourofday_4, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 5) AS hourofday_5, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 6) AS hourofday_6, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 7) AS hourofday_7, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 8) AS hourofday_8, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 9) AS hourofday_9, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 10) AS hourofday_10, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 11) AS hourofday_11, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 12) AS hourofday_12, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 13) AS hourofday_13, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 14) AS hourofday_14, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 15) AS hourofday_15, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 16) AS hourofday_16, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 17) AS hourofday_17, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 18) AS hourofday_18, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 19) AS hourofday_19, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 20) AS hourofday_20, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 21) AS hourofday_21, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 22) AS hourofday_22, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 23) AS hourofday_23, FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.trip` AS trip GROUP BY customer_id ) SELECT customer_id, CASE WHEN (dayofweek_1 + dayofweek_7) > CAST(total_number_of_trips * .8 AS INT64) THEN 'weekend-customer' WHEN (dayofweek_2 + dayofweek_3 + dayofweek_4 + dayofweek_5 + dayofweek_6) > CAST(total_number_of_trips * .8 AS INT64) THEN 'weekday-customer' ELSE 'any-day-customer' END AS day_of_week, CASE WHEN (hourofday_0 + hourofday_1 + hourofday_2 + hourofday_3 + hourofday_4 + hourofday_22 + hourofday_23) > CAST(total_number_of_trips * .8 AS INT64) THEN 'night-hour-customer' WHEN (hourofday_7 + hourofday_8 + hourofday_9 + hourofday_16 + hourofday_17 + hourofday_18 + hourofday_19) > CAST(total_number_of_trips * .8 AS INT64) THEN 'rush-hour-customer' ELSE 'any-hour-customer' END AS hour_of_day FROM data ; --------------------------------------------------------------------------------------------------------------------- -- Driver: What days/hours do they work and what are their common/preferred pickup locations --------------------------------------------------------------------------------------------------------------------- CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.driver_quantitative_analysis` CLUSTER BY driver_id AS WITH data AS ( SELECT driver_id, COUNT(DISTINCT pickup_location_id) AS pickup_location_count, COUNT(DISTINCT dropoff_location_id) AS dropoff_location_count, STRING_AGG(DISTINCT location_pickup.zone,", ") AS distinct_pickup_location_zones, STRING_AGG(DISTINCT location_dropoff.zone,", ") AS distinct_dropoff_location_zones, COUNTIF(location_pickup.service_zone = 'EWR' OR location_dropoff.service_zone = 'EWR') AS crosses_state_line_count, COUNTIF(location_pickup.service_zone = 'EWR' OR location_dropoff.service_zone = 'EWR' OR location_pickup.service_zone = 'Airports' OR location_dropoff.service_zone = 'Airports') AS airport_count, COUNT(*) AS total_number_of_trips, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 1) AS dayofweek_1, -- Sunday COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 2) AS dayofweek_2, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 3) AS dayofweek_3, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 4) AS dayofweek_4, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 5) AS dayofweek_5, COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 6) AS dayofweek_6, -- We could check for Friday nights COUNTIF(EXTRACT(DAYOFWEEK FROM trip.pickup_time) = 7) AS dayofweek_7, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 0) AS hourofday_0, -- Midnight COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 1) AS hourofday_1, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 2) AS hourofday_2, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 3) AS hourofday_3, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 4) AS hourofday_4, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 5) AS hourofday_5, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 6) AS hourofday_6, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 7) AS hourofday_7, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 8) AS hourofday_8, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 9) AS hourofday_9, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 10) AS hourofday_10, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 11) AS hourofday_11, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 12) AS hourofday_12, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 13) AS hourofday_13, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 14) AS hourofday_14, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 15) AS hourofday_15, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 16) AS hourofday_16, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 17) AS hourofday_17, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 18) AS hourofday_18, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 19) AS hourofday_19, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 20) AS hourofday_20, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 21) AS hourofday_21, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 22) AS hourofday_22, COUNTIF(EXTRACT(HOUR FROM trip.pickup_time) = 23) AS hourofday_23, FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.trip` AS trip INNER JOIN `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.location` AS location_pickup ON trip.pickup_location_id = location_pickup.location_id INNER JOIN `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.location` AS location_dropoff ON trip.dropoff_location_id = location_dropoff.location_id GROUP BY driver_id ) , assign_attribute AS ( SELECT driver_id, distinct_pickup_location_zones, distinct_dropoff_location_zones, pickup_location_count, dropoff_location_count, CASE WHEN pickup_location_count BETWEEN 1 AND 3 THEN 'few-1-3-pickup-locations' WHEN pickup_location_count BETWEEN 4 AND 6 THEN 'average-4-6-pickup-locations' WHEN pickup_location_count BETWEEN 7 AND 9 THEN 'many-7-9-pickup-locations' ELSE 'any-pickup-locations' END AS pickup_location_habit, CASE WHEN pickup_location_count BETWEEN 1 AND 9 THEN distinct_pickup_location_zones ELSE 'too-many-to-list' END AS pickup_location_zone_habit, CASE WHEN dropoff_location_count BETWEEN 1 AND 3 THEN 'few-1-3-dropoff-locations' WHEN dropoff_location_count BETWEEN 4 AND 6 THEN 'average-4-6-dropoff-locations' WHEN dropoff_location_count BETWEEN 7 AND 9 THEN 'many-7-9-dropoff-locations' ELSE 'any-pickup-locations' END AS dropoff_location_habit, CASE WHEN dropoff_location_count BETWEEN 1 AND 9 THEN distinct_dropoff_location_zones ELSE 'too-many-to-list' END AS dropoff_location_zone_habit, CASE WHEN crosses_state_line_count > CAST(total_number_of_trips * .2 AS INT64) THEN 'crosses-state-line' WHEN crosses_state_line_count < CAST(total_number_of_trips * .05 AS INT64) THEN 'does-not-cross-state-line' ELSE 'any-state-line' END AS cross_state_habit, CASE WHEN airport_count > CAST(total_number_of_trips * .75 AS INT64) THEN 'airport-driver' WHEN airport_count < CAST(total_number_of_trips * .1 AS INT64) THEN 'non-airport-driver' ELSE 'airport-agnostic' END AS airport_habit, CASE WHEN (dayofweek_1 + dayofweek_7) > CAST(total_number_of_trips * .8 AS INT64) THEN 'weekend-driver' WHEN (dayofweek_2 + dayofweek_3 + dayofweek_4 + dayofweek_5 + dayofweek_6) > CAST(total_number_of_trips * .8 AS INT64) THEN 'weekday-driver' ELSE 'any-day-driver' END AS day_of_week, CASE WHEN (hourofday_0 + hourofday_1 + hourofday_2 + hourofday_3 + hourofday_4 + hourofday_22 + hourofday_23) > CAST(total_number_of_trips * .8 AS INT64) THEN 'night-hour-driver' WHEN (hourofday_7 + hourofday_8 + hourofday_9 + hourofday_16 + hourofday_17 + hourofday_18 + hourofday_19) > CAST(total_number_of_trips * .8 AS INT64) THEN 'rush-hour-driver' ELSE 'any-hour-driver' END AS hour_of_day FROM data ) , driver_daily_amount AS ( SELECT driver_id, CAST(pickup_time AS DATE) AS dt, SUM(trip.total_amount) as daily_total_amount FROM `${project_id}.${bigquery_rideshare_llm_enriched_dataset}.trip` AS trip --WHERE driver_id = 620 GROUP BY 1,2 ) , arr_ag AS ( SELECT driver_id, AVG(daily_total_amount) AS average_daily_pay, ARRAY_AGG(daily_total_amount) as array_daily_total_amount FROM driver_daily_amount GROUP BY 1 ) , stddev_tbl AS ( SELECT driver_id, average_daily_pay, STDDEV_SAMP(daily_total_amount) AS stddev_amt FROM arr_ag CROSS JOIN UNNEST(arr_ag.array_daily_total_amount) AS daily_total_amount GROUP BY 1, 2 ) , stddev_results AS ( SELECT stddev_tbl.*, FROM stddev_tbl WHERE stddev_amt < 20 AND stddev_amt is not null ) SELECT assign_attribute.*, stddev_results.average_daily_pay, stddev_results.stddev_amt FROM assign_attribute LEFT JOIN stddev_results ON assign_attribute.driver_id = stddev_results.driver_id;