sql-scripts/thelook_ecommerce/churn_demo_step_0_create_artifacts.sql (143 lines of code) (raw):
/*##################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
Author: Polong Lin
Use Cases:
- An e-commerce store collects Google Analytics 4 data. With their GA4 data in BigQuery, want to predict which users who have spend 24 hours on the website are most likely
- to churn. We can then provide coupons on incentives to entice the customer to come back to the store.
Description:
- This will create the tables necessary to train the model
Show:
- New tables will be created, based on the raw public data: `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
- training_data: contains the final cleaned data used to train a classifier to predict churn, comprised of the three other tables joined togehter
- returningusers: table to check if each user will return or churn
- user_demographics: data on each user's demographic information
- user_aggregate_behavior: data aggregated based on behavior of each user in their first 24h on the website
References:
- https://cloud.google.com/bigquery-ml/docs/linear-regression-tutorial
Clean up / Reset script:
DROP TABLE IF EXISTS `${project_id}.${bigquery_thelook_ecommerce_dataset}.returningusers`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_thelook_ecommerce_dataset}.user_demographics`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_thelook_ecommerce_dataset}.user_aggregate_behavior`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_thelook_ecommerce_dataset}.training_data`;
*/
-- Exploring the dataset
SELECT *
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
TABLESAMPLE SYSTEM (1 PERCENT);
-- Identify users and churn label
CREATE OR REPLACE TABLE ${bigquery_thelook_ecommerce_dataset}.returningusers AS (
WITH firstlasttouch AS (
SELECT
user_pseudo_id,
MIN(event_timestamp) AS user_first_engagement,
MAX(event_timestamp) AS user_last_engagement
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE event_name='user_engagement'
GROUP BY
user_pseudo_id
), data AS (
SELECT
user_pseudo_id,
user_first_engagement,
user_last_engagement,
EXTRACT(MONTH from TIMESTAMP_MICROS(user_first_engagement)) as month,
EXTRACT(DAYOFYEAR from TIMESTAMP_MICROS(user_first_engagement)) as julianday,
EXTRACT(DAYOFWEEK from TIMESTAMP_MICROS(user_first_engagement)) as dayofweek,
-- add 24 hr to user's first visit
(user_first_engagement + 86400000000) AS ts_24hr_after_first_engagement,
-- churned = 1 if last_touch within 24 hr of first visit, else 0
IF (user_last_engagement < (user_first_engagement + 86400000000),
1,
0 ) AS churned,
-- bounced = 1 if last_touch within 10 min, else 0
IF (user_last_engagement <= (user_first_engagement + 600000000),
1,
0 ) AS bounced,
FROM
firstlasttouch
GROUP BY
1,2,3
)
SELECT
user_pseudo_id,
TIMESTAMP_MICROS(user_first_engagement) user_first_engagement,
TIMESTAMP_MICROS(user_last_engagement) user_last_engagement,
month,
julianday,
dayofweek,
TIMESTAMP_MICROS(ts_24hr_after_first_engagement) ts_24hr_after_first_engagement,
churned,
bounced
FROM
data);
-- Finding bounced and returning users
SELECT
bounced,
churned,
COUNT(churned) as count_users
FROM
${bigquery_thelook_ecommerce_dataset}.returningusers
GROUP BY 1,2
ORDER BY bounced;
-- Finding Churn rate
SELECT
COUNTIF(churned=1)/COUNT(churned) as churn_rate
FROM
${bigquery_thelook_ecommerce_dataset}.returningusers
WHERE bounced = 0;
-- Extracting demographic data for Users
CREATE OR REPLACE TABLE ${bigquery_thelook_ecommerce_dataset}.user_demographics AS (
WITH first_values AS (
SELECT
user_pseudo_id,
geo.country as country,
device.operating_system as operating_system,
device.language as language,
ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp DESC) AS row_num
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE event_name="user_engagement"
)
SELECT * EXCEPT (row_num)
FROM first_values
WHERE row_num = 1
);
-- Extracting Behavioral Data for Users
SELECT
event_name,
COUNT(event_name) as event_count
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
GROUP BY 1
ORDER BY
event_count DESC;
-- Creating User Aggregate Behavior View
CREATE OR REPLACE TABLE ${bigquery_thelook_ecommerce_dataset}.user_aggregate_behavior AS (
WITH
events_first24hr AS (
-- select user data only from first 24 hr of using the website
SELECT
e.*
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` e
JOIN
${bigquery_thelook_ecommerce_dataset}.returningusers r
ON
e.user_pseudo_id = r.user_pseudo_id
WHERE
TIMESTAMP_MICROS(e.event_timestamp) <= r.ts_24hr_after_first_engagement
)
SELECT
user_pseudo_id,
SUM(IF(event_name = 'user_engagement', 1, 0)) AS cnt_user_engagement,
SUM(IF(event_name = 'page_view', 1, 0)) AS cnt_page_view,
SUM(IF(event_name = 'view_item', 1, 0)) AS cnt_view_item,
SUM(IF(event_name = 'view_promotion', 1, 0)) AS cnt_view_promotion,
SUM(IF(event_name = 'select_promotion', 1, 0)) AS cnt_select_promotion,
SUM(IF(event_name = 'add_to_cart', 1, 0)) AS cnt_add_to_cart,
SUM(IF(event_name = 'begin_checkout', 1, 0)) AS cnt_begin_checkout,
SUM(IF(event_name = 'add_shipping_info', 1, 0)) AS cnt_add_shipping_info,
SUM(IF(event_name = 'add_payment_info', 1, 0)) AS cnt_add_payment_info,
SUM(IF(event_name = 'purchase', 1, 0)) AS cnt_purchase,
FROM
events_first24hr
GROUP BY
1
);
-- Creating training_data combining returningusers, user_demographics and user_aggregate_behavior
CREATE OR REPLACE TABLE ${bigquery_thelook_ecommerce_dataset}.training_data AS (
SELECT
dem.*,
IFNULL(beh.cnt_user_engagement, 0) AS cnt_user_engagement,
IFNULL(beh.cnt_page_view, 0) AS cnt_page_view,
IFNULL(beh.cnt_view_item, 0) AS cnt_view_item,
IFNULL(beh.cnt_view_promotion, 0) AS cnt_view_promotion,
IFNULL(beh.cnt_select_promotion, 0) AS cnt_select_promotion,
IFNULL(beh.cnt_add_to_cart, 0) AS cnt_add_to_cart,
IFNULL(beh.cnt_begin_checkout, 0) AS cnt_begin_checkout,
IFNULL(beh.cnt_add_shipping_info, 0) AS cnt_add_shipping_info,
IFNULL(beh.cnt_add_payment_info, 0) AS cnt_add_payment_info,
IFNULL(beh.cnt_purchase, 0) AS cnt_purchase,
ret.user_first_engagement,
ret.month,
ret.julianday,
ret.dayofweek,
ret.churned
FROM
${bigquery_thelook_ecommerce_dataset}.returningusers ret
LEFT OUTER JOIN
${bigquery_thelook_ecommerce_dataset}.user_demographics dem
ON
ret.user_pseudo_id = dem.user_pseudo_id
LEFT OUTER JOIN
${bigquery_thelook_ecommerce_dataset}.user_aggregate_behavior beh
ON
ret.user_pseudo_id = beh.user_pseudo_id
WHERE ret.bounced = 0
);