retail/ltv/bqml/scripts/20_procedure_prepare.sql (158 lines of code) (raw):
-- Copyright 2020 Google LLC
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- https://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
CREATE OR REPLACE PROCEDURE PrepareForML(
MAX_STDV_MONETARY INT64,
MAX_STDV_QTY INT64,
WINDOW_LENGTH INT64, -- How many days back for inputs transactions.
WINDOW_STEP INT64, -- How many days between thresholds.
WINDOW_STEP_INITIAL INT64, -- How many days for the first window.
LENGTH_FUTURE INT64, -- How many days to predict for.
TABLE_FOR_PREDICTING STRING,
TABLE_FOR_TRAINING STRING)
BEGIN
DECLARE MIN_DATE DATE; -- Date of the first order in the dataset.
DECLARE MAX_DATE DATE; -- Date of the final order in the dataset.
DECLARE THRESHOLD_DATE DATE; -- Date that separates inputs orders from target orders.
DECLARE WINDOW_START DATE; -- Date at which an input transactions window starts.
DECLARE STEP INT64 DEFAULT 1; -- Index of the window being run.
-- Aggregates per date per customers.
CREATE OR REPLACE TEMP TABLE Aggred AS
SELECT
customer_id,
order_day,
ROUND(day_value_after_returns, 2) AS value,
day_qty_after_returns as qty_articles,
day_num_returns AS num_returns,
CEIL(avg_time_to_return) AS time_to_return
FROM (
SELECT
customer_id,
order_day,
SUM(order_value_after_returns) AS day_value_after_returns,
STDDEV(SUM(order_value_after_returns)) OVER(PARTITION BY customer_id ORDER BY SUM(order_value_after_returns)) AS stdv_value,
SUM(order_qty_after_returns) AS day_qty_after_returns,
STDDEV(SUM(order_qty_after_returns)) OVER(PARTITION BY customer_id ORDER BY SUM(order_qty_after_returns)) AS stdv_qty,
CASE
WHEN MIN(order_min_qty) < 0 THEN count(1)
ELSE 0
END AS day_num_returns,
CASE
WHEN MIN(order_min_qty) < 0 THEN AVG(time_to_return)
ELSE NULL
END AS avg_time_to_return
FROM (
SELECT
customer_id,
order_id,
-- Gives the order date vs return(s) dates.
MIN(transaction_date) AS order_day,
MAX(transaction_date) AS return_final_day,
DATE_DIFF(MAX(transaction_date), MIN(transaction_date), DAY) AS time_to_return,
-- Aggregates all products in the order
-- and all products returned later.
SUM(qty * unit_price) AS order_value_after_returns,
SUM(qty) AS order_qty_after_returns,
-- If negative, order has qty return(s).
MIN(qty) order_min_qty
FROM
Orders
GROUP BY
customer_id,
order_id)
GROUP BY
customer_id,
order_day)
WHERE
-- [Optional] Remove dates with outliers per a customer.
(stdv_value < MAX_STDV_MONETARY
OR stdv_value IS NULL) AND
(stdv_qty < MAX_STDV_QTY
OR stdv_qty IS NULL);
-- Creates the inputs and targets accross multiple threshold dates.
SET (MIN_DATE, MAX_DATE) = (
SELECT AS STRUCT
MIN(order_day) AS min_days,
MAX(order_day) AS max_days
FROM
Aggred
);
SET THRESHOLD_DATE = MIN_DATE;
CREATE OR REPLACE TEMP TABLE Featured
(
-- dataset STRING,
customer_id STRING,
monetary FLOAT64,
frequency INT64,
recency INT64,
T INT64,
time_between FLOAT64,
avg_basket_value FLOAT64,
avg_basket_size FLOAT64,
has_returns STRING,
avg_time_to_return FLOAT64,
num_returns INT64,
-- threshold DATE,
-- step INT64,
target_monetary FLOAT64,
);
LOOP
-- Can choose a longer original window in case
-- there were not many orders in the early days.
IF STEP = 1 THEN
SET THRESHOLD_DATE = DATE_ADD(THRESHOLD_DATE, INTERVAL WINDOW_STEP_INITIAL DAY);
ELSE
SET THRESHOLD_DATE = DATE_ADD(THRESHOLD_DATE, INTERVAL WINDOW_STEP DAY);
END IF;
SET STEP = STEP + 1;
IF THRESHOLD_DATE >= DATE_SUB(MAX_DATE, INTERVAL (WINDOW_STEP) DAY) THEN
LEAVE;
END IF;
-- Takes all transactions before the threshold date unless you decide
-- to use a different window lenght to test model performance.
IF WINDOW_LENGTH != 0 THEN
SET WINDOW_START = DATE_SUB(THRESHOLD_DATE, INTERVAL WINDOW_LENGTH DAY);
ELSE
SET WINDOW_START = MIN_DATE;
END IF;
INSERT Featured
SELECT
-- CASE
-- WHEN THRESHOLD_DATE <= DATE_SUB(MAX_DATE, INTERVAL LENGTH_FUTURE DAY) THEN 'UNASSIGNED'
-- ELSE 'TEST'
-- END AS dataset,
tf.customer_id,
ROUND(tf.monetary_orders, 2) AS monetary,
tf.cnt_orders AS frequency,
tf.recency,
tf.T,
ROUND(tf.recency/cnt_orders, 2) AS time_between,
ROUND(tf.avg_basket_value, 2) AS avg_basket_value,
ROUND(tf.avg_basket_size, 2) AS avg_basket_size,
has_returns,
CEIL(avg_time_to_return) AS avg_time_to_return,
num_returns,
-- THRESHOLD_DATE AS threshold,
-- STEP - 1 AS step,
ROUND(tt.target_monetary, 2) AS target_monetary,
FROM (
-- This SELECT uses only data before THRESHOLD_DATE to make features.
SELECT
customer_id,
SUM(value) AS monetary_orders,
DATE_DIFF(MAX(order_day), MIN(order_day), DAY) AS recency,
DATE_DIFF(THRESHOLD_DATE, MIN(order_day), DAY) AS T,
COUNT(DISTINCT order_day) AS cnt_orders,
AVG(qty_articles) avg_basket_size,
AVG(value) avg_basket_value,
CASE
WHEN SUM(num_returns) > 0 THEN 'y'
ELSE 'n'
END AS has_returns,
AVG(time_to_return) avg_time_to_return,
THRESHOLD_DATE AS threshold,
SUM(num_returns) num_returns,
FROM
Aggred
WHERE
order_day <= THRESHOLD_DATE AND
order_day >= WINDOW_START
GROUP BY
customer_id
) tf
INNER JOIN (
-- This SELECT uses all orders that happened between threshold and
-- threshold + LENGTH_FUTURE to calculte the target monetary.
SELECT
customer_id,
SUM(value) target_monetary
FROM
Aggred
WHERE
order_day <= DATE_ADD(THRESHOLD_DATE, INTERVAL LENGTH_FUTURE DAY)
-- Overall value is similar to predicting only what's after threshold.
-- and the prediction performs better. We can substract later.
-- AND order_day > THRESHOLD_DATE
GROUP BY
customer_id) tt
ON
tf.customer_id = tt.customer_id;
END LOOP;
-- Persists the temporary ml table. Could do it directly from the above query
-- but this tutorial tries to limit String SQL statements as much as possible
-- and CREATE OR REPLACE TABLE without specifying a dataset is not supported.
-- The TrainLTV needs the table to be persisted.
EXECUTE IMMEDIATE """
CREATE OR REPLACE TABLE """|| TABLE_FOR_PREDICTING || """ AS (
SELECT * FROM Aggred )""";
EXECUTE IMMEDIATE """
CREATE OR REPLACE TABLE """|| TABLE_FOR_TRAINING || """ AS (
SELECT * FROM Featured )""";
-- CALL PersistData(TABLE_FOR_PREDICTING, "Aggred");
-- CALL PersistData(TABLE_FOR_TRAINING, "Featured");
END