/*##################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
------------------------------------------------------------------------------------
-- Query 1
-- Determine whether we have enough incoming stock to satisfy the next 30 days of deliveries
-- This query uses data shared via Analytics Hub
--
-- NOTE: You first need to add the Analytics Hub shared dataset
-- 1 - Open BigQuery, then click Analytics Hub on the left side of the console
-- 2 - Click "Search Listings"
-- 3 - Click "Private" under Filters
-- 4 - Click "Wholesale Integrator Order Data" in the search results
-- 5 - Click "ADD DATASET TO PROJECT"
-- 6 - Click "SAVE"
-- 7 - Head back over to the BigQuery SQL Workspace
------------------------------------------------------------------------------------
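-- Optional sanity check (a sketch): confirm the Analytics Hub linked dataset is
-- visible before running the query below; this should return a non-zero row count
SELECT COUNT(*) AS wholesale_order_row_count
  FROM `${project_id}.wholesale_integrator_order_data.wholesale_orders`;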
WITH Next30DayDelivery AS
(
-- Determine how many items we are delivering in the next 30 days
-- This creates a running sum of delivery quantities so we can compare against the running sum of incoming inventory
SELECT CAST(delivery_time AS DATE) AS delivery_date,
distribution_center_id,
product_id,
quantity_to_delivery,
SUM(quantity_to_delivery) OVER (PARTITION BY distribution_center_id, product_id
ORDER BY delivery_time, distribution_center_id, product_id) AS running_sum_quantity_to_deliver
FROM `${project_id}.${bigquery_thelook_ecommerce_dataset}.product_deliveries` product_deliveries
WHERE delivery_time BETWEEN CURRENT_TIMESTAMP() AND TIMESTAMP_ADD(CURRENT_TIMESTAMP() , INTERVAL 30 DAY)
AND quantity_to_delivery > 0
)
, IncomingProductStock AS
(
-- Use the data shared by our wholesale provider via the Analytics Hub linked table
-- Create a running sum of the incoming items being delivered to us
SELECT expected_delivery_date,
distribution_center_id,
product_id,
quantity,
SUM(quantity) OVER (PARTITION BY distribution_center_id, product_id
ORDER BY expected_delivery_date, distribution_center_id, product_id) AS running_sum_incoming_quantity
FROM `${project_id}.wholesale_integrator_order_data.wholesale_orders`
)
, DeliveryAndIncoming AS
(
SELECT Next30DayDelivery.delivery_date AS delivery_date,
distribution_centers.name AS distribution_center,
CASE WHEN LENGTH(products.name) > 50
THEN CONCAT(SUBSTR(products.name,1,50),'...') -- for display purposes
ELSE products.name
END AS product_name,
Next30DayDelivery.running_sum_quantity_to_deliver AS quantity_required,
IncomingProductStock.running_sum_incoming_quantity AS quantity_available,
IncomingProductStock.running_sum_incoming_quantity - Next30DayDelivery.running_sum_quantity_to_deliver AS in_stock_qty,
CASE WHEN IncomingProductStock.running_sum_incoming_quantity - Next30DayDelivery.running_sum_quantity_to_deliver <= 0 THEN 'Out of Stock'
WHEN IncomingProductStock.running_sum_incoming_quantity - Next30DayDelivery.running_sum_quantity_to_deliver <= 10 THEN 'Low'
WHEN IncomingProductStock.running_sum_incoming_quantity - Next30DayDelivery.running_sum_quantity_to_deliver <= 20 THEN 'Medium'
ELSE 'Ok'
END AS stock_status
FROM Next30DayDelivery
-- inner join since the shared data always has a delivery per day, distribution center and product
INNER JOIN IncomingProductStock
ON Next30DayDelivery.delivery_date = IncomingProductStock.expected_delivery_date
AND Next30DayDelivery.distribution_center_id = IncomingProductStock.distribution_center_id
AND Next30DayDelivery.product_id = IncomingProductStock.product_id
AND IncomingProductStock.running_sum_incoming_quantity - Next30DayDelivery.running_sum_quantity_to_deliver <= 20
INNER JOIN `${project_id}.${bigquery_thelook_ecommerce_dataset}.distribution_centers` distribution_centers
ON Next30DayDelivery.distribution_center_id = distribution_centers.id
INNER JOIN `${project_id}.${bigquery_thelook_ecommerce_dataset}.products` products
ON Next30DayDelivery.product_id = products.id
)
-- SELECT * FROM Next30DayDelivery ORDER BY 2, 3; -- 2,789,763 rows
-- SELECT * FROM IncomingProductStock ORDER BY 2, 3; -- 26,208,000 rows
SELECT * FROM DeliveryAndIncoming ORDER BY 1, 2, 3 ;
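-- A minimal, self-contained sketch of the running-sum window used in Next30DayDelivery,
-- run against inline (hypothetical) literal values so the mechanics can be verified
-- without the underlying tables
WITH SampleDeliveries AS
(
    SELECT 1 AS distribution_center_id, 10 AS product_id, TIMESTAMP '2022-01-01 08:00:00' AS delivery_time, 5 AS quantity_to_delivery UNION ALL
    SELECT 1, 10, TIMESTAMP '2022-01-02 08:00:00', 3 UNION ALL
    SELECT 1, 10, TIMESTAMP '2022-01-03 08:00:00', 7
)
SELECT delivery_time,
       quantity_to_delivery,
       SUM(quantity_to_delivery) OVER (PARTITION BY distribution_center_id, product_id
                                       ORDER BY delivery_time) AS running_sum_quantity_to_deliver -- yields 5, 8, 15
  FROM SampleDeliveries
 ORDER BY delivery_time;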
------------------------------------------------------------------------------------
-- Query 2
-- Show data that is being streamed in real time via Dataflow from a public Pub/Sub topic
-- The data is immediately available for reading in BigQuery
-- Determine whether our deliveries are slowing down or speeding up
--
-- NOTE: You first need to start the streaming job by triggering the Composer Airflow DAG
--       "sample-dataflow-start-streaming-job" several minutes before running this query
-- 1 - Open Composer in the Google Cloud Console
-- 2 - Click the "Airflow" link, which will open a new tab
-- 3 - Click the "Play" button next to the DAG: sample-dataflow-start-streaming-job
-- 4 - In the drop-down that appears, select "Trigger DAG"
-- 5 - Wait several minutes for the Dataflow job to start and send data
--     You can start this before your meeting; the streaming job is stopped after 4 hours
-- 6 - Head back over to the BigQuery SQL Workspace
------------------------------------------------------------------------------------
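-- Optional sanity check (a sketch): verify that rows have arrived recently before
-- running the query below; if max_delivery_time is NULL or stale, the streaming
-- job is not running
SELECT MAX(delivery_time) AS max_delivery_time,
       COUNT(*)           AS row_count
  FROM `${project_id}.${bigquery_thelook_ecommerce_dataset}.product_deliveries_streaming`;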
WITH MaxStreamingDate AS
(
-- Use the max delivery time (rather than the current time) in case the streaming job is not currently running
SELECT MAX(delivery_time) AS max_delivery_time
FROM `${project_id}.${bigquery_thelook_ecommerce_dataset}.product_deliveries_streaming`
)
-- SELECT max_delivery_time FROM MaxStreamingDate;
, AverageDeliveryTime AS
(
SELECT CASE WHEN delivery_time > TIMESTAMP_SUB(max_delivery_time, INTERVAL 60 MINUTE)
THEN 'CurrentWindow'
ELSE 'PriorWindow'
END AS TimeWindow,
distribution_center_id,
delivery_minutes,
distance
FROM `${project_id}.${bigquery_thelook_ecommerce_dataset}.product_deliveries_streaming`
CROSS JOIN MaxStreamingDate
WHERE delivery_time > TIMESTAMP_SUB(max_delivery_time, INTERVAL 5 DAY)
AND delivery_minutes > 0
AND distance > 0
)
-- SELECT * FROM AverageDeliveryTime ORDER BY distribution_center_id, TimeWindow;
, PivotData AS
(
SELECT *
FROM AverageDeliveryTime
PIVOT (AVG(delivery_minutes) avg_delivery_minutes, AVG(distance) avg_distance, COUNT(*) nbr_of_deliveries FOR TimeWindow IN ('CurrentWindow', 'PriorWindow'))
)
-- SELECT * FROM PivotData;
SELECT distribution_centers.name AS distribution_center,
nbr_of_deliveries_CurrentWindow AS deliveries_current,
nbr_of_deliveries_PriorWindow AS deliveries_prior,
ROUND(avg_distance_CurrentWindow,1) AS distance_current,
ROUND(avg_distance_PriorWindow,1) AS distance_prior,
CASE WHEN avg_distance_CurrentWindow > avg_distance_PriorWindow + (avg_distance_PriorWindow * .15) THEN 'High'
WHEN avg_distance_CurrentWindow > avg_distance_PriorWindow + (avg_distance_PriorWindow * .10) THEN 'Med'
WHEN avg_distance_CurrentWindow > avg_distance_PriorWindow + (avg_distance_PriorWindow * .05) THEN 'Low'
ELSE 'Normal'
END AS distance_trend,
ROUND(avg_delivery_minutes_CurrentWindow,1) AS minutes_current,
ROUND(avg_delivery_minutes_PriorWindow,1) AS minutes_prior,
CASE WHEN avg_delivery_minutes_CurrentWindow > avg_delivery_minutes_PriorWindow + (avg_delivery_minutes_PriorWindow * .15) THEN 'High'
WHEN avg_delivery_minutes_CurrentWindow > avg_delivery_minutes_PriorWindow + (avg_delivery_minutes_PriorWindow * .10) THEN 'Med'
WHEN avg_delivery_minutes_CurrentWindow > avg_delivery_minutes_PriorWindow + (avg_delivery_minutes_PriorWindow * .05) THEN 'Low'
ELSE 'Normal'
END AS minutes_trend
FROM PivotData
INNER JOIN `${project_id}.${bigquery_thelook_ecommerce_dataset}.distribution_centers` AS distribution_centers
ON PivotData.distribution_center_id = distribution_centers.id
ORDER BY distribution_center_id;
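-- A minimal sketch of BigQuery's PIVOT operator on inline (hypothetical) values,
-- showing how the CurrentWindow/PriorWindow columns above are produced; columns
-- not referenced in the PIVOT clause (here distribution_center_id) become the
-- implicit GROUP BY keys
WITH SampleWindows AS
(
    SELECT 'CurrentWindow' AS TimeWindow, 1 AS distribution_center_id, 30 AS delivery_minutes UNION ALL
    SELECT 'PriorWindow', 1, 25 UNION ALL
    SELECT 'CurrentWindow', 2, 40
)
SELECT * -- columns: distribution_center_id, avg_minutes_CurrentWindow, avg_minutes_PriorWindow, nbr_CurrentWindow, nbr_PriorWindow
  FROM SampleWindows
 PIVOT (AVG(delivery_minutes) avg_minutes, COUNT(*) nbr FOR TimeWindow IN ('CurrentWindow', 'PriorWindow'));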
------------------------------------------------------------------------------------
-- Query 3
-- Query data that is stored in AWS S3
--
-- NOTE: To run this, you must have access to BigQuery Omni. If you deployed this
-- demo via click-to-deploy, access was granted automatically. Otherwise, you need
-- to set up and configure BigQuery Omni and AWS manually.
------------------------------------------------------------------------------------
-- Create a table on data in AWS (S3)
CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${aws_omni_biglake_dataset_name}.distribution_centers`
WITH CONNECTION `${shared_demo_project_id}.${aws_omni_biglake_dataset_region}.${aws_omni_biglake_connection}`
OPTIONS (
format = "PARQUET",
uris = ['s3://${aws_omni_biglake_s3_bucket}/distribution-center/distribution_centers.parquet']
);
-- Query the data in AWS
SELECT * FROM `${project_id}.${aws_omni_biglake_dataset_name}.distribution_centers` LIMIT 1000;
-- Export the data from the table back to S3
-- Typically we would export only a subset of the data, so we transfer just the
-- results of a query (a filtered variation follows this statement)
EXPORT DATA WITH CONNECTION `${shared_demo_project_id}.${aws_omni_biglake_dataset_region}.${aws_omni_biglake_connection}`
OPTIONS(
uri="s3://${aws_omni_biglake_s3_bucket}/taxi-export/distribution_centers/*",
format="PARQUET"
)
AS
SELECT * FROM `${project_id}.${aws_omni_biglake_dataset_name}.distribution_centers`;
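-- The filtered variation mentioned above (a sketch with a hypothetical predicate):
-- export only a subset of columns and rows instead of the whole table
EXPORT DATA WITH CONNECTION `${shared_demo_project_id}.${aws_omni_biglake_dataset_region}.${aws_omni_biglake_connection}`
OPTIONS(
 uri="s3://${aws_omni_biglake_s3_bucket}/taxi-export/distribution_centers_subset/*",
 format="PARQUET"
)
AS
SELECT id, name
  FROM `${project_id}.${aws_omni_biglake_dataset_name}.distribution_centers`
 WHERE id <= 5; -- hypothetical filter for demonstration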
-- Load into BigQuery
-- We can now join this data to the rest of the data in BigQuery, as well as use it for machine learning (see the join sketch below)
-- This will appear in your dataset: ${bigquery_thelook_ecommerce_dataset}
LOAD DATA INTO `${project_id}.${bigquery_thelook_ecommerce_dataset}.aws_distribution_centers`
FROM FILES (uris = ['s3://${aws_omni_biglake_s3_bucket}/taxi-export/distribution_centers/*'], format = 'PARQUET')
WITH CONNECTION `${shared_demo_project_id}.${aws_omni_biglake_dataset_region}.${aws_omni_biglake_connection}`;
-- View the data just loaded
SELECT * FROM `${project_id}.${bigquery_thelook_ecommerce_dataset}.aws_distribution_centers`;
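-- The join sketch mentioned above: combine the AWS-loaded data with local BigQuery
-- data, assuming products carries a distribution_center_id column (as in the public
-- theLook schema)
SELECT aws_distribution_centers.name AS distribution_center,
       COUNT(*)                      AS nbr_of_products
  FROM `${project_id}.${bigquery_thelook_ecommerce_dataset}.aws_distribution_centers` AS aws_distribution_centers
       INNER JOIN `${project_id}.${bigquery_thelook_ecommerce_dataset}.products` AS products
               ON products.distribution_center_id = aws_distribution_centers.id
 GROUP BY aws_distribution_centers.name
 ORDER BY nbr_of_products DESC;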
-- Use CTAS (CREATE TABLE AS SELECT) to query the AWS data and load the
-- query results directly into a local BigQuery table
CREATE OR REPLACE TABLE `${project_id}.${bigquery_thelook_ecommerce_dataset}.omni_aws_query_results` AS
SELECT * FROM `${project_id}.${aws_omni_biglake_dataset_name}.distribution_centers`;
SELECT * FROM `${project_id}.${bigquery_thelook_ecommerce_dataset}.omni_aws_query_results`;