sql-scripts/taxi_dataset/sp_demo_cleanroom_queries.sql (145 lines of code) (raw):
/*##################################################################################
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
Use Cases:
- Clean Rooms allow organizations to share data and obtain insights
- Show how a rideshare company shares trip data for insights (such as #trips during time window)
Description:
- Shows searching a BigQuery Data Clean Room on two different views of rideshare trip data.
One with a privacy policy and one without to show BQCR's can protect sensitive data.
Show:
- Subscripe to a data clean room and run queries with and without a privacy policy to show
insights can be gained without leaking sensitive data with a pricacy policy.
References:
- https://cloud.google.com/bigquery/docs/data-clean-rooms
Clean up / Reset script:
DROP FUNCTION IF EXISTS `${project_id}.${bigquery_taxi_dataset}.sp_demo_cleanroom_queries`;
*/
------------------------------------------------------------------------------------
-- Query 1
-- We have two views, one with a privacy policy (trip) and one without (trip_no_pp)
-- The trip view has an aggregation threshold privacy policy which enforces the minimum number
-- of distinct entitie (2 in this case) on a particiular column (customer_id in this case).
-- This query will run on the unprotected table and allow you to view sensitive data
--
-- NOTE: You first need to add the Cleanroom shared data
-- 1 - Click on Analytic Hub on the left side of the console after (after opening BigQuery)
-- 2 - Click on Search Listings
-- 3 - Click "Cleanroom" under Filters
-- 4 - Click "NYC Rideshare Data" in the search results
-- 5 - Click "ADD DATASET TO PROJECT"
-- 6 - Click "SAVE"
-- 7 - Head back over to BigQuery SQL Workspace
-- 8 - Run the query below on the unprotected view and you should see 9 results of
-- famous people living in NYC (for identifying celebrities they all have income
-- of 500000)
---------------------------------------------------------------------------------------------
select
distinct(customer_id),
customer_name,
age,
income,
gender
from
`${project_id}.${bigquery_cleanroom_dataset}.trip_no_pp`
where
income = 500000;
------------------------------------------------------------------------------------
-- Query 2
-- We have two views, one with a privacy policy (trip) and one without (trip_no_pp)
-- The trip view has an aggregation threshold privacy policy which enforces the minimum number
-- of distinct entitie (2 in this case) on a particiular column (customer_id in this case).
-- This query will run on the protected table and WILL NOT allow you to view sensitive data
--
-- NOTE: You first need to add the Cleanroom shared data
-- 1 - Head back over to BigQuery SQL Workspace
-- 2 - Run the query below on the unprotected view and you get an error:
-- "You must use SELECT WITH AGGREGATION_THRESHOLD for this query because a
-- privacy policy has been set by a data owner.""
---------------------------------------------------------------------------------------------
select
distinct(customer_id),
customer_name,
age,
income,
gender
from
`${project_id}.${bigquery_cleanroom_dataset}.trip`
where
income = 500000;
------------------------------------------------------------------------------------
-- Query 3a
-- We have two views, one with a privacy policy (trip) and one without (trip_no_pp)
-- We will create a stored procedure to create time bucket to nearest N seconds
--
-- NOTE: You first need to add the Cleanroom shared data
-- 1 - Head back over to BigQuery SQL Workspace
-- 2 - Run the statement below to create the stored procedure
---------------------------------------------------------------------------------------------
CREATE
OR REPLACE FUNCTION `${project_id}.${bigquery_taxi_dataset}.tumble_interval` (val DATETIME, tumble_seconds INT64) AS (
DATETIME(timestamp_seconds(div(UNIX_SECONDS(TIMESTAMP(val)),
tumble_seconds) * tumble_seconds),
"America/New_York"
)
);
------------------------------------------------------------------------------------
-- Query 3b
-- We have two views, one with a privacy policy (trip) and one without (trip_no_pp)
-- A consulting company is working with a restaurateur who is investigating two NYC locations and
-- wants to measure traffic in the vicinity during luncheon hours. The company will use Rideshare
-- Trip Data available in BigQuery Data Cleanrooms for the number of customers dropped off in 3
-- minute windows during prime lunchtime hours.
--
-- NOTE: You first need to add the Cleanroom shared data (and create the funciton above)
-- 1 - Head back over to BigQuery SQL Workspace
-- 2 - Run the query below on the protected view and you get about 116 results
-- You will notice each result has a minumum of 2 distinct customer id's as
-- enforeced by the privacy policy on the view.
---------------------------------------------------------------------------------------------
select
with aggregation_threshold
`${project_id}.${bigquery_taxi_dataset}.tumble_interval`(DATETIME(dropoff_time), 180) AS three_min,
dropoff_location_id,
count(distinct(customer_id)) as num_customers,
CAST(ROUND(avg(income)) AS STRING FORMAT '$999,999') as avg_income,
ROUND(avg(age)) as avg_age
from
`${project_id}.${bigquery_cleanroom_dataset}.trip`
where
DATETIME(timestamp(dropoff_time), "America/New_York") >= '2022-09-05T00:00:00'
AND DATETIME(timestamp(dropoff_time), "America/New_York") <= '2022-09-06T00:00:00' ---132: JFK ; 138: LaGuardia
and dropoff_location_id in (132, 138)
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) >= 12
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) <= 14
group by
three_min,
dropoff_location_id
order by
num_customers asc,
three_min asc;
------------------------------------------------------------------------------------
-- Query 4
-- We have two views, one with a privacy policy (trip) and one without (trip_no_pp)
-- A consulting company is working with a restaurateur who is investigating two NYC locations and
-- wants to measure traffic in the vicinity during luncheon hours. The company will use Rideshare
-- Trip Data available in BigQuery Data Cleanrooms for the number of customers dropped off in 3
-- minute windows during prime lunchtime hours.
--
-- NOTE: You first need to add the Cleanroom shared data
-- 1 - Head back over to BigQuery SQL Workspace
-- 2 - Run the query below on the unprotected view and you get about 120 results
-- You will notice in this case there are results that have 1 distinct customer id's
-- which means it's possible to reveal sensitive data.
---------------------------------------------------------------------------------------------
select
`${project_id}.${bigquery_taxi_dataset}.tumble_interval` (DATETIME(dropoff_time), 180) AS three_min,
dropoff_location_id,
count(distinct(customer_id)) as num_customers,
CAST(ROUND(avg(income)) AS STRING FORMAT '$999,999') as avg_income,
ROUND(avg(age)) as avg_age
from
`${project_id}.${bigquery_cleanroom_dataset}.trip_no_pp`
where
DATETIME(timestamp(dropoff_time), "America/New_York") >= '2022-09-05T00:00:00'
AND DATETIME(timestamp(dropoff_time), "America/New_York") <= '2022-09-06T00:00:00' ---132: JFK ; 138: LaGuardia
and dropoff_location_id in (132, 138)
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) >= 12
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) <= 14
group by
three_min,
dropoff_location_id
order by
num_customers asc,
three_min asc;
------------------------------------------------------------------------------------
-- Query 5
-- We have two views, one with a privacy policy (trip) and one without (trip_no_pp)
-- A consulting company is working with a restaurateur who is investigating two NYC locations and
-- wants to measure traffic in the vicinity during luncheon hours. The company will use Rideshare
-- Trip Data available in BigQuery Data Cleanrooms for the number of customers dropped off in 3
-- minute windows during prime lunchtime hours.
--
-- NOTE: You first need to add the Cleanroom shared data
-- 1 - Head back over to BigQuery SQL Workspace
-- 2 - Run the query below on the unprotected view and add customer_name to the result
-- You will notice in this case there are results that have 1 distinct customer id's
-- and you can see specific customer names which is a leak of sensitive data.
---------------------------------------------------------------------------------------------
select
`${project_id}.${bigquery_taxi_dataset}.tumble_interval` (DATETIME(dropoff_time), 180) AS three_min,
dropoff_location_id,
count(distinct(customer_id)) as num_customers,
CAST(ROUND(avg(income)) AS STRING FORMAT '$999,999') as avg_income,
ROUND(avg(age)) as avg_age,
customer_name
from
`${project_id}.${bigquery_cleanroom_dataset}.trip_no_pp`
where
DATETIME(timestamp(dropoff_time), "America/New_York") >= '2022-09-05T00:00:00'
AND DATETIME(timestamp(dropoff_time), "America/New_York") <= '2022-09-06T00:00:00' ---132: JFK ; 138: LaGuardia
and dropoff_location_id in (132, 138)
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) >= 12
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) <= 14
group by
three_min,
dropoff_location_id,
customer_name
order by
num_customers asc,
three_min asc;
------------------------------------------------------------------------------------
-- Query 6
-- We have two views, one with a privacy policy (trip) and one without (trip_no_pp)
-- A consulting company is working with a restaurateur who is investigating two NYC locations and
-- wants to measure traffic in the vicinity during luncheon hours. The company will use Rideshare
-- Trip Data available in BigQuery Data Cleanrooms for the number of customers dropped off in 3
-- minute windows during prime lunchtime hours.
--
-- NOTE: You first need to add the Cleanroom shared data
-- 1 - Head back over to BigQuery SQL Workspace
-- 2 - Run the query below on the protected view and add customer_name to the result
-- You will get 0 results because all results are filtered out because the aggreage
-- threshold is not met.
---------------------------------------------------------------------------------------------
select
with aggregation_threshold
`${project_id}.${bigquery_taxi_dataset}.tumble_interval` (DATETIME(dropoff_time), 180) AS three_min,
dropoff_location_id,
count(distinct(customer_id)) as num_customers,
CAST(ROUND(avg(income)) AS STRING FORMAT '$999,999') as avg_income,
ROUND(avg(age)) as avg_age,
customer_name
from
`${project_id}.${bigquery_cleanroom_dataset}.trip`
where
DATETIME(timestamp(dropoff_time), "America/New_York") >= '2022-09-05T00:00:00'
AND DATETIME(timestamp(dropoff_time), "America/New_York") <= '2022-09-06T00:00:00' ---132: JFK ; 138: LaGuardia
and dropoff_location_id in (132, 138)
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) >= 12
AND EXTRACT(
HOUR
FROM
DATETIME(timestamp(dropoff_time), "America/New_York")
) <= 14
group by
three_min,
dropoff_location_id,
customer_name
order by
num_customers asc,
three_min asc;