scripts/optimization/storage_billing_model_savings_ddl.sql (116 lines of code) (raw):
-- WARNING #########################################################################################################################################
-- WARNING If you change the storage billing model of a dataset then you have to wait 14 days before changing it again.
-- WARNING See https://cloud.google.com/bigquery/docs/datasets-intro#dataset_limitations
-- WARNING #########################################################################################################################################
-- Instructions
-- Search for marker 'REMEMBER' to tune the queries at your will
-- Run the query to obtain the results including ALTER SCHEMA DDL
-- Permissions needed
-- * To run this query
-- * Permissions to run queries in the current project
-- * Permissions to create a table and write on it in the configured work dataset
-- * https://cloud.google.com/bigquery/docs/information-schema-table-storage-by-organization#required_permissions
-- * https://cloud.google.com/bigquery/docs/information-schema-datasets-schemata-options#before_you_begin
-- * To run the ALTER SCHEMA DDL
-- * https://cloud.google.com/bigquery/docs/updating-datasets#required_permissions
-- REMEMBER: Put here the prices of the region of interest, current values are for the US
-- See https://cloud.google.com/bigquery/pricing#storage
DECLARE active_logical_gib_price FLOAT64 DEFAULT 0.02;
DECLARE long_term_logical_gib_price FLOAT64 DEFAULT 0.01;
DECLARE active_physical_gib_price FLOAT64 DEFAULT 0.04;
DECLARE long_term_physical_gib_price FLOAT64 DEFAULT 0.02;
-- REMEMBER: (optional) Change this if you plan to change the time travel window (e.g. if you currently use the default 7 days and want to reduce it to 2 days put 2.0/7.0)
-- See https://cloud.google.com/bigquery/docs/time-travel#configuring_the_time_travel_window
DECLARE time_travel_rescale FLOAT64 DEFAULT 1.0;
-- REMEMBER: (optional) Change this if you want that the generated DDL statements include time travel window settings (e.g. 2.0*24.0)
DECLARE time_travel_hours FLOAT64 DEFAULT NULL;
-- REMEMBER: (optional) Change this to filter based on savings absolute value
DECLARE min_monthly_saving FLOAT64 DEFAULT 0.0;
-- REMEMBER: (optional) Change this to filter based on savings % value (e.g. 1% is 0.01)
DECLARE min_monthly_saving_pct FLOAT64 DEFAULT 0.00;
DECLARE query string;
CREATE OR REPLACE TEMPORARY TABLE get_phy_datasets_at_org_level_results (
project_name STRING,
dataset_name STRING,
);
FOR record IN (
SELECT DISTINCT project_id
FROM `region-us`.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION
) DO
SET query = '''
INSERT INTO get_phy_datasets_at_org_level_results
SELECT
catalog_name AS project_name,
schema_name AS dataset_name,
FROM `''' || record.project_id || '''.region-us.INFORMATION_SCHEMA.SCHEMATA_OPTIONS` -- REMEMBER: Change here to the region of interest
WHERE
option_name = "storage_billing_model"
AND option_value = 'PHYSICAL'
''';
BEGIN
EXECUTE IMMEDIATE query;
EXCEPTION WHEN ERROR THEN
END;
END FOR;
CREATE SCHEMA IF NOT EXISTS optimization_workshop;
CREATE OR REPLACE TABLE optimization_workshop.storage_billing_model_savings_ddl AS
WITH
storage_sizes AS (
SELECT
project_id AS project_name,
table_schema AS dataset_name,
-- Logical
SUM(active_logical_bytes) / power(1024, 3) AS active_logical_gib,
SUM(long_term_logical_bytes) / power(1024, 3) AS long_term_logical_gib,
-- Physical
SUM(active_physical_bytes - time_travel_physical_bytes) / power(1024, 3) AS active_no_tt_no_fs_physical_gib,
SUM(time_travel_physical_bytes) / power(1024, 3) * time_travel_rescale AS time_travel_physical_gib,
SUM(fail_safe_physical_bytes) / power(1024, 3) AS fail_safe_physical_gib,
SUM(long_term_physical_bytes) / power(1024, 3) AS long_term_physical_gib,
FROM
-- REMEMBER: Change here to the region of interest
`region-us`.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION
-- See https://cloud.google.com/bigquery/docs/information-schema-table-storage-by-organization#schema
WHERE TRUE
AND total_physical_bytes > 0
-- REMEMBER: (optional) You may want to enable this filter to exclude ML models, materialized views, etc...
-- AND table_type = 'BASE TABLE'
GROUP BY 1,2
),
storage_prices AS (
SELECT
project_name,
dataset_name,
-- Logical
active_logical_gib AS active_logical_gib,
long_term_logical_gib AS long_term_logical_gib,
-- Physical
active_no_tt_no_fs_physical_gib AS active_no_tt_no_fs_physical_gib,
time_travel_physical_gib AS time_travel_physical_gib,
fail_safe_physical_gib AS fail_safe_physical_gib,
long_term_physical_gib AS long_term_physical_gib,
-- Compression ratio
SAFE_DIVIDE(active_logical_gib, active_no_tt_no_fs_physical_gib) AS active_compression_ratio,
SAFE_DIVIDE(long_term_logical_gib, long_term_physical_gib) AS long_term_compression_ratio,
-- Forecast costs logical
active_logical_gib * active_logical_gib_price AS forecast_active_logical_cost,
long_term_logical_gib * long_term_logical_gib_price AS forecast_long_term_logical_cost,
-- Forecast costs physical
active_no_tt_no_fs_physical_gib * active_physical_gib_price AS forecast_active_no_tt_no_fs_physical_cost,
time_travel_physical_gib * active_physical_gib_price AS forecast_travel_physical_cost,
fail_safe_physical_gib * active_physical_gib_price AS forecast_failsafe_physical_cost,
long_term_physical_gib * long_term_physical_gib_price AS forecast_long_term_physical_cost,
FROM
storage_sizes
),
storage_prices_total AS (
SELECT
project_name,
dataset_name,
(forecast_active_logical_cost+forecast_long_term_logical_cost) AS forecast_logical,
(forecast_active_no_tt_no_fs_physical_cost+forecast_travel_physical_cost+forecast_failsafe_physical_cost+forecast_long_term_physical_cost) AS forecast_physical,
FROM storage_prices
),
storage_prices_compare AS (
SELECT
SPT.project_name,
SPT.dataset_name,
SPT.forecast_logical,
SPT.forecast_physical,
(SPT.forecast_logical - SPT.forecast_physical) AS forecast_compare,
IF(SPT.forecast_logical > SPT.forecast_physical,"physical","logical") AS better_on,
IF(PD.dataset_name IS NULL, "logical", "physical") AS currently_on
FROM
storage_prices_total AS SPT
LEFT JOIN get_phy_datasets_at_org_level_results AS PD
USING(project_name, dataset_name)
),
storage_prices_compare2 AS (
SELECT
*,
IF(currently_on = "logical", forecast_logical, forecast_physical) AS monthly_spending,
ABS(forecast_compare) AS monthly_savings,
ABS(forecast_compare)/IF(currently_on = "logical", forecast_logical, forecast_physical) AS monthly_savings_pct,
FROM storage_prices_compare
),
storage_ddl AS (
SELECT
*,
IF(time_travel_hours IS NULL,
CONCAT("ALTER SCHEMA `", project_name, ".", dataset_name, "` SET OPTIONS(storage_billing_model='", better_on, "' );" ),
CONCAT("ALTER SCHEMA `", project_name, ".", dataset_name, "` SET OPTIONS(storage_billing_model='", better_on, "', max_time_travel_hours=", time_travel_hours, ");" )
) AS ddl,
FROM storage_prices_compare2
WHERE better_on != currently_on
AND monthly_savings > min_monthly_saving
AND monthly_savings_pct > min_monthly_saving_pct
ORDER BY monthly_savings DESC
)
-- REMEMBER: (optional) Change here the name of the pseudo-table (CTE) you want to check
SELECT * FROM storage_ddl;