/*##################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
-- NOTE: This is designed for a single BigQuery region
-- This gets the past 3 months of data (starting from the first day of the month 3 months ago)
-- WARNING!!!
-- This will return every SQL statement run in BigQuery which could be A LOT!
-- Also, if you have PII data in your SQL (Passport, Date of Birth), it can be shown in some of the SQL statements
-- WARNING!!!
-- SEARCH and REPLACE the below values (if downloading this single file from GitHub)
-- Replace Region -> Search for: region-${bigquery_region}
-- Replace GCS bucket Path -> Search for: gs://${raw_bucket_name}
Use Cases:
- Shows the queries that are most used in your organization
- Shows the estimated on-demand cost of the queries (retail pricing)
Description:
- Loops through each project and gets your query data from the INFORMATION_SCHEMA tables
Reference:
- https://cloud.google.com/bigquery/docs/information-schema-jobs
Clean up / Reset script:
DROP SCHEMA IF EXISTS `${project_id}.ondemand_query_usage` CASCADE;
DROP SCHEMA IF EXISTS `${project_id}.ondemand_query_analysis` CASCADE;
*/
-- This is designed to help you understand where you should focus your efforts or costs
-- It is NOT an exact cost-savings calculator
-- NOTE: the schema location below is hardcoded to "us"; adjust it if your ${bigquery_region} is elsewhere
CREATE SCHEMA `${project_id}.ondemand_query_usage`
OPTIONS (
location = "us"
);
-- Track which projects we were able to get data from
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_usage.usage_export_project`
(
project_id STRING,
result STRING
) ;
-- Create a table to hold the results
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_usage.usage_export_data`
(
creation_time TIMESTAMP,
project_id STRING,
project_number INT64,
user_email STRING,
job_id STRING,
job_type STRING,
statement_type STRING,
priority STRING,
start_time TIMESTAMP,
end_time TIMESTAMP,
query STRING,
state STRING,
reservation_id STRING,
total_bytes_processed INT64,
total_slot_ms INT64,
error_result_reason STRING,
error_result_location STRING,
error_result_debug_info STRING,
error_result_message STRING,
cache_hit BOOLEAN,
destination_table_project_id STRING,
destination_table_dataset_id STRING,
destination_table_table_id STRING,
total_bytes_billed INT64,
transaction_id STRING,
parent_job_id STRING,
session_info_session_id STRING,
total_modified_partitions INT64,
bi_engine_statistics_bi_engine_mode STRING,
resource_warning STRING,
normalized_literals STRING,
transferred_bytes INT64,
est_on_demand_cost FLOAT64,
job_avg_slots FLOAT64,
jobstage_max_slots FLOAT64,
estimated_runnable_units INT64
);
-- Preview the distinct projects that have run jobs in this region
SELECT DISTINCT project_id FROM `region-${bigquery_region}`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION;
-- Loop through each project id
-- This is done since only the per-project JOBS view has the "query" field
FOR record IN (SELECT DISTINCT project_id
                 FROM `region-${bigquery_region}`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
               -- You can include the below WHERE clause to limit the loop to just a few projects
               -- WHERE project_id IN ('data-analytics-demo-xxxxxxxxxx','data-analytics-demo-yyyyyyyyy')
              )
DO
BEGIN
EXECUTE IMMEDIATE FORMAT("""
INSERT INTO `${project_id}.ondemand_query_usage.usage_export_data`
(
creation_time,
project_id,
project_number,
user_email,
job_id,
job_type,
statement_type,
priority,
start_time,
end_time,
query,
state,
reservation_id,
total_bytes_processed,
total_slot_ms,
error_result_reason,
error_result_location,
error_result_debug_info,
error_result_message,
cache_hit,
destination_table_project_id,
destination_table_dataset_id,
destination_table_table_id,
total_bytes_billed,
transaction_id,
parent_job_id,
session_info_session_id,
total_modified_partitions,
bi_engine_statistics_bi_engine_mode,
resource_warning,
normalized_literals,
transferred_bytes,
est_on_demand_cost,
job_avg_slots,
jobstage_max_slots,
estimated_runnable_units
)
SELECT job.creation_time,
job.project_id,
job.project_number,
job.user_email,
job.job_id,
job.job_type,
job.statement_type,
job.priority,
job.start_time,
job.end_time,
job.query,
job.state,
job.reservation_id,
job.total_bytes_processed,
job.total_slot_ms,
job.error_result.reason AS error_result_reason,
job.error_result.location AS error_result_location,
job.error_result.debug_info AS error_result_debug_info,
job.error_result.message AS error_result_message,
job.cache_hit,
job.destination_table.project_id AS destination_table_project_id,
job.destination_table.dataset_id AS destination_table_dataset_id,
job.destination_table.table_id AS destination_table_table_id,
job.total_bytes_billed,
job.transaction_id,
job.parent_job_id,
job.session_info.session_id AS session_info_session_id,
job.total_modified_partitions,
job.bi_engine_statistics.bi_engine_mode AS bi_engine_statistics_bi_engine_mode,
job.query_info.resource_warning,
job.query_info.query_hashes.normalized_literals,
job.transferred_bytes,
-- This is retail pricing (for estimating purposes)
-- 6.25 / 1,099,511,627,776 = 0.00000000000568434188608080 ($6.25 per TB so cost per byte is 0.00000000000568434188608080)
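-- e.g. a query billing 5,000,000,000,000 bytes (5 TB) would be estimated at 5000000000000 * 0.00000000000568434188608080 = roughly 28.42 USD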
CASE WHEN job.reservation_id IS NULL
THEN CAST(CAST(job.total_bytes_billed AS BIGDECIMAL) * CAST(0.00000000000568434188608080 AS BIGDECIMAL) AS FLOAT64)
ELSE CAST (0 AS FLOAT64)
END as est_on_demand_cost,
-- Average slot utilization per job is calculated by dividing
-- total_slot_ms by the millisecond duration of the job
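-- e.g. 3,600,000 total_slot_ms over a 60,000 ms job = 60 average slots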
CAST(SAFE_DIVIDE(job.total_slot_ms,(TIMESTAMP_DIFF(job.end_time, job.start_time, MILLISECOND))) AS FLOAT64) AS job_avg_slots,
MAX(SAFE_DIVIDE(unnest_job_stages.slot_ms,unnest_job_stages.end_ms - unnest_job_stages.start_ms)) AS jobstage_max_slots,
MAX(unnest_timeline.estimated_runnable_units) AS estimated_runnable_units
FROM `%s`.`region-${bigquery_region}`.INFORMATION_SCHEMA.JOBS AS job
CROSS JOIN UNNEST(job.job_stages) as unnest_job_stages
CROSS JOIN UNNEST(job.timeline) AS unnest_timeline
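-- Date window: the first day of the month 3 months ago, through today
-- e.g. if CURRENT_DATE() = 2022-11-15: DATE_SUB 3 MONTH gives 2022-08-15, then minus (15 - 1) days gives 2022-08-01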
WHERE DATE(job.creation_time) BETWEEN DATE_SUB(DATE_SUB(CURRENT_DATE(), INTERVAL 3 MONTH), INTERVAL (SELECT EXTRACT(DAY FROM CURRENT_DATE())-1) DAY) AND CURRENT_DATE()
GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34;
""",record.project_id);
INSERT INTO `${project_id}.ondemand_query_usage.usage_export_project` (project_id, result) VALUES (record.project_id,'SUCCESS');
EXCEPTION WHEN ERROR THEN
-- Record the failure (typically we do not have access to the project's INFORMATION_SCHEMA)
INSERT INTO `${project_id}.ondemand_query_usage.usage_export_project` (project_id, result) VALUES (record.project_id,'FAILED');
END;
END FOR;
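/* Optional sanity check once the loop completes (a minimal sketch using only the table created above):
SELECT project_id,
       COUNT(1) AS job_count,
       MIN(creation_time) AS earliest_job,
       MAX(creation_time) AS latest_job
  FROM `${project_id}.ondemand_query_usage.usage_export_data`
 GROUP BY project_id
 ORDER BY job_count DESC;
*/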
-- Export the data (some people transfer this elsewhere to have an analysis performed)
-- You should delete any existing files from this GCS path before exporting
-- You can also share the data via Analytics Hub (preferred method of sharing)
EXPORT DATA
OPTIONS (
uri = 'gs://${raw_bucket_name}/ondemand_query_usage/*.parquet',
format = 'PARQUET',
overwrite = true
)
AS (
SELECT *
FROM `${project_id}.ondemand_query_usage.usage_export_data`
);
-- Test loading the data
DROP TABLE IF EXISTS `${project_id}.ondemand_query_usage.usage_import_data`;
LOAD DATA OVERWRITE `${project_id}.ondemand_query_usage.usage_import_data`
FROM FILES (
format = 'PARQUET',
uris = ['gs://${raw_bucket_name}/ondemand_query_usage/*.parquet']
);
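/* Optional: verify the export/import round trip (a sketch); the row counts should match
SELECT (SELECT COUNT(1) FROM `${project_id}.ondemand_query_usage.usage_export_data`) AS exported_rows,
       (SELECT COUNT(1) FROM `${project_id}.ondemand_query_usage.usage_import_data`) AS imported_rows;
*/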
-- See which projects worked/failed
SELECT * FROM `${project_id}.ondemand_query_usage.usage_export_project` ORDER BY result DESC;
-- See some results
SELECT project_id, query, SUM(est_on_demand_cost) AS est_sum_on_demand_cost, COUNT(1) AS Cnt
FROM `${project_id}.ondemand_query_usage.usage_export_data`
GROUP BY 1, 2
ORDER BY 3 DESC
LIMIT 100;
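/* Variation (a sketch): the same rollup by user instead of by query text
SELECT user_email,
       SUM(est_on_demand_cost) AS est_sum_on_demand_cost,
       COUNT(1) AS Cnt
  FROM `${project_id}.ondemand_query_usage.usage_export_data`
 GROUP BY 1
 ORDER BY 2 DESC
 LIMIT 100;
*/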
------------------------------------------------------------------------------------------------------------
-- Do Analysis
------------------------------------------------------------------------------------------------------------
-- This is designed to help you understand where you should focus your efforts or costs
-- It is NOT an exact cost-savings calculator
CREATE SCHEMA `${project_id}.ondemand_query_analysis`
OPTIONS (
location = "us"
);
-- In case we have duplicates (the tables were not dropped between runs)
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_analysis.usage_data` AS
SELECT DISTINCT *
FROM `${project_id}.ondemand_query_usage.usage_import_data`
WHERE error_result_reason IS NULL
AND reservation_id IS NULL;
-- Cost by month for top 1000 most expensive queries
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_analysis.usage_cost_by_query_top_1000` AS
SELECT project_id,
EXTRACT(YEAR FROM start_time) AS Year,
EXTRACT(MONTH FROM start_time) AS Month,
query,
SUM(est_on_demand_cost) AS est_sum_on_demand_cost,
COUNT(1) AS Cnt,
AVG(total_bytes_processed) / 1000000000 AS average_gb_processed,
AVG(job_avg_slots) AS average_job_avg_slots,
       AVG(estimated_runnable_units) AS average_estimated_runnable_units
FROM `${project_id}.ondemand_query_analysis.usage_data`
WHERE error_result_reason IS NULL
AND reservation_id IS NULL
GROUP BY 1, 2, 3, 4
ORDER BY est_sum_on_demand_cost DESC
LIMIT 1000;
SELECT project_id,
Year,
Month,
query,
CAST(est_sum_on_demand_cost AS INT64) AS est_sum_on_demand_cost,
Cnt,
CAST(average_gb_processed AS INT64) AS average_gb_processed,
CAST(average_job_avg_slots AS INT64) AS average_job_avg_slots,
CAST(average_estimated_runnable_units AS INT64) AS average_estimated_runnable_units
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_query_top_1000`
ORDER BY est_sum_on_demand_cost DESC;
-- Usage Costs by Project
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_analysis.usage_cost_by_project` AS
SELECT project_id,
EXTRACT(YEAR FROM start_time) AS Year,
EXTRACT(MONTH FROM start_time) AS Month,
SUM(est_on_demand_cost) AS est_sum_on_demand_cost,
AVG(total_bytes_processed) / 1000000000 AS average_gb_processed,
AVG(job_avg_slots) AS average_job_avg_slots,
       AVG(estimated_runnable_units) AS average_estimated_runnable_units
FROM `${project_id}.ondemand_query_analysis.usage_data`
GROUP BY 1, 2, 3;
SELECT project_id,
Year,
Month,
CAST(est_sum_on_demand_cost AS INT64) AS est_sum_on_demand_cost,
CAST(average_gb_processed AS INT64) AS average_gb_processed,
CAST(average_job_avg_slots AS INT64) AS average_job_avg_slots,
CAST(average_estimated_runnable_units AS INT64) AS average_estimated_runnable_units
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project`
ORDER BY est_sum_on_demand_cost DESC;
-- Total cost for each month
-- This is based off of retail pricing (no discounts)
-- If you have a 10% discount, multiply by (1 - .10 = .90), e.g. SUM(est_sum_on_demand_cost * .90) as shown in the sketch after this query
SELECT Year,
Month,
CAST(SUM(est_sum_on_demand_cost) AS INT64) AS total
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project`
GROUP BY 1,2
ORDER BY Year, Month;
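/* Example: the same monthly total with the 10% discount described above applied (a sketch)
SELECT Year,
       Month,
       CAST(SUM(est_sum_on_demand_cost * .90) AS INT64) AS total_with_discount
  FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project`
 GROUP BY 1,2
 ORDER BY Year, Month;
*/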
-- For each minute, for each job, get the maximum number of slots
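-- Minute-of-month example: day 2 at 01:30 maps to ((2 - 1) * 1440) + (1 * 60) + 30 = minute 1,530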
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_analysis.usage_slots_by_job` AS
SELECT project_id,
job_id,
EXTRACT(YEAR FROM start_time) AS Year,
EXTRACT(MONTH FROM start_time) AS Month,
((EXTRACT(DAY FROM start_time) - 1) * (24*60)) +
(EXTRACT(HOUR FROM start_time) * 60) +
(EXTRACT(MINUTE FROM start_time)) AS start_minute_of_job,
((EXTRACT(DAY FROM end_time) - 1) * (24*60)) +
(EXTRACT(HOUR FROM end_time) * 60) +
(EXTRACT(MINUTE FROM end_time)) AS end_minute_of_job,
MAX(job_avg_slots) AS average_job_max_slots
FROM `${project_id}.ondemand_query_analysis.usage_data`
GROUP BY 1,2,3,4,5,6;
SELECT *
FROM `${project_id}.ondemand_query_analysis.usage_slots_by_job`
LIMIT 100;
-- For each minute in the month sum the max slots used by the jobs
-- For each minute determine the number of slots to buy (in increments of 100)
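-- Round up to the nearest 100: e.g. 250 slots -> FLOOR((250 + 99) / 100) * 100 = 300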
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_analysis.usage_slots_data_per_minute` AS
WITH minutes AS
(
  -- every minute in a 31-day month (44,640 minutes), 0-based to match the job minute calculation above
  SELECT element AS minute_number
  FROM UNNEST(GENERATE_ARRAY(0, 44639)) AS element
)
SELECT project_id,
Year,
Month,
minute_number,
CAST(SUM(average_job_max_slots) AS INT64) AS avg_slots,
CAST(FLOOR((CAST(SUM(average_job_max_slots) AS INT64) + 99) / 100) * 100 AS INT64) AS avg_slots_rounded_up_100_slots
FROM `${project_id}.ondemand_query_analysis.usage_slots_by_job` AS usage_slots_by_job
INNER JOIN minutes
ON minute_number BETWEEN start_minute_of_job AND end_minute_of_job
GROUP BY 1,2,3,4;
SELECT *
FROM `${project_id}.ondemand_query_analysis.usage_slots_data_per_minute`
ORDER BY project_id,
Year,
Month,
minute_number
LIMIT 10000;
-- For each minute determine the number of slots to buy (in increments of 100)
-- 0.060 = US PAYG Slot price per 100 per minute at Retail price
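-- e.g. a steady 300 slots for all 44,640 minutes of a month = 13,392,000 slot-minutes -> / 100 * 0.060 = about 8,035 USD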
CREATE OR REPLACE TABLE `${project_id}.ondemand_query_analysis.usage_slots_payg_slot_cost` AS
SELECT project_id,
Year,
Month,
CAST(0.060 * FLOOR((SUM(avg_slots_rounded_up_100_slots) + 99) / 100) AS INT64) AS payg_slot_cost
FROM `${project_id}.ondemand_query_analysis.usage_slots_data_per_minute` AS usage_slots_data_per_minute
GROUP BY 1,2,3;
-- Compare On-Demand versus Slots (HIGH LEVEL ESTIMATE FOR SEEING IF SLOTS SHOULD BE EVALUATED)
-- The above pricing is a ROUGH estimate, so we require est_sum_on_demand_cost > payg_slot_cost * 2 to account for any inefficiencies in auto-scaling
SELECT usage_cost_by_project.project_id,
usage_cost_by_project.Year,
usage_cost_by_project.Month,
CAST(AVG(CAST(usage_cost_by_project.est_sum_on_demand_cost AS INT64)) AS INT64) AS est_sum_on_demand_cost,
CAST(AVG(usage_slots_payg_slot_cost.payg_slot_cost) AS INT64) AS est_payg_slot_cost
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project` AS usage_cost_by_project
INNER JOIN `${project_id}.ondemand_query_analysis.usage_slots_payg_slot_cost` AS usage_slots_payg_slot_cost
ON usage_cost_by_project.project_id = usage_slots_payg_slot_cost.project_id
AND usage_cost_by_project.Year = usage_slots_payg_slot_cost.Year
AND usage_cost_by_project.Month = usage_slots_payg_slot_cost.Month
WHERE est_sum_on_demand_cost > payg_slot_cost * 2
GROUP BY 1,2,3
ORDER BY 4 DESC;
-- $2,000 per month is used as a high-level estimate of the cost of 100 slots
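-- e.g. a rough savings of 6,500 USD renders as '$$$' since CAST(6500 / 2000 AS INT) = 3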
SELECT usage_cost_by_project.project_id,
usage_cost_by_project.Year,
usage_cost_by_project.Month,
CAST(AVG(CAST(usage_cost_by_project.est_sum_on_demand_cost AS INT64)) AS INT64) -
CAST(AVG(usage_slots_payg_slot_cost.payg_slot_cost) AS INT64) AS rough_savings,
       REPEAT('$', CAST((CAST(AVG(CAST(usage_cost_by_project.est_sum_on_demand_cost AS INT64)) AS INT64) -
                         CAST(AVG(usage_slots_payg_slot_cost.payg_slot_cost) AS INT64))/2000 AS INT)) AS stars
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project` AS usage_cost_by_project
INNER JOIN `${project_id}.ondemand_query_analysis.usage_slots_payg_slot_cost` AS usage_slots_payg_slot_cost
ON usage_cost_by_project.project_id = usage_slots_payg_slot_cost.project_id
AND usage_cost_by_project.Year = usage_slots_payg_slot_cost.Year
AND usage_cost_by_project.Month = usage_slots_payg_slot_cost.Month
WHERE est_sum_on_demand_cost > payg_slot_cost * 2
GROUP BY 1,2,3
ORDER BY 4 DESC;
-- Create Looker Views
CREATE OR REPLACE VIEW `${project_id}.ondemand_query_analysis.looker_cost_per_month` AS
SELECT Year,
Month,
CAST(CONCAT(CAST(Year AS STRING),'-',CAST(Month AS STRING),'-01') AS DATE) AS SortDate,
CAST(SUM(est_sum_on_demand_cost) AS INT64) AS Total
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project`
GROUP BY 1,2;
CREATE OR REPLACE VIEW `${project_id}.ondemand_query_analysis.looker_cost_per_month_per_project` AS
SELECT project_id,
Year,
Month,
CAST(CONCAT(CAST(Year AS STRING),'-',CAST(Month AS STRING),'-01') AS DATE) AS SortDate,
CAST(SUM(est_sum_on_demand_cost) AS INT64) AS Total
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project`
GROUP BY 1,2,3;
CREATE OR REPLACE VIEW `${project_id}.ondemand_query_analysis.looker_most_expensive_queries` AS
SELECT project_id,
Year,
Month,
SUBSTRING(query, 1, 1000) AS query,
CAST(est_sum_on_demand_cost AS INT64) AS est_sum_on_demand_cost,
Cnt AS execution_count,
CAST(average_gb_processed AS INT64) AS average_gb_processed,
CAST(average_job_avg_slots AS INT64) AS average_job_avg_slots,
CAST(average_estimated_runnable_units AS INT64) AS average_estimated_runnable_units
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_query_top_1000`;
-- $2,000 per month is used as a high-level estimate of the cost of 100 slots
CREATE OR REPLACE VIEW `${project_id}.ondemand_query_analysis.looker_ondemand_vs_slots` AS
WITH data AS
(
SELECT usage_cost_by_project.project_id,
usage_cost_by_project.Year,
usage_cost_by_project.Month,
CAST(SUM(usage_cost_by_project.est_sum_on_demand_cost) AS INT64) AS est_sum_on_demand_cost,
CAST(AVG(CAST(usage_cost_by_project.est_sum_on_demand_cost AS INT64)) AS INT64) -
CAST(AVG(usage_slots_payg_slot_cost.payg_slot_cost) AS INT64) AS rough_savings,
REPEAT('$', CAST((CAST(AVG(CAST(usage_cost_by_project.est_sum_on_demand_cost AS INT64)) AS INT64) -
CAST(AVG(usage_slots_payg_slot_cost.payg_slot_cost) AS INT64))/2000 AS INT)) AS stars
FROM `${project_id}.ondemand_query_analysis.usage_cost_by_project` AS usage_cost_by_project
INNER JOIN `${project_id}.ondemand_query_analysis.usage_slots_payg_slot_cost` AS usage_slots_payg_slot_cost
ON usage_cost_by_project.project_id = usage_slots_payg_slot_cost.project_id
AND usage_cost_by_project.Year = usage_slots_payg_slot_cost.Year
AND usage_cost_by_project.Month = usage_slots_payg_slot_cost.Month
WHERE est_sum_on_demand_cost > payg_slot_cost * 2
GROUP BY 1,2,3
)
SELECT *
FROM data;
-- Show the Looker report:
/*
Clone this report: https://lookerstudio.google.com/reporting/32c72a9a-1172-44d3-8f92-3eebdb042a02
Click the 3 dots in the top right and select "Make a copy"
Click "Copy Report"
Click "Resouce" menu then "Manage added data sources"
Click "Edit" under Actions title
Click "${project_id}" (or enter the Project Id) under Project title
Click "${bigquery_taxi_dataset}" under Dataset title
Click "looker_most_expensive_queries" under Table title
Click "Reconnect"
Click "Apply" - there should be no field changes
Click "Done" - in top right
Repeat the above for each data source (3 additional ones)
Click "Close" - in top right
You can now see the data
*/
SELECT * FROM `${project_id}.ondemand_query_analysis.looker_ondemand_vs_slots` ORDER BY rough_savings DESC;