-- scripts/optimization/viewable_queries_grouped_by_hash.sql
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * This script creates a table named viewable_queries_grouped_by_hash,
 * which contains queries run within the past 30 days across the 100
 * projects with the highest total bytes billed.
 * 30 days is the default timeframe; you can change this by setting the
 * num_days_to_scan variable to a different value.
 * Queries are grouped by their normalized query pattern, which ignores
 * comments, parameter values, UDFs, and literals in the query text.
 * This allows us to group queries that are logically the same but
 * have different literals.
*
* For example, the following queries would be grouped together:
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
*/
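/*
 * The grouping key is BigQuery's own normalized query hash, exposed in
 * INFORMATION_SCHEMA as query_info.query_hashes.normalized_literals.
 * A minimal sketch of inspecting it directly (assuming the us region and
 * a project whose job history you can read):
 *
 *   SELECT query_info.query_hashes.normalized_literals AS query_hash, query
 *   FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
 *   WHERE job_type = 'QUERY'
 *   LIMIT 5;
 */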
DECLARE num_days_to_scan INT64 DEFAULT 30;
DECLARE projects ARRAY<STRING> DEFAULT (
  SELECT ARRAY_AGG(project_id)
  FROM (
    SELECT project_id
    FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
    WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
    GROUP BY 1
    ORDER BY SUM(total_bytes_billed) DESC
    LIMIT 100
  )
);
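-- Note: JOBS_BY_ORGANIZATION is only visible with organization-level
-- permissions, so the statement above assumes the caller can read org-wide
-- job metadata. The LIMIT keeps the loop below to the 100 projects that
-- billed the most bytes over the lookback window.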
CREATE SCHEMA IF NOT EXISTS optimization_workshop;
CREATE OR REPLACE TABLE optimization_workshop.viewable_queries_grouped_by_hash
(
  Query_Hash STRING,
  Query_Raw_Sample STRING,
  Job_Origin STRING,
  Ref_Tables STRING,
  Days_Active INT64,
  Job_Count INT64,
  Avg_Job_Count_Active_Days INT64,
  Project_Id STRING,
  BQ_Region STRING,
  Reservation_Id STRING,
  Total_Gigabytes_Processed INT64,
  Total_Gigabytes_Processed_Per_Job INT64,
  Avg_Gigabytes_Processed INT64,
  Total_Slot_Hours INT64,
  Avg_Total_Slot_Hours_per_Active_Day INT64,
  Avg_Job_Duration_Seconds INT64,
  Any_Job_Ids ARRAY<STRING>,
  User_Emails STRING,
  Labels STRING
);
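-- JOBS_BY_PROJECT is scoped to a single project, so iterate over the
-- selected projects and build each INSERT statement with dynamic SQL.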
FOR p IN (
  SELECT project_id
  FROM UNNEST(projects) project_id
)
DO
BEGIN
EXECUTE IMMEDIATE FORMAT("""
INSERT INTO optimization_workshop.viewable_queries_grouped_by_hash
SELECT
  query_hash AS Query_Hash,
  ANY_VALUE(query_raw) AS Query_Raw_Sample,
  SPLIT(ANY_VALUE(job_ids)[OFFSET(0)], '_')[OFFSET(0)] AS Job_Origin,
  Ref_Tables AS Ref_Tables,
  COUNT(DISTINCT creation_dt) AS Days_Active,
  SUM(job_count) AS Job_Count,
  CAST(AVG(job_count) AS INT64) AS Avg_Job_Count_Active_Days,
  Project_Id AS Project_Id,
  'us' AS BQ_Region,
  Reservation_Id AS Reservation_Id,
  CAST(SUM(total_gigabytes_processed) AS INT64) AS Total_Gigabytes_Processed,
  CAST(SUM(total_gigabytes_processed) / SUM(job_count) AS INT64) AS Total_Gigabytes_Processed_Per_Job,
  CAST(AVG(total_gigabytes_processed) AS INT64) AS Avg_Gigabytes_Processed,
  CAST(SUM(total_slot_hours_per_day) AS INT64) AS Total_Slot_Hours,
  CAST(AVG(total_slot_hours_per_day) AS INT64) AS Avg_Total_Slot_Hours_per_Active_Day,
  CAST(AVG(avg_job_duration_seconds) AS INT64) AS Avg_Job_Duration_Seconds,
  ANY_VALUE(job_ids) AS Any_Job_Ids,
  STRING_AGG(DISTINCT user_emails_unnest) AS User_Emails,
  STRING_AGG(DISTINCT labels_concat) AS Labels
FROM (
  SELECT
    query_hash,
    ANY_VALUE(query_raw) AS query_raw,
    ref_tables,
    creation_dt,
    project_id,
    reservation_id,
    COUNT(*) AS job_count,
    ARRAY_AGG(job_id ORDER BY total_slot_ms DESC LIMIT 10) AS job_ids,
    SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours_per_day,
    SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
    AVG(job_duration_seconds) AS avg_job_duration_seconds,
    ARRAY_AGG(DISTINCT user_email) AS user_emails,
    STRING_AGG(DISTINCT labels_concat) AS labels_concat
  FROM (
    SELECT
      query_info.query_hashes.normalized_literals AS query_hash,
      query AS query_raw,
      DATE(jbp.creation_time) AS creation_dt,
      jbp.project_id AS project_id,
      jbp.reservation_id AS reservation_id,
      jbp.job_id AS job_id,
      jbp.total_bytes_processed AS total_bytes_processed,
      jbp.total_slot_ms AS total_slot_ms,
      jbp.total_slot_ms / TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, MILLISECOND) AS slots,
      TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, SECOND) AS job_duration_seconds,
      user_email,
      STRING_AGG(
        ref_tables.project_id || '.' ||
        IF(STARTS_WITH(ref_tables.dataset_id, '_'), 'TEMP', ref_tables.dataset_id) ||
        '.' || ref_tables.table_id
        ORDER BY
          ref_tables.project_id || '.' ||
          IF(STARTS_WITH(ref_tables.dataset_id, '_'), 'TEMP', ref_tables.dataset_id) ||
          '.' || ref_tables.table_id
      ) AS ref_tables,
      FORMAT("%%T", ARRAY_CONCAT_AGG(labels)) AS labels_concat
    FROM
      `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT AS jbp
    CROSS JOIN
      UNNEST(referenced_tables) ref_tables
    WHERE
      DATE(jbp.creation_time) >= CURRENT_DATE - %i
      AND jbp.end_time > jbp.start_time
      AND jbp.error_result IS NULL
      AND jbp.job_type = 'QUERY'
      AND jbp.statement_type != 'SCRIPT'
      AND ref_tables.table_id NOT LIKE '%%INFORMATION_SCHEMA%%'
    GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
  )
  GROUP BY 1, 3, 4, 5, 6
)
CROSS JOIN
  UNNEST(user_emails) AS user_emails_unnest
GROUP BY
  Query_Hash,
  Ref_Tables,
  Project_Id,
  BQ_Region,
  Reservation_Id;
""",
p.project_id, num_days_to_scan);
EXCEPTION WHEN ERROR THEN
  SELECT @@error.message;  -- Surface and ignore per-project errors (e.g., permission denied) so the loop continues.
END;
END FOR;
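/*
 * Example usage (a sketch): after the script completes, surface the most
 * expensive query patterns, e.g. ranked by slot consumption:
 *
 *   SELECT Query_Hash, Project_Id, Job_Count, Total_Slot_Hours,
 *     Total_Gigabytes_Processed, Query_Raw_Sample
 *   FROM optimization_workshop.viewable_queries_grouped_by_hash
 *   ORDER BY Total_Slot_Hours DESC
 *   LIMIT 20;
 */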