-- scripts/optimization/viewable_queries_grouped_by_hash.sql
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * This script creates a table named viewable_queries_grouped_by_hash,
 * which contains queries run within the past 30 days across the 100
 * projects with the highest total bytes billed.
 * 30 days is the default timeframe; you can change this by setting the
 * num_days_to_scan variable to a different value.
 * Queries are grouped by their normalized query pattern, which ignores
 * comments, parameter values, UDFs, and literals in the query text.
 * This allows us to group queries that are logically the same but
 * have different literals.
*
* For example, the following queries would be grouped together:
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
* SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
*/
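/*
 * The grouping key is BigQuery's own normalized query hash, exposed in
 * INFORMATION_SCHEMA as query_info.query_hashes.normalized_literals.
 * A minimal sketch of inspecting it directly (assuming the us region and
 * a project whose job history you can read):
 *
 *   SELECT query_info.query_hashes.normalized_literals AS query_hash, query
 *   FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
 *   WHERE job_type = 'QUERY'
 *   LIMIT 5;
 */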
DECLARE num_days_to_scan INT64 DEFAULT 30;
DECLARE projects ARRAY<STRING> DEFAULT (
  SELECT ARRAY_AGG(project_id)
  FROM (
    SELECT project_id
    FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
    WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
    GROUP BY 1
    ORDER BY SUM(total_bytes_billed) DESC
    LIMIT 100
  )
);
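-- Note: JOBS_BY_ORGANIZATION is only visible with organization-level
-- permissions, so the statement above assumes the caller can read org-wide
-- job metadata. The LIMIT keeps the loop below to the 100 projects that
-- billed the most bytes over the lookback window.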
CREATE SCHEMA IF NOT EXISTS optimization_workshop;
CREATE OR REPLACE TABLE optimization_workshop.viewable_queries_grouped_by_hash
(
  Query_Hash STRING,
  Query_Raw_Sample STRING,
  Job_Origin STRING,
  Ref_Tables STRING,
  Days_Active INT64,
  Job_Count INT64,
  Avg_Job_Count_Active_Days INT64,
  Project_Id STRING,
  BQ_Region STRING,
  Reservation_Id STRING,
  Total_Gigabytes_Processed INT64,
  Total_Gigabytes_Processed_Per_Job INT64,
  Avg_Gigabytes_Processed INT64,
  Total_Slot_Hours INT64,
  Avg_Total_Slot_Hours_per_Active_Day INT64,
  Avg_Job_Duration_Seconds INT64,
  Any_Job_Ids ARRAY<STRING>,
  User_Emails STRING,
  Labels STRING
);
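-- JOBS_BY_PROJECT is scoped to a single project, so iterate over the
-- selected projects and build each INSERT statement with dynamic SQL.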
FOR p IN (
  SELECT project_id
  FROM UNNEST(projects) project_id
)
DO
BEGIN
EXECUTE IMMEDIATE FORMAT("""
INSERT INTO optimization_workshop.viewable_queries_grouped_by_hash
SELECT
  query_hash AS Query_Hash,
  ANY_VALUE(query_raw) AS Query_Raw_Sample,
  SPLIT(ANY_VALUE(job_ids)[OFFSET(0)], '_')[OFFSET(0)] AS Job_Origin,
  Ref_Tables AS Ref_Tables,
  COUNT(DISTINCT creation_dt) AS Days_Active,
  SUM(job_count) AS Job_Count,
  CAST(AVG(job_count) AS INT64) AS Avg_Job_Count_Active_Days,
  Project_Id AS Project_Id,
  'us' AS BQ_Region,
  Reservation_Id AS Reservation_Id,
  CAST(SUM(total_gigabytes_processed) AS INT64) AS Total_Gigabytes_Processed,
  CAST(SUM(total_gigabytes_processed) / SUM(job_count) AS INT64) AS Total_Gigabytes_Processed_Per_Job,
  CAST(AVG(total_gigabytes_processed) AS INT64) AS Avg_Gigabytes_Processed,
  CAST(SUM(total_slot_hours_per_day) AS INT64) AS Total_Slot_Hours,
  CAST(AVG(total_slot_hours_per_day) AS INT64) AS Avg_Total_Slot_Hours_per_Active_Day,
  CAST(AVG(avg_job_duration_seconds) AS INT64) AS Avg_Job_Duration_Seconds,
  ANY_VALUE(job_ids) AS Any_Job_Ids,
  STRING_AGG(DISTINCT user_emails_unnest) AS User_Emails,
  STRING_AGG(DISTINCT labels_concat) AS Labels
FROM (
  SELECT
    query_hash,
    ANY_VALUE(query_raw) AS query_raw,
    ref_tables,
    creation_dt,
    project_id,
    reservation_id,
    COUNT(*) AS job_count,
    ARRAY_AGG(job_id ORDER BY total_slot_ms DESC LIMIT 10) AS job_ids,
    SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours_per_day,
    SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
    AVG(job_duration_seconds) AS avg_job_duration_seconds,
    ARRAY_AGG(DISTINCT user_email) AS user_emails,
    STRING_AGG(DISTINCT labels_concat) AS labels_concat
  FROM (
    SELECT
      query_info.query_hashes.normalized_literals AS query_hash,
      query AS query_raw,
      DATE(jbp.creation_time) AS creation_dt,
      jbp.project_id AS project_id,
      jbp.reservation_id AS reservation_id,
      jbp.job_id AS job_id,
      jbp.total_bytes_processed AS total_bytes_processed,
      jbp.total_slot_ms AS total_slot_ms,
      jbp.total_slot_ms / TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, MILLISECOND) AS slots,
      TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, SECOND) AS job_duration_seconds,
      user_email,
      STRING_AGG(
        ref_tables.project_id || '.' ||
        IF(STARTS_WITH(ref_tables.dataset_id, '_'), 'TEMP', ref_tables.dataset_id) ||
        '.' || ref_tables.table_id
        ORDER BY
          ref_tables.project_id || '.' ||
          IF(STARTS_WITH(ref_tables.dataset_id, '_'), 'TEMP', ref_tables.dataset_id) ||
          '.' || ref_tables.table_id
      ) AS ref_tables,
      FORMAT("%%T", ARRAY_CONCAT_AGG(labels)) AS labels_concat
    FROM
      `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT AS jbp
    CROSS JOIN
      UNNEST(referenced_tables) ref_tables
    WHERE
      DATE(jbp.creation_time) >= CURRENT_DATE - %i
      AND jbp.end_time > jbp.start_time
      AND jbp.error_result IS NULL
      AND jbp.job_type = 'QUERY'
      AND jbp.statement_type != 'SCRIPT'
      AND ref_tables.table_id NOT LIKE '%%INFORMATION_SCHEMA%%'
    GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
  )
  GROUP BY 1, 3, 4, 5, 6
)
CROSS JOIN
  UNNEST(user_emails) AS user_emails_unnest
GROUP BY
  Query_Hash,
  Ref_Tables,
  Project_Id,
  BQ_Region,
  Reservation_Id;
""",
p.project_id, num_days_to_scan);
EXCEPTION WHEN ERROR THEN
  SELECT @@error.message;  -- Surface and ignore per-project errors (e.g., permission denied) so the loop continues.
END;
END FOR;
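/*
 * Example usage (a sketch): after the script completes, surface the most
 * expensive query patterns, e.g. ranked by slot consumption:
 *
 *   SELECT Query_Hash, Project_Id, Job_Count, Total_Slot_Hours,
 *     Total_Gigabytes_Processed, Query_Raw_Sample
 *   FROM optimization_workshop.viewable_queries_grouped_by_hash
 *   ORDER BY Total_Slot_Hours DESC
 *   LIMIT 20;
 */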