sql-scripts/rideshare_lakehouse_curated/sp_demo_data_quality_table.sql (56 lines of code) (raw):

/*################################################################################## # Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ###################################################################################*/ /* Use Cases: - Gets the Aggreated Data Quality (Dataplex) results for a table - This will be called by Airflow DAG: sample-dataplex-run-data-quality Description: - Shows gathering the data which will be ingeted into the Data Catalog Tag Template - You could make this procedure more generic and pass in the table as a parameter (this is just a demo) - You could also deploy this to the dataplex_data_quality dataset (instead of taxi) Show: - References: - Clean up / Reset script: */ WITH LatestExecution AS ( SELECT MAX(execution_ts) AS latest_execution_ts FROM `${project_id}.dataplex_data_quality.data_quality_results` WHERE table_id = '${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_trip' ) , TableRows AS ( SELECT COUNT(*) AS record_count FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_trip` ) , ColumnsValidated AS ( SELECT COUNT(DISTINCT column_id) AS columns_validated FROM `${project_id}.dataplex_data_quality.data_quality_results` WHERE invocation_id = (SELECT DISTINCT invocation_id FROM `${project_id}.dataplex_data_quality.data_quality_results` WHERE execution_ts = (SELECT latest_execution_ts FROM LatestExecution)) ) , ColumnsTotal AS ( SELECT COUNT(column_name) AS columns_count FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.INFORMATION_SCHEMA.COLUMNS` WHERE table_name = 'bigquery_rideshare_trip' ) , TableData AS ( SELECT TableRows.record_count, LatestExecution.latest_execution_ts, ColumnsValidated.columns_validated, ColumnsTotal.columns_count, data_quality_results.* FROM `${project_id}.dataplex_data_quality.data_quality_results` AS data_quality_results CROSS JOIN LatestExecution CROSS JOIN TableRows CROSS JOIN ColumnsValidated CROSS JOIN ColumnsTotal WHERE invocation_id = (SELECT DISTINCT invocation_id FROM `${project_id}.dataplex_data_quality.data_quality_results` WHERE execution_ts = (SELECT latest_execution_ts FROM LatestExecution)) ) SELECT record_count, latest_execution_ts, columns_validated, columns_count, invocation_id, CASE WHEN SUM(IFNULL(rows_validated,0)) = 0 THEN CAST(1 AS NUMERIC) ELSE CAST(SUM(IFNULL(success_count,0)) / SUM(IFNULL(rows_validated,0)) AS NUMERIC) END AS success_percentage, -- There is a failed_count field but it does not sum to 100% with the success count. -- This is used in the demo so the UI does not show two numbers that do not add up to 100% -- We could add in the Null count to get a 100% total CAST(1 AS NUMERIC) - CASE WHEN SUM(IFNULL(rows_validated,0)) = 0 THEN CAST(1 AS NUMERIC) ELSE CAST(SUM(IFNULL(success_count,0)) / SUM(IFNULL(rows_validated,0)) AS NUMERIC) END AS failed_percentage, FROM TableData GROUP BY 1,2,3,4,5;