dataplex/data-quality/dataplex_data_quality_rideshare.yaml (169 lines of code) (raw):

#################################################################################### # Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #################################################################################### # Summary: Performs a validation check on the Taxi Trips table. # This highlights just a few features of data quality, see the links for all the features # # The results will then be placed on a Tag Template on the Table in Data Catalog. # This way users can see the results in the cloud console. # # References: # https://github.com/GoogleCloudPlatform/cloud-data-quality/tree/main/docs/examples # https://cloud.google.com/dataplex/docs/check-data-quality ############################################################ # The row filter is for 2022 (1 month) so the check does not run too long during a demo # You have to define a NONE row filter ############################################################ row_filters: NONE: filter_sql_expr: |- partition_date = '2022-11-01' ############################################################ # Rule dimensions are optional but allow you to aggregate reporting. ############################################################ rule_dimensions: - consistency - correctness - duplication - completeness - conformance - integrity ############################################################ # Rules can be shared across tables or columns. ############################################################ rules: VALUE_UUID: rule_type: CUSTOM_SQL_EXPR dimension: correctness params: custom_sql_expr: |- LENGTH($column) = 36 VALUE_ZERO_OR_POSITIVE: rule_type: CUSTOM_SQL_EXPR dimension: correctness params: custom_sql_expr: |- $column >= 0 VALID_NOT_NULL: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column is not null VALID_PAYMENT_TYPE: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column BETWEEN 1 AND 6 VALID_LOCATION_ID: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column BETWEEN 1 AND 265 VALID_RIDE_DISTANCE: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column BETWEEN 0 AND 100 VALID_PICKUP_DATE: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column < dropoff_datetime VALID_DROPOFF_DATE: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column > pickup_datetime VALUE_CREDIT_CARD: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column IS NULL OR STARTS_WITH($column, "3") OR STARTS_WITH($column, "4") OR STARTS_WITH($column, "5") OR STARTS_WITH($column, "6") VALID_CREDIT_CARD_DATE: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- CAST($column AS TIMESTAMP) > dropoff_datetime VALID_CREDIT_CARD_CVV: rule_type: CUSTOM_SQL_EXPR dimension: integrity params: custom_sql_expr: |- $column IS NULL OR LENGTH($column) = 3 ############################################################ # Rule bindings associate rules to {table, column} ############################################################ rule_bindings: RIDESHARE_TRIP_ID_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: rideshare_trip_id row_filter_id: NONE rule_ids: - VALUE_UUID PICKUP_LOCATION_ID_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: pickup_location_id row_filter_id: NONE rule_ids: - VALID_LOCATION_ID PICKUP_DATETIME_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: pickup_datetime row_filter_id: NONE rule_ids: - VALID_PICKUP_DATE DROPOFF_LOCATION_ID_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: dropoff_location_id row_filter_id: NONE rule_ids: - VALID_LOCATION_ID DROPOFF_DATETIME_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: dropoff_datetime row_filter_id: NONE rule_ids: - VALID_DROPOFF_DATE RIDE_DISTANCE_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: ride_distance row_filter_id: NONE rule_ids: - VALID_RIDE_DISTANCE IS_AIRPORT_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: is_airport row_filter_id: NONE rule_ids: - VALID_NOT_NULL PAYMENT_TYPE_ID_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: payment_type_id row_filter_id: NONE rule_ids: - VALID_PAYMENT_TYPE FARE_AMOUNT_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: fare_amount row_filter_id: NONE rule_ids: - VALUE_ZERO_OR_POSITIVE TIP_AMOUNT_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: tip_amount row_filter_id: NONE rule_ids: - VALUE_ZERO_OR_POSITIVE TAXES_AMOUNT_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: taxes_amount row_filter_id: NONE rule_ids: - VALUE_ZERO_OR_POSITIVE TOTAL_AMOUNT_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: total_amount row_filter_id: NONE rule_ids: - VALUE_ZERO_OR_POSITIVE CREDIT_CARD_NUMBER_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: credit_card_number row_filter_id: NONE rule_ids: - VALUE_CREDIT_CARD CREDIT_CARD_EXPIRE_DATE_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: credit_card_expire_date row_filter_id: NONE rule_ids: - VALID_CREDIT_CARD_DATE CREDIT_CARD_CVV_CODE_VALID: entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/rideshare-lakehouse-${random_extension}/zones/rideshare-curated-zone-${random_extension}/entities/bigquery_rideshare_trip column_id: credit_card_cvv_code row_filter_id: NONE rule_ids: - VALID_CREDIT_CARD_CVV