dataplex/data-quality/dataplex_data_quality_taxi.yaml (216 lines of code) (raw):
####################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
####################################################################################
# Summary: Performs a validation check on the Taxi Trips table.
# This highlights just a few features of data quality, see the links for all the features
#
# The results will then be placed on a Tag Template on the Table in Data Catalog.
# This way users can see the results in the cloud console.
#
# References:
# https://github.com/GoogleCloudPlatform/cloud-data-quality/tree/main/docs/examples
# https://cloud.google.com/dataplex/docs/check-data-quality
############################################################
# The row filter is for 2022 (1 month) so the check does not run too long during a demo
# You have to define a NONE row filter
############################################################
row_filters:
NONE:
filter_sql_expr: |-
PartitionDate = '2023-09-01'
YELLOW_TAXI:
filter_sql_expr: |-
TaxiCompany = 'Yellow' and PartitionDate = '2023-09-01'
GREEN_TAXI:
filter_sql_expr: |-
TaxiCompany = 'Green' and PartitionDate = '2023-09-01'
############################################################
# Rule dimensions are optional but allow you to aggregate reporting.
############################################################
rule_dimensions:
- consistency
- correctness
- duplication
- completeness
- conformance
- integrity
############################################################
# Rules can be shared across tables or columns.
############################################################
rules:
NO_DUPLICATES_IN_COLUMN_GROUPS:
rule_type: CUSTOM_SQL_STATEMENT
dimension: duplication
params:
custom_sql_arguments:
- column_names
custom_sql_statement: |-
select a.*
from data a
inner join (
select
$column_names
from data
group by $column_names
having count(*) > 1
) duplicates
using ($column_names)
VALUE_ZERO_OR_POSITIVE:
rule_type: CUSTOM_SQL_EXPR
dimension: correctness
params:
custom_sql_expr: |-
$column >= 0
VALID_VENDOR_ID:
rule_type: REGEX
dimension: integrity
params:
pattern: |-
[1-2]{1}
VALID_LOCATION_ID:
rule_type: REGEX
dimension: integrity
params:
pattern: |-
[0-9]{1,3}
VALID_TAXI_COMPANY:
rule_type: CUSTOM_SQL_EXPR
dimension: integrity
params:
custom_sql_expr: |-
$column in ('Green', 'Yellow')
VALID_NOT_NULL:
rule_type: CUSTOM_SQL_EXPR
dimension: integrity
params:
custom_sql_expr: |-
$column is not null
VALID_YES_OR_NO:
rule_type: CUSTOM_SQL_EXPR
dimension: integrity
params:
custom_sql_expr: |-
$column in ('Y', 'N')
VALID_RATE_CODE:
rule_type: CUSTOM_SQL_EXPR
dimension: integrity
params:
custom_sql_expr: |-
$column BETWEEN 1 AND 6
VALID_PAYMENT_TYPE:
rule_type: CUSTOM_SQL_EXPR
dimension: integrity
params:
custom_sql_expr: |-
$column BETWEEN 1 AND 5
VALID_TRIP_TYPE_ID:
rule_type: REGEX
dimension: integrity
params:
pattern: |-
[1-2]{1}
############################################################
# Rule bindings associate rules to {table, column}
############################################################
rule_bindings:
TAXI_COMPANY_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: TaxiCompany
row_filter_id: NONE
rule_ids:
- VALID_TAXI_COMPANY
TAXI_PICKUP_DATETIME_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Pickup_DateTime
row_filter_id: NONE
rule_ids:
- VALID_NOT_NULL
TAXI_DROPOFF_DATETIME_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Dropoff_DateTime
row_filter_id: NONE
rule_ids:
- VALID_NOT_NULL
TAXI_STOTE_AND_FORWARD_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Store_And_Forward
row_filter_id: YELLOW_TAXI
rule_ids:
- VALID_YES_OR_NO
TAXI_RATE_CODE_ID_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Rate_Code_Id
row_filter_id: YELLOW_TAXI
rule_ids:
- VALID_RATE_CODE
TAXI_VALID_VENDOR_ID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Vendor_Id
row_filter_id: NONE
rule_ids:
- VALID_VENDOR_ID
TAXI_LOCATION_PU_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: PULocationID
row_filter_id: NONE
rule_ids:
- VALID_LOCATION_ID
TAXI_LOCATION_DO_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: DOLocationID
row_filter_id: NONE
rule_ids:
- VALID_LOCATION_ID
TAXI_PASSENGER_COUNT_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Passenger_Count
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_TRIP_DISTANCE_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Trip_Distance
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_FARE_AMOUNT_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Fare_Amount
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_SURCHARGE_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Surcharge
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_MTA_TAX_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: MTA_Tax
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_TIP_AMOUNT_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Tip_Amount
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_TOLLS_AMOUNT_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Tolls_Amount
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_IMPROVEMENT_SURCHARGE_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Improvement_Surcharge
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_AMOUNT_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Total_Amount
row_filter_id: NONE
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_PAYMENT_TYPE_ID_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Payment_Type_Id
row_filter_id: YELLOW_TAXI
rule_ids:
- VALID_PAYMENT_TYPE
TAXI_CONGESTION_SURCHARGE_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Congestion_Surcharge
row_filter_id: YELLOW_TAXI
rule_ids:
- VALUE_ZERO_OR_POSITIVE
TAXI_TRIP_TYPE_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Trip_Type
row_filter_id: YELLOW_TAXI
rule_ids:
- VALID_TRIP_TYPE_ID
TAXI_EHAIL_FEE_VALID:
entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
column_id: Ehail_Fee
row_filter_id: GREEN_TAXI
rule_ids:
- VALUE_ZERO_OR_POSITIVE
# TAXI_TRIPS_UNIQUE:
# entity_uri: dataplex://projects/${project_id}/locations/${dataplex_region}/lakes/taxi-data-lake-${random_extension}/zones/taxi-curated-zone-${random_extension}/entities/taxi_trips
# column_id: Pickup_DateTime
# row_filter_id: NONE
# incremental_time_filter_column_id: Pickup_DateTime
# rule_ids:
# - NO_DUPLICATES_IN_COLUMN_GROUPS:
# column_names: "Pickup_DateTime"