data-mesh-banking-labs/setup/resources/code/merchant-source-configs/dq_merchant_gcs_data.yaml (175 lines of code) (raw):
metadata_registry_defaults:
dataplex:
projects: _project_datagov_
locations: us-central1
lakes: consumer-banking--merchant--domain
zones: merchant-raw-zone
row_filters:
NONE:
filter_sql_expr: |-
True
INTERNATIONAL_ITEMS:
filter_sql_expr: |-
REGEXP_CONTAINS(item_id, 'INTNL')
LAST_WEEK:
filter_sql_expr: |-
date(last_modified_timestamp) >= DATE_SUB(current_date(), INTERVAL 1 WEEK)
rule_dimensions:
- consistency
- correctness
- duplication
- completeness
- conformance
- integrity
- timeliness
- accuracy
rules:
NOT_NULL:
rule_type: NOT_NULL
dimension: completeness
NOT_BLANK:
rule_type: NOT_BLANK
dimension: completeness
NO_DUPLICATES_IN_COLUMN_GROUPS:
rule_type: CUSTOM_SQL_STATEMENT
dimension: duplication
params:
custom_sql_arguments:
- column_names
custom_sql_statement: |-
select a.*
from data a
inner join (
select
$column_names
from data
group by $column_names
having count(*) > 1
) duplicates
using ($column_names)
VALUE_ZERO_OR_POSITIVE:
rule_type: CUSTOM_SQL_EXPR
dimension: correctness
params:
custom_sql_expr: |-
$column >= 0
VALUE_POSITIVE:
rule_type: CUSTOM_SQL_EXPR
dimension: correctness
params:
custom_sql_expr: |-
$column > 0
VALID_SSN:
rule_type: REGEX
dimension: conformance
params:
pattern: |-
^d{3}-?d{2}-?d{4}$
VALID_EMAIL:
rule_type: REGEX
dimension: conformance
params:
pattern: |-
^[^@]+[@]{1}[^@]+$
NO_DELAYED_INGESTION_DAY_LEVEL:
rule_type: CUSTOM_SQL_STATEMENT
dimension: timeliness
params:
custom_sql_arguments:
- ingestion_date_day
- elapsed_time_days
custom_sql_statement: |-
select * from
(select count(*) as n
from data a
where cast($ingestion_date_day as Date) >= date_sub(current_date(), interval $elapsed_time_days day)
)
where n = 0
NO_ACCURACY_ISSUES_SIMPLE:
rule_type: CUSTOM_SQL_STATEMENT
dimension: accuracy
params:
custom_sql_arguments:
- deviation_threshold_pct
custom_sql_statement: |-
select
*
from (
select
countif(gender in ('M', 'F')) as n
from data
)
where
ifnull(safe_divide(n,
(
select
count(*)
from data
)),
cast('inf' as float64)) < (1 - ($deviation_threshold_pct / 100))
NO_REFERENTIAL_INTEGRITY_VIOLATION_IN_OPERATOR:
rule_type: CUSTOM_SQL_EXPR
dimension: integrity
params:
custom_sql_arguments:
- ref_data_project
- ref_data_dataset
- ref_data_table_id
- ref_data_column_id
custom_sql_expr: |-
$column in (select $ref_data_column_id from `$ref_data_project.$ref_data_dataset.$ref_data_table_id`)
rule_bindings:
VALID_MERCHANT:
entity_uri: dataplex://entities/merchants_data
column_id: merchant_id
row_filter_id: NONE
rule_ids:
- NOT_NULL
- NOT_BLANK
- NO_DUPLICATES_IN_COLUMN_GROUPS:
column_names: "merchant_id"
NO_DELAYED_INGESTION_DAY_LEVEL:
entity_uri: dataplex://entities/merchants_data
column_id: merchant_id
row_filter_id: NONE
rule_ids:
- NO_DELAYED_INGESTION_DAY_LEVEL:
ingestion_date_day: dt
elapsed_time_days: 1
VALID_EMAIL:
entity_uri: dataplex://entities/merchants_data
column_id: email
row_filter_id: NONE
rule_ids:
- VALID_EMAIL
VALID_CITY:
entity_uri: dataplex://entities/merchants_data
column_id: city
row_filter_id: NONE
rule_ids:
- NOT_NULL
- NOT_BLANK
VALID_STATE:
entity_uri: dataplex://entities/merchants_data
column_id: state
row_filter_id: NONE
rule_ids:
- NOT_NULL
- NOT_BLANK
VALID_ZIP:
entity_uri: dataplex://entities/merchants_data
column_id: zip
row_filter_id: NONE
rule_ids:
- NOT_NULL
- NOT_BLANK
T4_REFERENTIAL_INTEGRITY_VIOLATION_IN_OPERATOR_SHOULD_FAIL:
entity_uri: dataplex://entities/merchants_data
column_id: mcc
row_filter_id: NONE
rule_ids:
- NO_REFERENTIAL_INTEGRITY_VIOLATION_IN_OPERATOR:
ref_data_project: _project_datasto_
ref_data_dataset: merchants_ref_data
ref_data_table_id: mcc_code
ref_data_column_id: mcc