# datascan/gcloud/spec-files/data-quality-spec.yaml (47 lines of code) (raw):
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DataQualityScan related settings. Variable descriptions are provided in https://cloud.google.com/dataplex/docs/reference/rest/v1/DataQualitySpec.
# A datascan can specify only one of data_profile_spec or data_quality_spec, but not both.
data_quality_spec:
  # [Optional] The percentage of the records to be selected from the dataset for DataScan.
  sampling_percent: 100
  # [Optional] A filter applied to all rows in a single DataScan job. The filter needs to be a
  # valid SQL expression for a WHERE clause in BigQuery standard SQL syntax.
  # Example: col1 >= 0 AND col2 < 10
  row_filter: "station_id > 1000"
  # [Optional] The configuration of BigQuery export post scan action upon job completion.
  # Left commented out here; to use it, uncomment the block and replace the {...} placeholders.
  # post_scan_actions:
  #   bigquery_export:
  #     results_table: "//bigquery.googleapis.com/projects/{project_id}/datasets/{dataset_id}/tables/{export_table}"
  # DataQualityScan rule configurations. Documentation for the supported rule types and rule
  # specifications can be found in
  # https://cloud.google.com/dataplex/docs/reference/rest/v1/DataQualityRule.
  rules:
    # Each list item is one rule made of the following fields:
    # - one of [non_null_expectation, range_expectation, regex_expectation, set_expectation,
    #   uniqueness_expectation, statistic_range_expectation, row_condition_expectation,
    #   table_condition_expectation]
    #   column - [string] Required, except for row_condition_expectation and
    #     table_condition_expectation. The unnested column which this rule is evaluated against.
    #   name - [string] Optional. A mutable name for the rule. The name must contain only letters
    #     (a-z, A-Z), numbers (0-9), or hyphens (-). The maximum length is 63 characters. Must
    #     start with a letter. Must end with a number or a letter.
    #   description - [string] Optional. Description of the rule. The maximum length is 1,024
    #     characters.
    #   threshold - [number] Optional. The minimum ratio of passing_rows / total_rows required to
    #     pass this rule, with a range of [0.0, 1.0]. 0 indicates default value (i.e. 1.0). This
    #     field is only valid for row-level type rules.
    #   ignore_null - [boolean] Optional. Rows with null values will automatically fail a rule,
    #     unless ignore_null is true. In that case, such null rows are trivially considered
    #     passing. This field is only valid for row-level type rules.
    #   dimension - [string] Required. The dimension a rule belongs to. Results are also
    #     aggregated at the dimension level. Supported dimensions are
    #     ["COMPLETENESS", "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY"]

    # Non-null check on `address`; at least 99% of rows must pass.
    - non_null_expectation: {}
      column: address
      threshold: 0.99
      dimension: VALIDITY
    # Range check on `council_district`; null rows count as passing and at least 90% of rows
    # must pass. Bounds are given as quoted strings.
    - range_expectation:
        max_value: '10'
        min_value: '1'
        strict_max_enabled: false
        strict_min_enabled: true
      column: council_district
      ignore_null: true
      threshold: 0.9
      dimension: VALIDITY
    # Regex check on `power_type`. Single quotes keep the pattern literal (no escape processing).
    - regex_expectation:
        regex: '.*solar.*'
      column: power_type
      ignore_null: false
      dimension: VALIDITY
    # Set-membership check on `property_type` against the listed values.
    - set_expectation:
        values:
          - sidewalk
          - parkland
      column: property_type
      ignore_null: false
      dimension: VALIDITY
    # Uniqueness check on `address`.
    - uniqueness_expectation: {}
      column: address
      dimension: UNIQUENESS
    # Checks the MEAN statistic of `number_of_docks` against the given bounds.
    - statistic_range_expectation:
        max_value: '15'
        min_value: '5'
        statistic: MEAN
        strict_max_enabled: true
        strict_min_enabled: true
      column: number_of_docks
      dimension: VALIDITY
    # Row-level SQL condition. Quoted so the comparison operators are never read as YAML syntax.
    - row_condition_expectation:
        sql_expression: "footprint_length > 0 AND footprint_length <= 10"
      column: footprint_length
      dimension: VALIDITY
    # Table-level SQL condition; no `column` is set for this rule.
    - table_condition_expectation:
        sql_expression: "COUNT(*) > 0"
      dimension: VALIDITY