datascan/gcloud/spec-files/data-quality-spec.yaml (47 lines of code) (raw):

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DataQualityScan settings. Field descriptions are provided in
# https://cloud.google.com/dataplex/docs/reference/rest/v1/DataQualitySpec.
# A datascan can specify only one of data_profile_spec or data_quality_spec,
# never both.
data_quality_spec:
  # [Optional] Percentage of the dataset's records to select for the DataScan.
  sampling_percent: 100

  # [Optional] Filter applied to all rows in a single DataScan job. It must be
  # a valid SQL expression for a WHERE clause in BigQuery standard SQL syntax.
  # Example: col1 >= 0 AND col2 < 10
  row_filter: 'station_id > 1000'

  # [Optional] Configuration of the BigQuery export post-scan action that runs
  # upon job completion. Uncomment and fill in the placeholders to enable it.
  # post_scan_actions:
  #   bigquery_export:
  #     results_table: "//bigquery.googleapis.com/projects/{project_id}/datasets/{dataset_id}/tables/{export_table}"

  # DataQualityScan rule configurations. The supported rule types and rule
  # specifications are documented in
  # https://cloud.google.com/dataplex/docs/reference/rest/v1/DataQualityRule.
  #
  # Each list entry carries one rule-type key, one of:
  #   non_null_expectation, range_expectation, regex_expectation,
  #   set_expectation, uniqueness_expectation, statistic_range_expectation,
  #   row_condition_expectation, table_condition_expectation
  #
  # alongside these common fields:
  #   column      - [string] Required, except for row_condition_expectation
  #                 and table_condition_expectation. The unnested column the
  #                 rule is evaluated against.
  #   name        - [string] Optional. A mutable name for the rule. May contain
  #                 only letters (a-z, A-Z), numbers (0-9), or hyphens (-).
  #                 Maximum length is 63 characters. Must start with a letter
  #                 and end with a number or a letter.
  #   description - [string] Optional. Description of the rule. Maximum length
  #                 is 1,024 characters.
  #   threshold   - [number] Optional. Minimum ratio of
  #                 passing_rows / total_rows required to pass the rule, in
  #                 the range [0.0, 1.0]. 0 indicates the default value
  #                 (i.e. 1.0). Only valid for row-level rules.
  #   ignore_null - [boolean] Optional. Rows with null values automatically
  #                 fail a rule unless ignore_null is true, in which case such
  #                 rows are trivially considered passing. Only valid for
  #                 row-level rules.
  #   dimension   - [string] Required. The dimension the rule belongs to;
  #                 results are also aggregated at the dimension level.
  #                 Supported dimensions: "COMPLETENESS", "ACCURACY",
  #                 "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY".
  rules:
    - non_null_expectation: {}
      column: address
      threshold: 0.99
      dimension: VALIDITY

    - range_expectation:
        max_value: '10'
        min_value: '1'
        strict_max_enabled: false
        strict_min_enabled: true
      column: council_district
      ignore_null: true
      threshold: 0.9
      dimension: VALIDITY

    - regex_expectation:
        regex: '.*solar.*'
      column: power_type
      ignore_null: false
      dimension: VALIDITY

    - set_expectation:
        values:
          - sidewalk
          - parkland
      column: property_type
      ignore_null: false
      dimension: VALIDITY

    - uniqueness_expectation: {}
      column: address
      dimension: UNIQUENESS

    - statistic_range_expectation:
        max_value: '15'
        min_value: '5'
        statistic: MEAN
        strict_max_enabled: true
        strict_min_enabled: true
      column: number_of_docks
      dimension: VALIDITY

    - row_condition_expectation:
        sql_expression: 'footprint_length > 0 AND footprint_length <= 10'
      column: footprint_length
      dimension: VALIDITY

    - table_condition_expectation:
        sql_expression: 'COUNT(*) > 0'
      dimension: VALIDITY