# mmv1/products/dataplex/Datascan.yaml
# Copyright 2024 Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
name: 'Datascan'
api_resource_type_kind: DataScan
description: |
Represents a user-visible job which provides the insights for the related data source.
# User-provided labels cannot start with goog-
exclude_attribution_label: true
references:
guides:
'Official Documentation': 'https://cloud.google.com/dataplex/docs'
api: 'https://cloud.google.com/dataplex/docs/reference/rest'
docs:
base_url: 'projects/{{project}}/locations/{{location}}/dataScans'
self_link: 'projects/{{project}}/locations/{{location}}/dataScans/{{data_scan_id}}'
create_url: 'projects/{{project}}/locations/{{location}}/dataScans?dataScanId={{data_scan_id}}'
update_verb: 'PATCH'
update_mask: true
read_query_params: '?view=FULL'
import_format:
- 'projects/{{project}}/locations/{{location}}/dataScans/{{data_scan_id}}'
- '{{data_scan_id}}'
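# Usage note (a minimal sketch, assuming the generated Terraform resource type is
# google_dataplex_datascan; project, location, and scan IDs are hypothetical). An
# existing scan can be imported with either format above:
#   terraform import google_dataplex_datascan.example projects/my-project/locations/us-central1/dataScans/my-datascan
#   terraform import google_dataplex_datascan.example my-datascan   # relies on provider defaults for project/location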
timeouts:
insert_minutes: 5
update_minutes: 5
delete_minutes: 5
autogen_async: true
async:
actions: ['create', 'delete', 'update']
type: 'OpAsync'
operation:
base_url: '{{op_id}}'
timeouts:
insert_minutes: 5
update_minutes: 5
delete_minutes: 5
result:
resource_inside_response: false
iam_policy:
method_name_separator: ':'
parent_resource_attribute: 'data_scan_id'
example_config_body: 'templates/terraform/iam/iam_attributes.go.tmpl'
import_format:
- 'projects/{{project}}/locations/{{location}}/dataScans/{{data_scan_id}}'
- '{{data_scan_id}}'
custom_code:
examples:
- name: 'dataplex_datascan_basic_profile'
primary_resource_id: 'basic_profile'
primary_resource_name: 'fmt.Sprintf("tf-test-dataprofile-basic%s", context["random_suffix"])'
vars:
datascan_name: 'dataprofile-basic'
test_env_vars:
project_name: 'PROJECT_NAME'
- name: 'dataplex_datascan_full_profile'
primary_resource_id: 'full_profile'
vars:
dataset_name: 'dataplex_dataset'
datascan_name: 'dataprofile-full'
test_env_vars:
project_name: 'PROJECT_NAME'
- name: 'dataplex_datascan_basic_quality'
primary_resource_id: 'basic_quality'
vars:
datascan_name: 'dataquality-basic'
test_env_vars:
project_name: 'PROJECT_NAME'
- name: 'dataplex_datascan_full_quality'
primary_resource_id: 'full_quality'
vars:
datascan_name: 'dataquality-full'
test_env_vars:
project_name: 'PROJECT_NAME'
parameters:
- name: 'location'
type: String
description: |
The location where the data scan should reside.
url_param_only: true
required: true
immutable: true
- name: 'dataScanId'
type: String
description: |
DataScan identifier. Must contain only lowercase letters, numbers and hyphens. Must start with a letter. Must end with a number or a letter.
url_param_only: true
required: true
immutable: true
properties:
- name: 'name'
type: String
description: |
The relative resource name of the scan, of the form: projects/{project}/locations/{locationId}/dataScans/{datascan_id}, where project refers to a project_id or project_number and locationId refers to a GCP region.
output: true
- name: 'uid'
type: String
description: |
System generated globally unique ID for the scan. This ID will be different if the scan is deleted and re-created with the same name.
output: true
- name: 'description'
type: String
description: |
Description of the scan.
- name: 'displayName'
type: String
description: |
User friendly display name.
- name: 'labels'
type: KeyValueLabels
description: |
User-defined labels for the scan. A list of key->value pairs.
- name: 'state'
type: Enum
description: |
Current state of the DataScan.
output: true
enum_values:
- 'STATE_UNSPECIFIED'
- 'ACTIVE'
- 'CREATING'
- 'DELETING'
- 'ACTION_REQUIRED'
- name: 'createTime'
type: String
description: |
The time when the scan was created.
output: true
- name: 'updateTime'
type: String
description: |
The time when the scan was last updated.
output: true
- name: 'data'
type: NestedObject
description: |
The data source for DataScan.
required: true
immutable: true
properties:
- name: 'entity'
type: String
description: |
The Dataplex entity that represents the data source (e.g. BigQuery table) for DataScan.
immutable: true
exactly_one_of:
- 'data.0.entity'
- 'data.0.resource'
- name: 'resource'
type: String
description: |
The service-qualified full resource name of the cloud resource for a DataScan job to scan against. The field could be:
a Cloud Storage bucket for DataDiscoveryScan, or a BigQuery table of type "TABLE" for DataProfileScan/DataQualityScan.
immutable: true
exactly_one_of:
- 'data.0.entity'
- 'data.0.resource'
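# Illustrative sketch of the mutually exclusive data source fields as they would
# appear in the generated Terraform resource (hypothetical values; the Dataplex
# entity name format shown is an assumption, verify against the Dataplex API):
#   data {
#     # Option A: service-qualified resource name, e.g. a BigQuery table
#     resource = "//bigquery.googleapis.com/projects/my-project/datasets/my_dataset/tables/my_table"
#   }
#   # -- or --
#   data {
#     # Option B: a Dataplex entity
#     entity = "projects/my-project/locations/us-central1/lakes/my-lake/zones/my-zone/entities/my_table"
#   }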
- name: 'executionSpec'
type: NestedObject
description: |
DataScan execution settings.
required: true
properties:
- name: 'trigger'
type: NestedObject
description: |
Spec related to how often and when a scan should be triggered.
required: true
properties:
- name: 'onDemand'
type: NestedObject
description: |
The scan runs once via the dataScans.run API.
send_empty_value: true
allow_empty_object: true
exactly_one_of:
- 'execution_spec.0.trigger.0.on_demand'
- 'execution_spec.0.trigger.0.schedule'
properties:
[]
- name: 'schedule'
type: NestedObject
description: |
The scan is scheduled to run periodically.
exactly_one_of:
- 'execution_spec.0.trigger.0.on_demand'
- 'execution_spec.0.trigger.0.schedule'
properties:
- name: 'cron'
type: String
description:
Cron schedule for running scans periodically. This field is
required for scheduled scans.
required: true
- name: 'field'
type: String
description: |
The unnested field (of type Date or Timestamp) that contains values which monotonically increase over time. If not specified, a data scan will run for all data in the table.
immutable: true
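# Illustrative sketch of execution_spec in the generated Terraform resource
# (hypothetical values; field names follow the snake_case form of the properties
# above, and the TZ= timezone prefix on the cron expression is an assumption):
#   execution_spec {
#     field = "modified_date"            # optional incremental field
#     trigger {
#       schedule {
#         cron = "TZ=America/New_York 1 1 * * *"
#       }
#       # or, for manually triggered scans (mutually exclusive with schedule):
#       # on_demand {}
#     }
#   }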
- name: 'executionStatus'
type: NestedObject
description: |
Status of the data scan execution.
output: true
properties:
- name: 'latestJobEndTime'
type: String
description: |
The time when the latest DataScanJob ended.
output: true
- name: 'latestJobStartTime'
type: String
description: |
The time when the latest DataScanJob started.
output: true
- name: 'type'
type: Enum
description: |
The type of DataScan.
output: true
enum_values:
- 'DATA_SCAN_TYPE_UNSPECIFIED'
- 'DATA_QUALITY'
- 'DATA_PROFILE'
- name: 'dataQualitySpec'
type: NestedObject
description: |
DataQualityScan related setting.
exactly_one_of:
- 'data_quality_spec'
- 'data_profile_spec'
properties:
- name: 'samplingPercent'
type: Double
description: |
The percentage of the records to be selected from the dataset for DataScan.
Value can range between 0.0 and 100.0 with up to 3 significant decimal digits.
Sampling is not applied if `sampling_percent` is not specified or is set to 0 or 100.
- name: 'rowFilter'
type: String
description: |
A filter applied to all rows in a single DataScan job. The filter needs to be a valid SQL expression for a WHERE clause in BigQuery standard SQL syntax. Example: col1 >= 0 AND col2 < 10
- name: 'postScanActions'
type: NestedObject
description: |
Actions to take upon job completion.
properties:
- name: 'bigqueryExport'
type: NestedObject
description: |
If set, results will be exported to the provided BigQuery table.
properties:
- name: 'resultsTable'
type: String
description: |
The BigQuery table to export DataQualityScan results to.
Format: //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
- name: 'notificationReport'
type: NestedObject
description: |
The configuration of notification report post scan action.
properties:
- name: 'recipients'
type: NestedObject
description: |
The individuals or groups designated to receive notifications when one of the triggers fires.
required: true
properties:
- name: 'emails'
type: Array
description: |
The email recipients who will receive the DataQualityScan results report.
item_type:
type: String
- name: 'scoreThresholdTrigger'
type: NestedObject
description: |
This trigger fires when the DQ score in the job result is less than the specified score threshold.
properties:
- name: 'scoreThreshold'
type: Double
description: |
The score range is in [0,100].
- name: 'jobFailureTrigger'
type: NestedObject
description: |
This trigger fires when the scan job itself fails, regardless of the result.
send_empty_value: true
allow_empty_object: true
properties:
[]
- name: 'jobEndTrigger'
type: NestedObject
description: |
This trigger fires whenever a scan job run ends, regardless of the result.
send_empty_value: true
allow_empty_object: true
properties:
[]
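# Illustrative sketch of data_quality_spec.post_scan_actions (hypothetical table
# and recipients; the results table uses the service-qualified format described
# above for resultsTable):
#   post_scan_actions {
#     bigquery_export {
#       results_table = "//bigquery.googleapis.com/projects/my-project/datasets/my_dataset/tables/dq_results"
#     }
#     notification_report {
#       recipients {
#         emails = ["data-team@example.com"]
#       }
#       score_threshold_trigger {
#         score_threshold = 85
#       }
#       job_failure_trigger {}
#     }
#   }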
- name: 'rules'
type: Array
description: |
The list of rules to evaluate against a data source. At least one rule is required.
item_type:
type: NestedObject
properties:
- name: 'column'
type: String
description: |
The unnested column which this rule is evaluated against.
- name: 'ignoreNull'
type: Boolean
description: |
Rows with null values will automatically fail a rule, unless ignoreNull is true. In that case, such null rows are trivially considered passing. Only applicable to ColumnMap rules.
- name: 'dimension'
type: String
description: |
The dimension a rule belongs to. Results are also aggregated at the dimension level. Supported dimensions are ["COMPLETENESS", "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY"]
required: true
- name: 'threshold'
type: Double
description: |
The minimum ratio of passing_rows / total_rows required to pass this rule, with a range of [0.0, 1.0]. 0 indicates default value (i.e. 1.0).
- name: 'name'
type: String
description: |
A mutable name for the rule.
The name must contain only letters (a-z, A-Z), numbers (0-9), or hyphens (-).
The maximum length is 63 characters.
Must start with a letter.
Must end with a number or a letter.
- name: 'description'
type: String
description: |
Description of the rule.
The maximum length is 1,024 characters.
- name: 'rangeExpectation'
type: NestedObject
description: |
ColumnMap rule which evaluates whether each column value lies between a specified range.
properties:
- name: 'minValue'
type: String
description: |
The minimum column value allowed for a row to pass this validation. At least one of minValue and maxValue need to be provided.
- name: 'maxValue'
type: String
description: |
The maximum column value allowed for a row to pass this validation. At least one of minValue and maxValue need to be provided.
- name: 'strictMinEnabled'
type: Boolean
description: |
Whether each value needs to be strictly greater than ('>') the minimum, or if equality is allowed.
Only relevant if a minValue has been defined. Default = false.
default_value: false
- name: 'strictMaxEnabled'
type: Boolean
description: |
Whether each value needs to be strictly less than ('<') the maximum, or if equality is allowed.
Only relevant if a maxValue has been defined. Default = false.
default_value: false
- name: 'nonNullExpectation'
type: NestedObject
description: |
ColumnMap rule which evaluates whether each column value is null.
send_empty_value: true
allow_empty_object: true
properties:
[]
- name: 'setExpectation'
type: NestedObject
description: |
ColumnMap rule which evaluates whether each column value is contained by a specified set.
properties:
- name: 'values'
type: Array
description: |
Expected values for the column value.
required: true
item_type:
type: String
- name: 'regexExpectation'
type: NestedObject
description: |
ColumnMap rule which evaluates whether each column value matches a specified regex.
properties:
- name: 'regex'
type: String
description: |
A regular expression the column value is expected to match.
required: true
- name: 'uniquenessExpectation'
type: NestedObject
description: |
Row-level rule which evaluates whether each column value is unique.
send_empty_value: true
allow_empty_object: true
properties:
[]
- name: 'statisticRangeExpectation'
type: NestedObject
description: |
ColumnAggregate rule which evaluates whether the column aggregate statistic lies between a specified range.
properties:
- name: 'statistic'
type: Enum
description: |
The column statistic to evaluate.
required: true
enum_values:
- 'STATISTIC_UNDEFINED'
- 'MEAN'
- 'MIN'
- 'MAX'
- name: 'minValue'
type: String
description: |
The minimum column statistic value allowed for a row to pass this validation.
At least one of minValue and maxValue need to be provided.
- name: 'maxValue'
type: String
description: |
The maximum column statistic value allowed for a row to pass this validation.
At least one of minValue and maxValue need to be provided.
- name: 'strictMinEnabled'
type: Boolean
description: |
Whether the column statistic needs to be strictly greater than ('>') the minimum, or if equality is allowed.
Only relevant if a minValue has been defined. Default = false.
default_value: false
- name: 'strictMaxEnabled'
type: Boolean
description: |
Whether the column statistic needs to be strictly less than ('<') the maximum, or if equality is allowed.
Only relevant if a maxValue has been defined. Default = false.
default_value: false
- name: 'rowConditionExpectation'
type: NestedObject
description: |
Table rule which evaluates whether each row passes the specified condition.
properties:
- name: 'sqlExpression'
type: String
description: |
The SQL expression.
required: true
- name: 'tableConditionExpectation'
type: NestedObject
description: |
Table rule which evaluates whether the provided expression is true.
properties:
- name: 'sqlExpression'
type: String
description: |
The SQL expression.
required: true
- name: 'sqlAssertion'
type: NestedObject
description: |
Table rule which evaluates whether any row matches an invalid state. The rule fails if the SQL statement returns any rows.
properties:
- name: 'sqlStatement'
type: String
description: |
The SQL statement.
required: true
min_size: 1
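# Illustrative sketch of a few data quality rules in the generated Terraform
# resource (column names, thresholds, and SQL are hypothetical; field names follow
# the snake_case form of the properties above):
#   rules {
#     column    = "order_amount"
#     dimension = "VALIDITY"
#     threshold = 0.95
#     range_expectation {
#       min_value = "0"
#       max_value = "10000"
#     }
#   }
#   rules {
#     column    = "customer_id"
#     dimension = "COMPLETENESS"
#     non_null_expectation {}
#   }
#   rules {
#     dimension = "VALIDITY"
#     table_condition_expectation {
#       sql_expression = "COUNT(*) > 0"
#     }
#   }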
- name: 'dataProfileSpec'
type: NestedObject
description: |
DataProfileScan related setting.
send_empty_value: true
allow_empty_object: true
exactly_one_of:
- 'data_quality_spec'
- 'data_profile_spec'
properties:
- name: 'samplingPercent'
type: Double
description: |
The percentage of the records to be selected from the dataset for DataScan.
Value can range between 0.0 and 100.0 with up to 3 significant decimal digits.
Sampling is not applied if `sampling_percent` is not specified or is set to 0 or 100.
- name: 'rowFilter'
type: String
description: |
A filter applied to all rows in a single DataScan job. The filter needs to be a valid SQL expression for a WHERE clause in BigQuery standard SQL syntax. Example: col1 >= 0 AND col2 < 10
- name: 'postScanActions'
type: NestedObject
description: |
Actions to take upon job completion.
properties:
- name: 'bigqueryExport'
type: NestedObject
description: |
If set, results will be exported to the provided BigQuery table.
properties:
- name: 'resultsTable'
type: String
description: |
The BigQuery table to export DataProfileScan results to.
Format: //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
- name: 'includeFields'
type: NestedObject
description: |
The fields to include in data profile.
If not specified, all fields at the time of profile scan job execution are included, except for ones listed in `exclude_fields`.
properties:
- name: 'fieldNames'
type: Array
description: |
Expected input is a list of fully qualified names of fields as in the schema.
Only top-level field names for nested fields are supported.
For instance, if 'x' is of nested field type, listing 'x' is supported but 'x.y.z' is not supported. Here 'y' and 'y.z' are nested fields of 'x'.
item_type:
type: String
- name: 'excludeFields'
type: NestedObject
description: |
The fields to exclude from data profile.
If specified, the fields will be excluded from data profile, regardless of `include_fields` value.
properties:
- name: 'fieldNames'
type: Array
description: |
Expected input is a list of fully qualified names of fields as in the schema.
Only top-level field names for nested fields are supported.
For instance, if 'x' is of nested field type, listing 'x' is supported but 'x.y.z' is not supported. Here 'y' and 'y.z' are nested fields of 'x'.
item_type:
type: String
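# Illustrative sketch of data_profile_spec in the generated Terraform resource
# (hypothetical column names and filter; include_fields/exclude_fields take
# top-level column names only, as described above):
#   data_profile_spec {
#     sampling_percent = 80
#     row_filter       = "station_id > 1000"
#     include_fields {
#       field_names = ["address", "city"]
#     }
#     exclude_fields {
#       field_names = ["payment_details"]
#     }
#   }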