dataplex.tf (223 lines of code) (raw):

/**
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

# Service identity for the Dataplex API in the project. Its email is used by
# the IAM binding below.
resource "google_project_service_identity" "dataplex_sa" {
  provider = google-beta
  project  = module.project-services.project_id
  service  = "dataplex.googleapis.com"
}

# Grant the Dataplex service identity the Dataplex service agent role on the
# project. Note that this is a project-level binding, not a bucket-level one.
resource "google_project_iam_member" "dataplex_bucket_access" {
  project = module.project-services.project_id
  role    = "roles/dataplex.serviceAgent"
  member  = "serviceAccount:${google_project_service_identity.dataplex_sa.email}"
}

# Primary lake. Created only after the IAM binding above exists.
resource "google_dataplex_lake" "gcp_primary" {
  location     = var.region
  name         = "gcp-primary-lake"
  description  = "gcp primary lake"
  display_name = "gcp primary lake"

  labels = {
    gcp-lake = "exists"
  }

  project = module.project-services.project_id

  depends_on = [
    google_project_iam_member.dataplex_bucket_access
  ]
}

# Zone - raw, holds the image data assets.
resource "google_dataplex_zone" "gcp_primary_raw" {
  discovery_spec {
    enabled = true
  }

  lake     = google_dataplex_lake.gcp_primary.name
  location = var.region
  name     = "gcp-primary-raw"

  resource_spec {
    location_type = "SINGLE_REGION"
  }

  type         = "RAW"
  description  = "Zone for thelook_ecommerce image data"
  display_name = "images"
  labels       = {}
  project      = module.project-services.project_id
}

# Zone - curated, for staging the data.
resource "google_dataplex_zone" "gcp_primary_staging" {
  discovery_spec {
    enabled = true
  }

  lake     = google_dataplex_lake.gcp_primary.name
  location = var.region
  name     = "gcp-primary-staging"

  resource_spec {
    location_type = "SINGLE_REGION"
  }

  type         = "CURATED"
  description  = "Zone for thelook_ecommerce tabular data"
  display_name = "staging"
  labels       = {}
  project      = module.project-services.project_id
}

# Zone - curated, for BI.
resource "google_dataplex_zone" "gcp_primary_curated_bi" {
  discovery_spec {
    enabled = true
  }

  lake     = google_dataplex_lake.gcp_primary.name
  location = var.region
  name     = "gcp-primary-curated"

  resource_spec {
    location_type = "SINGLE_REGION"
  }

  type         = "CURATED"
  description  = "Zone for thelook_ecommerce tabular data"
  display_name = "business_intelligence"
  labels       = {}
  project      = module.project-services.project_id
}

# Assets are listed below. Assets need to wait for data to be copied to be
# created, so each one depends on time_sleep.wait_after_copy_data.

# Asset - TextOCR images bucket, attached to the raw zone.
resource "google_dataplex_asset" "gcp_primary_textocr" {
  name          = "gcp-primary-textocr"
  location      = var.region
  lake          = google_dataplex_lake.gcp_primary.name
  dataplex_zone = google_dataplex_zone.gcp_primary_raw.name

  discovery_spec {
    enabled = true
  }

  resource_spec {
    name             = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.textocr_images_bucket.name}"
    type             = "STORAGE_BUCKET"
    read_access_mode = "MANAGED"
  }

  project    = module.project-services.project_id
  depends_on = [time_sleep.wait_after_copy_data]
}

# Asset - GA4 obfuscated sample ecommerce bucket, attached to the raw zone.
resource "google_dataplex_asset" "gcp_primary_ga4_obfuscated_sample_ecommerce" {
  name          = "gcp-primary-ga4-obfuscated-sample-ecommerce"
  location      = var.region
  lake          = google_dataplex_lake.gcp_primary.name
  dataplex_zone = google_dataplex_zone.gcp_primary_raw.name

  discovery_spec {
    enabled = true
  }

  resource_spec {
    name             = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.ga4_images_bucket.name}"
    type             = "STORAGE_BUCKET"
    read_access_mode = "MANAGED"
  }

  project    = module.project-services.project_id
  depends_on = [time_sleep.wait_after_copy_data]
}

# Asset - tables bucket, attached to the staging zone.
resource "google_dataplex_asset" "gcp_primary_tables" {
  name          = "gcp-primary-tables"
  location      = var.region
  lake          = google_dataplex_lake.gcp_primary.name
  dataplex_zone = google_dataplex_zone.gcp_primary_staging.name

  discovery_spec {
    enabled = true
  }

  resource_spec {
    name             = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.tables_bucket.name}"
    type             = "STORAGE_BUCKET"
    read_access_mode = "MANAGED"
  }

  project    = module.project-services.project_id
  depends_on = [time_sleep.wait_after_copy_data]
}

# Add a wait for Dataplex Discovery.
# Discovery on this data generally takes 6-8 minutes.
resource "time_sleep" "wait_for_dataplex_discovery" {
  depends_on = [
    google_dataplex_asset.gcp_primary_tables,
    google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce,
    google_dataplex_asset.gcp_primary_textocr
  ]

  create_duration = "600s"
}

locals {
  # Staging zone name with hyphens replaced by underscores; used below as the
  # BigQuery dataset ID that the data quality scan reads from.
  # NOTE(review): presumably this matches the dataset Dataplex discovery
  # publishes for the zone - confirm.
  datascan_dataset = replace(google_dataplex_zone.gcp_primary_staging.name, "-", "_")
}

# On-demand data quality scan over the thelook_ecommerce_orders table. It is
# created only after the discovery wait above has elapsed.
resource "google_dataplex_datascan" "dq_scan" {
  project      = module.project-services.project_id
  location     = var.region
  data_scan_id = "thelook-ecommerce-orders"

  data {
    resource = "//bigquery.googleapis.com/projects/${module.project-services.project_id}/datasets/${local.datascan_dataset}/tables/thelook_ecommerce_orders"
  }

  execution_spec {
    trigger {
      on_demand {}
    }
  }

  data_quality_spec {
    # Completeness: order_id must never be null.
    rules {
      column      = "order_id"
      dimension   = "COMPLETENESS"
      name        = "non-null"
      description = "Sample rule for non-null column"
      threshold   = 1.0
      non_null_expectation {}
    }

    # Completeness: user_id must never be null.
    rules {
      column      = "user_id"
      dimension   = "COMPLETENESS"
      name        = "non-null"
      description = "Sample rule for non-null column"
      threshold   = 1.0
      non_null_expectation {}
    }

    # Completeness: created_at must never be null.
    rules {
      column      = "created_at"
      dimension   = "COMPLETENESS"
      name        = "non-null"
      description = "Sample rule for non-null column"
      threshold   = 1.0
      non_null_expectation {}
    }

    # Uniqueness: order_id values must be unique.
    rules {
      column      = "order_id"
      dimension   = "UNIQUENESS"
      name        = "unique"
      description = "Sample rule for unique values"
      uniqueness_expectation {}
    }

    # Validity: status must be one of the listed values; nulls are not ignored.
    rules {
      column      = "status"
      dimension   = "VALIDITY"
      name        = "one-of-set"
      description = "Sample rule for values in a set"
      ignore_null = false
      set_expectation {
        values = ["Shipped", "Complete", "Processing", "Cancelled", "Returned"]
      }
    }

    # Validity: num_of_item range check. Only an upper bound is configured;
    # no min_value is set.
    rules {
      column      = "num_of_item"
      dimension   = "VALIDITY"
      name        = "range-values"
      description = "Sample rule for values in a range"
      ignore_null = false
      threshold   = 0.99
      range_expectation {
        max_value          = 1
        strict_max_enabled = false
        strict_min_enabled = false
      }
    }

    # Validity: table-level condition, the table must contain at least one row.
    rules {
      dimension   = "VALIDITY"
      name        = "non-empty-table"
      description = "Sample rule for a non-empty table"
      table_condition_expectation {
        sql_expression = "COUNT(*) > 0"
      }
    }
  }

  depends_on = [time_sleep.wait_for_dataplex_discovery]
}