modules/data_warehouse/bigquery.tf (304 lines of code) (raw):

/** * Copyright 2023 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ # Set up BigQuery resources # # Create the BigQuery dataset resource "google_bigquery_dataset" "ds_edw" { project = module.project-services.project_id dataset_id = "thelook" friendly_name = "My EDW Dataset" description = "My EDW Dataset with tables" location = var.region labels = var.labels delete_contents_on_destroy = var.force_destroy depends_on = [time_sleep.wait_after_apis] } # # Create a BigQuery connection for Cloud Storage to create BigLake tables resource "google_bigquery_connection" "ds_connection" { project = module.project-services.project_id connection_id = "ds_connection" location = var.region friendly_name = "Storage Bucket Connection" cloud_resource {} depends_on = [time_sleep.wait_after_apis] } # # Grant IAM access to the BigQuery Connection account for Cloud Storage resource "google_storage_bucket_iam_binding" "bq_connection_iam_object_viewer" { bucket = google_storage_bucket.raw_bucket.name role = "roles/storage.objectViewer" members = [ "serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}", ] } # # Create a BigQuery connection for Vertex AI to support GenerativeAI use cases resource "google_bigquery_connection" "vertex_ai_connection" { project = module.project-services.project_id connection_id = "genai_connection" location = var.region friendly_name = "BigQuery ML Connection" cloud_resource {} depends_on = [time_sleep.wait_after_apis] } # # Grant IAM access to the BigQuery Connection account for Vertex AI resource "google_project_iam_member" "bq_connection_iam_vertex_ai" { for_each = toset([ "roles/aiplatform.user", "roles/bigquery.connectionUser", "roles/serviceusage.serviceUsageConsumer", ] ) project = module.project-services.project_id role = each.key member = "serviceAccount:${google_bigquery_connection.vertex_ai_connection.cloud_resource[0].service_account_id}" } # Create data tables in BigQuery # # Create a Biglake table for events with metadata caching resource "google_bigquery_table" "tbl_edw_events" { dataset_id = google_bigquery_dataset.ds_edw.dataset_id table_id = "events" project = module.project-services.project_id deletion_protection = var.deletion_protection schema = file("${path.module}/src/schema/events_schema.json") external_data_configuration { autodetect = true connection_id = google_bigquery_connection.ds_connection.name source_format = "PARQUET" source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/events.parquet"] } labels = var.labels } # # Create a Biglake table for inventory_items resource "google_bigquery_table" "tbl_edw_inventory_items" { dataset_id = google_bigquery_dataset.ds_edw.dataset_id table_id = "inventory_items" project = module.project-services.project_id deletion_protection = var.deletion_protection schema = file("${path.module}/src/schema/inventory_items_schema.json") external_data_configuration { autodetect = true connection_id = google_bigquery_connection.ds_connection.name source_format = "PARQUET" source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/inventory_items.parquet"] } labels = var.labels } # # Create a Biglake table with metadata caching for order_items resource "google_bigquery_table" "tbl_edw_order_items" { dataset_id = google_bigquery_dataset.ds_edw.dataset_id table_id = "order_items" project = module.project-services.project_id deletion_protection = var.deletion_protection schema = file("${path.module}/src/schema/order_items_schema.json") external_data_configuration { autodetect = true connection_id = google_bigquery_connection.ds_connection.name source_format = "PARQUET" source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/order_items.parquet"] } labels = var.labels } # # Create a Biglake table for orders resource "google_bigquery_table" "tbl_edw_orders" { dataset_id = google_bigquery_dataset.ds_edw.dataset_id table_id = "orders" project = module.project-services.project_id deletion_protection = var.deletion_protection schema = file("${path.module}/src/schema/orders_schema.json") external_data_configuration { autodetect = true connection_id = google_bigquery_connection.ds_connection.name source_format = "PARQUET" source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/orders.parquet"] } labels = var.labels } # # Create a Biglake table for products resource "google_bigquery_table" "tbl_edw_products" { dataset_id = google_bigquery_dataset.ds_edw.dataset_id table_id = "products" project = module.project-services.project_id deletion_protection = var.deletion_protection schema = file("${path.module}/src/schema/products_schema.json") external_data_configuration { autodetect = true connection_id = google_bigquery_connection.ds_connection.name source_format = "PARQUET" source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/products.parquet"] } labels = var.labels } # # Create a Biglake table for products resource "google_bigquery_table" "tbl_edw_users" { dataset_id = google_bigquery_dataset.ds_edw.dataset_id table_id = "users" project = module.project-services.project_id deletion_protection = var.deletion_protection schema = file("${path.module}/src/schema/users_schema.json") external_data_configuration { autodetect = true connection_id = google_bigquery_connection.ds_connection.name source_format = "PARQUET" source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/users.parquet"] } labels = var.labels } # Load Queries for Stored Procedure Execution # # Load Distribution Center Lookup Data Tables resource "google_bigquery_routine" "sp_provision_lookup_tables" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_provision_lookup_tables" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id } ) } # # Add Looker Studio Data Report Procedure resource "google_bigquery_routine" "sproc_sp_demo_lookerstudio_report" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_lookerstudio_report" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id } ) depends_on = [ google_bigquery_table.tbl_edw_inventory_items, google_bigquery_table.tbl_edw_order_items, google_bigquery_routine.sp_provision_lookup_tables, ] } # # Add Sample Queries resource "google_bigquery_routine" "sp_sample_queries" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_sample_queries" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id } ) depends_on = [ google_bigquery_table.tbl_edw_inventory_items, google_bigquery_table.tbl_edw_order_items, ] } # # Add Bigquery ML Model for clustering resource "google_bigquery_routine" "sp_bigqueryml_model" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_bigqueryml_model" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id } ) depends_on = [ google_bigquery_table.tbl_edw_order_items, ] } # # Create Bigquery ML Model for using text generation resource "google_bigquery_routine" "sp_bigqueryml_generate_create" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_bigqueryml_generate_create" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_generate_create.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id, connection_id = google_bigquery_connection.vertex_ai_connection.id, model_name = var.text_generation_model_name, region = var.region } ) } # # Query Bigquery ML Model for describing customer clusters resource "google_bigquery_routine" "sp_bigqueryml_generate_describe" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_bigqueryml_generate_describe" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_generate_describe.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id, model_name = var.text_generation_model_name } ) depends_on = [ google_bigquery_routine.sp_bigqueryml_generate_create ] } # # Add Translation Scripts resource "google_bigquery_routine" "sp_sample_translation_queries" { project = module.project-services.project_id dataset_id = google_bigquery_dataset.ds_edw.dataset_id routine_id = "sp_sample_translation_queries" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id } ) depends_on = [ google_bigquery_table.tbl_edw_inventory_items, ] } # Add Scheduled Query # # Set up DTS permissions resource "google_project_service_identity" "bigquery_data_transfer_sa" { provider = google-beta project = module.project-services.project_id service = "bigquerydatatransfer.googleapis.com" depends_on = [time_sleep.wait_after_apis] } # # Grant the DTS service account access resource "google_project_iam_member" "dts_service_account_roles" { for_each = toset([ "roles/bigquerydatatransfer.serviceAgent", ]) project = module.project-services.project_id role = each.key member = "serviceAccount:${google_project_service_identity.bigquery_data_transfer_sa.email}" depends_on = [time_sleep.wait_after_apis] } # Create specific service account for DTS Run # # Set up the DTA service account resource "google_service_account" "dts" { project = module.project-services.project_id account_id = "cloud-dts-sa-${random_id.id.hex}" display_name = "Service Account for Data Transfer Service" } # # Grant the DTS Specific service account access resource "google_project_iam_member" "dts_roles" { for_each = toset([ "roles/bigquery.user", "roles/bigquery.dataEditor", ]) project = module.project-services.project_id role = each.key member = "serviceAccount:${google_service_account.dts.email}" } # # Grant the DTS specific service account Token Creator to the DTS Service Identity resource "google_service_account_iam_binding" "dts_token_creator" { service_account_id = google_service_account.dts.id role = "roles/iam.serviceAccountTokenCreator" members = [ "serviceAccount:${google_project_service_identity.bigquery_data_transfer_sa.email}" ] depends_on = [ google_project_iam_member.dts_service_account_roles, ] } # Set up scheduled query resource "google_bigquery_data_transfer_config" "dts_config" { display_name = "nightlyloadquery" project = module.project-services.project_id location = var.region data_source_id = "scheduled_query" schedule = "every day 00:00" params = { query = "CALL `${module.project-services.project_id}.${google_bigquery_dataset.ds_edw.dataset_id}.sp_bigqueryml_model`()" } service_account_name = google_service_account.dts.email depends_on = [ google_project_iam_member.dts_roles, google_bigquery_dataset.ds_edw, google_service_account_iam_binding.dts_token_creator, ] }