infrastructure/terraform/modules/feature-store/bigquery-datasets.tf

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This resource creates a BigQuery dataset called `feature_store`.
resource "google_bigquery_dataset" "feature_store" {
  dataset_id    = local.config_bigquery.dataset.feature_store.name
  friendly_name = local.config_bigquery.dataset.feature_store.friendly_name
  project       = local.feature_store_project_id
  description   = local.config_bigquery.dataset.feature_store.description
  location      = local.config_bigquery.dataset.feature_store.location
  # Maximum number of hours that data in the dataset can be accessed using time
  # travel queries, taken from the config.yaml section
  # bigquery.dataset.feature_store.max_time_travel_hours.
  max_time_travel_hours = local.config_bigquery.dataset.feature_store.max_time_travel_hours
  # Set to false so the contents of the dataset are preserved when the dataset
  # resource is destroyed.
  delete_contents_on_destroy = false

  labels = {
    version = "prod"
  }

  # With ignore_changes = all, Terraform ignores any post-creation changes to
  # the dataset and will not attempt to update it.
  lifecycle {
    ignore_changes = all
  }
}
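# For reference, the dataset attributes above are read from config.yaml. A
# minimal sketch of the expected shape is shown below; the key names follow
# the references in this file, while the values are illustrative assumptions:
#
#   bigquery:
#     dataset:
#       feature_store:
#         name: "feature_store"
#         friendly_name: "Feature Store"
#         description: "Feature store dataset for the ML pipelines"
#         location: "US"
#         max_time_travel_hours: 168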
# This resource creates a BigQuery dataset called `purchase_propensity`.
resource "google_bigquery_dataset" "purchase_propensity" {
  dataset_id    = local.config_bigquery.dataset.purchase_propensity.name
  friendly_name = local.config_bigquery.dataset.purchase_propensity.friendly_name
  project       = local.purchase_propensity_project_id
  description   = local.config_bigquery.dataset.purchase_propensity.description
  location      = local.config_bigquery.dataset.purchase_propensity.location
  # Maximum number of hours that data in the dataset can be accessed using time
  # travel queries, taken from the config.yaml section
  # bigquery.dataset.purchase_propensity.max_time_travel_hours.
  max_time_travel_hours = local.config_bigquery.dataset.purchase_propensity.max_time_travel_hours
  # Set to false so the contents of the dataset are preserved when the dataset
  # resource is destroyed.
  delete_contents_on_destroy = false

  labels = {
    version = "prod"
  }

  # With ignore_changes = all, Terraform ignores any post-creation changes to
  # the dataset and will not attempt to update it.
  lifecycle {
    ignore_changes = all
  }
}

# This resource creates a BigQuery dataset called `churn_propensity`.
resource "google_bigquery_dataset" "churn_propensity" {
  dataset_id    = local.config_bigquery.dataset.churn_propensity.name
  friendly_name = local.config_bigquery.dataset.churn_propensity.friendly_name
  project       = local.churn_propensity_project_id
  description   = local.config_bigquery.dataset.churn_propensity.description
  location      = local.config_bigquery.dataset.churn_propensity.location
  # Maximum number of hours that data in the dataset can be accessed using time
  # travel queries, taken from the config.yaml section
  # bigquery.dataset.churn_propensity.max_time_travel_hours.
  max_time_travel_hours = local.config_bigquery.dataset.churn_propensity.max_time_travel_hours
  # Set to false so the contents of the dataset are preserved when the dataset
  # resource is destroyed.
  delete_contents_on_destroy = false

  labels = {
    version = "prod"
  }

  # With ignore_changes = all, Terraform ignores any post-creation changes to
  # the dataset and will not attempt to update it.
  lifecycle {
    ignore_changes = all
  }
}

# This resource creates a BigQuery dataset called `lead_score_propensity`.
resource "google_bigquery_dataset" "lead_score_propensity" {
  dataset_id    = local.config_bigquery.dataset.lead_score_propensity.name
  friendly_name = local.config_bigquery.dataset.lead_score_propensity.friendly_name
  project       = local.lead_score_propensity_project_id
  description   = local.config_bigquery.dataset.lead_score_propensity.description
  location      = local.config_bigquery.dataset.lead_score_propensity.location
  # Maximum number of hours that data in the dataset can be accessed using time
  # travel queries, taken from the config.yaml section
  # bigquery.dataset.lead_score_propensity.max_time_travel_hours.
  max_time_travel_hours = local.config_bigquery.dataset.lead_score_propensity.max_time_travel_hours
  # Set to false so the contents of the dataset are preserved when the dataset
  # resource is destroyed.
  delete_contents_on_destroy = false

  labels = {
    version = "prod"
  }

  # With ignore_changes = all, Terraform ignores any post-creation changes to
  # the dataset and will not attempt to update it.
  lifecycle {
    ignore_changes = all
  }
}
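# After an apply, any of the datasets above can be spot-checked from the CLI,
# for example (the project ID placeholder below is illustrative):
# > `bq show --format=prettyjson ${PROJECT_ID}:purchase_propensity`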
resource "google_bigquery_dataset" "customer_lifetime_value" { dataset_id = local.config_bigquery.dataset.customer_lifetime_value.name friendly_name = local.config_bigquery.dataset.customer_lifetime_value.friendly_name project = local.customer_lifetime_value_project_id description = local.config_bigquery.dataset.customer_lifetime_value.description location = local.config_bigquery.dataset.customer_lifetime_value.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.customer_lifetime_value.max_time_travel_hours configuration. max_time_travel_hours = local.config_bigquery.dataset.customer_lifetime_value.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false labels = { version = "prod" } # The lifecycle block allows you to configure the lifecycle of the dataset. # In this case, the ignore_changes attribute is set to all, which means that # Terraform will ignore any changes to the dataset and will not attempt to update the dataset. lifecycle { ignore_changes = all } } # This resource creates a BigQuery dataset called `audience_segmentation`. resource "google_bigquery_dataset" "audience_segmentation" { dataset_id = local.config_bigquery.dataset.audience_segmentation.name friendly_name = local.config_bigquery.dataset.audience_segmentation.friendly_name project = local.audience_segmentation_project_id description = local.config_bigquery.dataset.audience_segmentation.description location = local.config_bigquery.dataset.audience_segmentation.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.audience_segmentation.max_time_travel_hours configuration. max_time_travel_hours = local.config_bigquery.dataset.audience_segmentation.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false labels = { version = "prod" } # The lifecycle block allows you to configure the lifecycle of the dataset. # In this case, the ignore_changes attribute is set to all, which means that # Terraform will ignore any changes to the dataset and will not attempt to update the dataset. lifecycle { ignore_changes = all } } # This resource creates a BigQuery dataset called `auto_audience_segmentation`. 
resource "google_bigquery_dataset" "auto_audience_segmentation" { dataset_id = local.config_bigquery.dataset.auto_audience_segmentation.name friendly_name = local.config_bigquery.dataset.auto_audience_segmentation.friendly_name project = local.auto_audience_segmentation_project_id description = local.config_bigquery.dataset.auto_audience_segmentation.description location = local.config_bigquery.dataset.auto_audience_segmentation.location # The max_time_travel_hours attribute specifies the maximum number of hours that data in the dataset can be accessed using time travel queries. # In this case, the maximum time travel hours is set to the value of the local file config.yaml section bigquery.dataset.auto_audience_segmentation.max_time_travel_hours configuration. max_time_travel_hours = local.config_bigquery.dataset.auto_audience_segmentation.max_time_travel_hours # The delete_contents_on_destroy attribute specifies whether the contents of the dataset should be deleted when the dataset is destroyed. # In this case, the delete_contents_on_destroy attribute is set to false, which means that the contents of the dataset will not be deleted when the dataset is destroyed. delete_contents_on_destroy = false labels = { version = "prod" } # The lifecycle block allows you to configure the lifecycle of the dataset. # In this case, the ignore_changes attribute is set to all, which means that # Terraform will ignore any changes to the dataset and will not attempt to update the dataset. lifecycle { ignore_changes = all } } # This resource creates a BigQuery dataset called `aggregated_vbb`. # For existing users that has pulled this change will result in that # terraform try to created the `aggregated_vbb` dataset along with # the underlying tables. terraform apply will result in an error saying # it failed to create resources that are already exist. 
# This module creates a BigQuery dataset called `aggregated_vbb`.
# For existing users who have pulled this change, terraform will try to create
# the `aggregated_vbb` dataset along with the underlying tables, and
# `terraform apply` will fail with an error saying that it could not create
# resources that already exist. To resolve this, you need to import the
# existing dataset and tables into terraform using the following commands:
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.aggregated_vbb.google_bigquery_dataset.main' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/aggregated_vbb'`
#
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.aggregated_vbb.google_bigquery_table.main["vbb_weights"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/aggregated_vbb/tables/vbb_weights'`
#
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.aggregated_vbb.google_bigquery_table.main["aggregated_value_based_bidding_volume_weekly"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/aggregated_vbb/tables/aggregated_value_based_bidding_volume_weekly'`
#
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.aggregated_vbb.google_bigquery_table.main["aggregated_value_based_bidding_correlation"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/aggregated_vbb/tables/aggregated_value_based_bidding_correlation'`
#
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.aggregated_vbb.google_bigquery_table.main["aggregated_value_based_bidding_volume_daily"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/aggregated_vbb/tables/aggregated_value_based_bidding_volume_daily'`
#
# You also need to remove the information about the existing aggregated_vbb
# dataset from the terraform state by running the following command:
# > `terraform state rm 'module.feature_store[0].google_bigquery_dataset.aggregated_vbb'`
#
# In the commands above, ${TERRAFORM_RUN_DIR} and ${MAJ_FEATURE_STORE_PROJECT_ID}
# are shell variables holding your terraform run directory and feature store
# project ID.

locals {
  aggregated_vbb_tables = [
    "vbb_weights",
    "aggregated_value_based_bidding_correlation",
    "aggregated_value_based_bidding_volume_daily",
    "aggregated_value_based_bidding_volume_weekly"
  ]
}

module "aggregated_vbb" {
  source  = "terraform-google-modules/bigquery/google"
  version = "9.0.0"

  dataset_id   = local.config_bigquery.dataset.aggregated_vbb.name
  dataset_name = local.config_bigquery.dataset.aggregated_vbb.friendly_name
  description  = local.config_bigquery.dataset.aggregated_vbb.description
  project_id   = local.aggregated_vbb_project_id
  location     = local.config_bigquery.dataset.aggregated_vbb.location
  # Set to true so the contents of the dataset are deleted when the dataset is
  # destroyed.
  delete_contents_on_destroy = true

  dataset_labels = {
    version = "prod"
  }

  tables = [for table_id in local.aggregated_vbb_tables : {
    table_id = table_id
    schema   = file("../../sql/schema/table/${table_id}.json")
    # Maximum number of hours that data in the table can be accessed using time
    # travel queries, taken from the config.yaml section
    # bigquery.dataset.aggregated_vbb.max_time_travel_hours.
    max_time_travel_hours = local.config_bigquery.dataset.aggregated_vbb.max_time_travel_hours
    deletion_protection   = false
    time_partitioning     = null
    range_partitioning    = null
    expiration_time       = null
    clustering            = []
    labels                = {}
  }]
}
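# Each schema file loaded above is a standard BigQuery JSON table schema: an
# array of column definitions. A minimal sketch of the format follows; the
# column names below are illustrative assumptions, not the actual table columns:
#
#   [
#     {"name": "feature", "type": "STRING", "mode": "NULLABLE"},
#     {"name": "weight",  "type": "FLOAT",  "mode": "NULLABLE"}
#   ]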
# This module creates a BigQuery dataset called `aggregated_predictions` and a
# table called `latest`, which stores the aggregated predictions generated by
# the prediction pipelines.
module "aggregated_predictions" {
  source  = "terraform-google-modules/bigquery/google"
  version = "9.0.0"

  dataset_id   = local.config_bigquery.dataset.aggregated_predictions.name
  dataset_name = local.config_bigquery.dataset.aggregated_predictions.friendly_name
  description  = local.config_bigquery.dataset.aggregated_predictions.description
  project_id   = local.config_bigquery.dataset.aggregated_predictions.project_id
  location     = local.config_bigquery.dataset.aggregated_predictions.location
  # Set to true so the contents of the dataset are deleted when the dataset is
  # destroyed.
  delete_contents_on_destroy = true

  # The tables attribute configures the BigQuery tables within the dataset.
  tables = [
    {
      table_id = "latest"
      # The schema of the table, defined in a JSON file.
      schema             = file("../../sql/schema/table/aggregated_predictions_latest.json")
      time_partitioning  = null
      range_partitioning = null
      expiration_time    = null
      clustering         = []
      labels             = {}
    }
  ]
}
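# Once the prediction pipelines have populated it, the `latest` table can be
# inspected with a quick read, for example (the project ID placeholder is
# illustrative):
# > `bq head -n 10 ${PROJECT_ID}:aggregated_predictions.latest`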
# This module creates a BigQuery dataset called `gemini_insights`.
# For existing users who have pulled this change, terraform will try to create
# the `gemini_insights` dataset along with the underlying tables, and
# `terraform apply` will fail with an error saying that it could not create
# resources that already exist. To resolve this, you need to import the
# existing dataset and tables into terraform using the following commands:
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.gemini_insights.google_bigquery_dataset.main' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/gemini_insights'`
#
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.gemini_insights.google_bigquery_table.main["user_behaviour_revenue_insights_monthly"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/gemini_insights/tables/user_behaviour_revenue_insights_monthly'`
#
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.gemini_insights.google_bigquery_table.main["user_behaviour_revenue_insights_weekly"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/gemini_insights/tables/user_behaviour_revenue_insights_weekly'`
#
# > `terraform -chdir="${TERRAFORM_RUN_DIR}" import 'module.feature_store[0].module.gemini_insights.google_bigquery_table.main["user_behaviour_revenue_insights_daily"]' 'projects/${MAJ_FEATURE_STORE_PROJECT_ID}/datasets/gemini_insights/tables/user_behaviour_revenue_insights_daily'`
#
# You also need to remove the information about the existing gemini_insights
# dataset from the terraform state by running the following command:
# > `terraform state rm 'module.feature_store[0].google_bigquery_dataset.gemini_insights'`

locals {
  gemini_insights_tables = [
    "user_behaviour_revenue_insights_monthly",
    "user_behaviour_revenue_insights_weekly",
    "user_behaviour_revenue_insights_daily"
  ]
}

module "gemini_insights" {
  source  = "terraform-google-modules/bigquery/google"
  version = "9.0.0"

  dataset_id   = local.config_bigquery.dataset.gemini_insights.name
  dataset_name = local.config_bigquery.dataset.gemini_insights.friendly_name
  description  = local.config_bigquery.dataset.gemini_insights.description
  project_id   = local.gemini_insights_project_id
  location     = local.config_bigquery.dataset.gemini_insights.location
  # Set to false so the contents of the dataset are preserved when the dataset
  # is destroyed.
  delete_contents_on_destroy = false
  deletion_protection        = true

  dataset_labels = {
    version    = "prod",
    dataset_id = local.config_bigquery.dataset.gemini_insights.name
  }

  tables = [for table_id in local.gemini_insights_tables : {
    table_id = table_id
    schema   = file("../../sql/schema/table/${table_id}.json")
    # Maximum number of hours that data in the table can be accessed using time
    # travel queries, taken from the config.yaml section
    # bigquery.dataset.gemini_insights.max_time_travel_hours.
    max_time_travel_hours = local.config_bigquery.dataset.gemini_insights.max_time_travel_hours
    deletion_protection   = true
    time_partitioning     = null
    range_partitioning    = null
    expiration_time       = null
    clustering            = []
    labels                = {}
  }]
}

# This resource runs `bq` commands that poll until the `gemini_insights`
# dataset is visible, which also confirms that the BigQuery API is enabled.
# Since enabling APIs and creating datasets can take a while, the deployment
# waits here before resuming.
resource "null_resource" "check_gemini_insights_dataset_exists" {
  provisioner "local-exec" {
    command = <<-EOT
    COUNTER=0
    MAX_TRIES=100
    while ! bq ls --filter labels.dataset_id:${local.config_bigquery.dataset.gemini_insights.name} --max_results 1 --format=json --project_id ${local.gemini_insights_project_id} && [ $COUNTER -lt $MAX_TRIES ]
    do
      sleep 6
      printf "."
      COUNTER=$((COUNTER + 1))
    done
    if [ $COUNTER -eq $MAX_TRIES ]; then
      echo "bigquery api is not enabled, terraform can not continue!"
      exit 1
    fi
    sleep 20
    EOT
  }

  depends_on = [
    module.gemini_insights.google_bigquery_dataset
  ]
}
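# The polling loop above retries up to 100 times at 6-second intervals, a
# budget of roughly 10 minutes. If the apply fails here, the same check can be
# run by hand (the project ID placeholder is illustrative):
# > `bq ls --filter labels.dataset_id:gemini_insights --max_results 1 --project_id ${PROJECT_ID}`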