projects/database-archival/terraform/main.tf

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

terraform {
  required_version = ">= 1.5.6"
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "6.13.0"
    }
  }
}

provider "google" {
  project = var.project_id
  region  = var.region
}

## Compute Project number from Project Id.
data "google_project" "gcp_project_info" {
  project_id = var.project_id
}

## Enable required APIs.
resource "google_project_service" "enable_required_services" {
  project            = var.project_id
  disable_on_destroy = false
  for_each = toset([
    "cloudresourcemanager.googleapis.com",
    "bigquery.googleapis.com",
    "cloudbuild.googleapis.com",
    "cloudfunctions.googleapis.com",
    "composer.googleapis.com",
    "compute.googleapis.com",
    "datastream.googleapis.com",
    "iam.googleapis.com",
    "iamcredentials.googleapis.com",
    "run.googleapis.com",
    "secretmanager.googleapis.com",
    "servicenetworking.googleapis.com",
    "sql-component.googleapis.com",
    "sqladmin.googleapis.com",
    "storage-component.googleapis.com",
    "storage.googleapis.com",
    "vpcaccess.googleapis.com",
  ])
  service = each.key
}

locals {
  gcs_bucket_name = "${var.project_id}-db-archival-bucket"
}

## Create Service Account for the Database Archival tool and assign roles/permissions.
resource "google_service_account" "db_archival_sa" {
  account_id  = var.service_account_name
  project     = var.project_id
  description = "Service Account for Database Archival tool"
  depends_on  = [google_project_service.enable_required_services]
}

## Assign Roles to the SA.
resource "google_project_iam_member" "assign_sa_roles" {
  for_each = toset([
    "roles/bigquery.dataEditor",
    "roles/bigquery.jobUser",
    "roles/cloudfunctions.invoker",
    "roles/cloudsql.client",
    "roles/cloudsql.editor",
    "roles/cloudsql.instanceUser",
    "roles/composer.worker",
    "roles/compute.admin",
    "roles/run.invoker",
    "roles/secretmanager.secretAccessor",
    "roles/storage.objectAdmin",
    "roles/storage.objectCreator",
    "roles/storage.objectUser",
    "roles/storage.objectViewer",
    "roles/logging.logWriter",
  ])
  role    = each.key
  member  = "serviceAccount:${google_service_account.db_archival_sa.email}"
  project = var.project_id
}

## Create VPC network.
resource "google_compute_network" "db_archival_network" {
  depends_on              = [google_project_service.enable_required_services]
  project                 = var.project_id
  name                    = "db-archival-vpc"
  description             = "VPC network for Database Archival workloads"
  auto_create_subnetworks = true
  mtu                     = 1460
  routing_mode            = "REGIONAL"
}

## Create Private Subnetwork for Composer.
locals {
  composer_subnet = "10.8.0.0/28"
}

resource "google_compute_subnetwork" "composer_subnet" {
  name                     = "db-archival-composer-subnet"
  ip_cidr_range            = local.composer_subnet
  region                   = var.region
  project                  = var.project_id
  network                  = google_compute_network.db_archival_network.id
  private_ip_google_access = true
}
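# Note on inputs: the resources in this file reference five input variables
# (var.project_id, var.region, var.service_account_name, var.database_user_name,
# var.database_user_password). Their declarations are assumed to live in a
# separate variables.tf that is not shown here. The terraform.tfvars sketch
# below is illustrative only; every value is a placeholder, not a default.
#
#   project_id             = "my-project-id"
#   region                 = "us-east1"
#   service_account_name   = "db-archival-sa"
#   database_user_name     = "db_archival_user"
#   database_user_password = "change-me"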
# Private Service Access components to allow connectivity between Datastream and Cloud SQL.
resource "google_compute_global_address" "private_ip_alloc" {
  name          = "private-ip-alloc"
  purpose       = "VPC_PEERING"
  address_type  = "INTERNAL"
  prefix_length = 16
  network       = google_compute_network.db_archival_network.id
}

resource "google_service_networking_connection" "private_vpc_connection" {
  depends_on = [
    google_compute_network.db_archival_network,
    google_compute_global_address.private_ip_alloc,
    google_project_service.enable_required_services,
  ]
  network                 = google_compute_network.db_archival_network.id
  service                 = "servicenetworking.googleapis.com"
  reserved_peering_ranges = [google_compute_global_address.private_ip_alloc.name]
}

## Create Firewalls.
locals {
  datastream_subnet_cidr = "10.0.0.0/29"
}

resource "google_compute_firewall" "allow_datastream_to_natvm" {
  depends_on = [
    google_datastream_private_connection.private_connection
  ]
  project = var.project_id
  name    = "allow-datastream-to-natvm"
  network = google_compute_network.db_archival_network.id
  allow {
    protocol = "tcp"
    ports    = ["3306"]
  }
  source_ranges = [local.datastream_subnet_cidr]
  direction     = "INGRESS"
  priority      = 1000
}

resource "google_compute_firewall" "vpc_allow_https" {
  depends_on  = [google_compute_network.db_archival_network]
  project     = var.project_id
  name        = "db-archival-vpc-allow-https"
  network     = "projects/${var.project_id}/global/networks/${google_compute_network.db_archival_network.name}"
  description = <<-EOT
    Allows SSL connections from any source to any instance on the network using port 443.
  EOT
  priority    = 65534
  allow {
    protocol = "tcp"
    ports    = ["443"]
  }
  source_ranges = ["0.0.0.0/0"]
}

locals {
  vpc_allowed_tcp_udp_ranges = ["10.8.0.0/28"]
}

resource "google_compute_firewall" "vpc_allow_internal" {
  depends_on  = [google_compute_network.db_archival_network]
  project     = var.project_id
  name        = "db-archival-vpc-allow-internal"
  network     = "projects/${var.project_id}/global/networks/${google_compute_network.db_archival_network.name}"
  description = "Allow internal traffic on the default network"
  priority    = 65534
  allow {
    protocol = "tcp"
    ports    = ["0-65535"]
  }
  allow {
    protocol = "udp"
    ports    = ["0-65535"]
  }
  source_ranges = local.vpc_allowed_tcp_udp_ranges
}

resource "google_compute_instance" "nat-vm" {
  depends_on = [
    google_compute_firewall.allow_datastream_to_natvm
  ]
  project      = var.project_id
  name         = "nat-vm"
  machine_type = "e2-micro"
  zone         = "${var.region}-a"
  tags         = ["nat-vm-proxy"]
  boot_disk {
    initialize_params {
      image = "debian-cloud/debian-11"
    }
  }
  network_interface {
    network = google_compute_network.db_archival_network.id
  }
  metadata = {
    # gce-container-declaration = module.gce-container.metadata_value
    google-logging-enabled    = "true"
    google-monitoring-enabled = "true"
  }
  metadata_startup_script = <<-EOT
    #! /bin/bash
    export DB_ADDR="${google_sql_database_instance.db-archival_sql_instance.private_ip_address}"
    export DB_PORT=3306

    # Enable the VM to receive packets whose destinations do
    # not match any running process local to the VM
    echo 1 > /proc/sys/net/ipv4/ip_forward

    # Ask the Metadata server for the IP address of the VM nic0
    # network interface:
    md_url_prefix="http://169.254.169.254/computeMetadata/v1/instance"
    vm_nic_ip="$(curl -H "Metadata-Flavor: Google" $md_url_prefix/network-interfaces/0/ip)"

    # Clear any existing iptables NAT table entries (all chains):
    iptables -t nat -F

    # Create a NAT table entry in the prerouting chain, matching
    # any packets with destination database port, changing the destination
    # IP address of the packet to the SQL instance IP address:
    iptables -t nat -A PREROUTING \
      -p tcp --dport $DB_PORT \
      -j DNAT \
      --to-destination $DB_ADDR

    # Create a NAT table entry in the postrouting chain, matching
    # any packets with destination database port, changing the source IP
    # address of the packet to the NAT VM's primary internal IPv4 address:
    iptables -t nat -A POSTROUTING \
      -p tcp --dport $DB_PORT \
      -j SNAT \
      --to-source $vm_nic_ip

    # Save iptables configuration:
    iptables-save
  EOT

  service_account {
    scopes = ["cloud-platform"]
    email  = google_service_account.db_archival_sa.email
  }
}

## Create a Secret in Secret Manager to store the database password.
resource "google_secret_manager_secret" "database_password" {
  depends_on = [google_project_service.enable_required_services]
  secret_id  = "db-archival-db-password"
  project    = var.project_id
  replication {
    user_managed {
      replicas {
        location = var.region
      }
      replicas {
        location = "us-central1"
      }
    }
  }
}

## Add the password as a Secret Version.
resource "google_secret_manager_secret_version" "database_password_version" {
  secret_data = var.database_user_password
  secret      = google_secret_manager_secret.database_password.id
}

## Grant the db_archival_sa service account access to the secret.
resource "google_secret_manager_secret_iam_member" "db_archival_sa_secret_access" {
  project   = var.project_id
  secret_id = google_secret_manager_secret.database_password.id
  role      = "roles/secretmanager.secretAccessor"
  member    = "serviceAccount:${google_service_account.db_archival_sa.email}"
}

## Create Cloud SQL Instance.
resource "google_sql_database_instance" "db-archival_sql_instance" {
  depends_on = [
    google_service_networking_connection.private_vpc_connection,
    google_project_service.enable_required_services,
    google_secret_manager_secret_version.database_password_version,
  ]
  name             = "private-db-archival-db"
  region           = var.region
  database_version = "MYSQL_8_0"
  settings {
    tier = "db-f1-micro"
    ip_configuration {
      ipv4_enabled                                  = false
      private_network                               = google_compute_network.db_archival_network.self_link
      enable_private_path_for_google_cloud_services = true # Private Service Access flag
    }
    backup_configuration {
      binary_log_enabled = true
      enabled            = true
    }
  }
  root_password       = google_secret_manager_secret_version.database_password_version.secret_data
  deletion_protection = false
}

resource "google_sql_database" "database_name" {
  name     = "db_archival_demo"
  instance = google_sql_database_instance.db-archival_sql_instance.name
}

resource "google_sql_user" "db_archival_user" {
  depends_on = [google_sql_database_instance.db-archival_sql_instance]
  name       = var.database_user_name
  instance   = google_sql_database_instance.db-archival_sql_instance.name
  password   = var.database_user_password
}
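# Illustrative addition (not part of the original file): exposing the Cloud SQL
# private IP as an output makes it easy to verify the address the NAT VM's
# DNAT rule points at. The output name is an assumption.
output "cloud_sql_private_ip" {
  description = "Private IP address of the Cloud SQL instance targeted by the NAT VM."
  value       = google_sql_database_instance.db-archival_sql_instance.private_ip_address
}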
## Create a BigQuery dataset.
resource "google_bigquery_dataset" "db_archival_bq_dataset" {
  dataset_id                 = "db_archival_bq_dataset"
  location                   = var.region
  delete_contents_on_destroy = true
}

## Give the Database Archival service account access to the BigQuery dataset.
resource "google_bigquery_dataset_access" "db_archival_sa_access_dataeditor" {
  dataset_id = google_bigquery_dataset.db_archival_bq_dataset.dataset_id
  for_each = toset([
    "roles/bigquery.admin",
  ])
  role          = each.key
  user_by_email = google_service_account.db_archival_sa.email
}

## Create GCS placeholder.
resource "google_storage_bucket" "db-archival-main-bucket" {
  name                        = local.gcs_bucket_name
  project                     = var.project_id
  location                    = var.region
  force_destroy               = true
  public_access_prevention    = "enforced"
  uniform_bucket_level_access = true
  depends_on                  = [google_project_service.enable_required_services]
}

resource "google_storage_bucket_object" "upload_sample_data" {
  depends_on = [
    data.archive_file.local_source,
    google_storage_bucket.db-archival-main-bucket,
  ]
  content_type = "application/zip"
  name         = "mysql_sqldump.sql.gz"
  bucket       = google_storage_bucket.db-archival-main-bucket.name
  source       = abspath("../demo/mysql_sqldump.sql.gz")
}

locals {
  fqdn_database_name = "${var.project_id}:${var.region}:${google_sql_database_instance.db-archival_sql_instance.name}"
}

## Token replace sample_config.json file.
resource "null_resource" "replace_config_file" {
  depends_on = [
    google_sql_database_instance.db-archival_sql_instance,
    google_storage_bucket.db-archival-main-bucket,
    google_sql_database.database_name,
  ]
  provisioner "local-exec" {
    command = <<-EOT
      sed -r -e "s/\\$\{REGION\}/${var.region}/g" \
        -e "s/\\$\{PROJECT_ID\}/${var.project_id}/g" \
        -e "s/\\$\{BIGQUERY_DATASET\}/${google_bigquery_dataset.db_archival_bq_dataset.dataset_id}/g" \
        -e "s/\\$\{CLOUD_SQL_INSTANCE_NAME\}/${local.fqdn_database_name}/g" \
        -e "s/\\$\{CLOUD_SQL_DATABASE_NAME\}/${google_sql_database.database_name.name}/g" \
        -e "s/\\$\{CLOUD_SQL_USER_NAME\}/${var.database_user_name}/g" \
        -e "s:\\$\{CLOUD_SQL_PASSWORD_SECRET\}:${google_secret_manager_secret_version.database_password_version.name}:g" \
        ../demo/sample_config.json > temp/config.json
    EOT
  }
}

resource "google_storage_bucket_object" "upload_config_file" {
  depends_on = [
    google_storage_bucket.db-archival-main-bucket,
    null_resource.replace_config_file
  ]
  source       = abspath("../terraform/temp/config.json")
  name         = "config.json" # target subfolder and rename at the same time
  bucket       = google_storage_bucket.db-archival-main-bucket.name
  content_type = "application/json"
}

## Assign SA of SQL instance permission for bucket access.
resource "google_project_iam_member" "assign_db_sa_roles" {
  depends_on = [google_sql_database_instance.db-archival_sql_instance]
  for_each = toset([
    "roles/cloudsql.client",
    "roles/storage.admin",
  ])
  role    = each.key
  member  = "serviceAccount:${google_sql_database_instance.db-archival_sql_instance.service_account_email_address}"
  project = var.project_id
}
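# Alternative sketch (not part of the original flow): because sample_config.json
# uses ${REGION}-style placeholders, the sed-based replace_config_file /
# upload_config_file pair above could also be expressed with Terraform's
# built-in templatefile(), avoiding the local temp/config.json step. This
# assumes the template contains only the placeholders listed below; the
# resource name "upload_config_file_templated" is hypothetical.
#
# resource "google_storage_bucket_object" "upload_config_file_templated" {
#   bucket       = google_storage_bucket.db-archival-main-bucket.name
#   name         = "config.json"
#   content_type = "application/json"
#   content = templatefile("${path.module}/../demo/sample_config.json", {
#     REGION                    = var.region
#     PROJECT_ID                = var.project_id
#     BIGQUERY_DATASET          = google_bigquery_dataset.db_archival_bq_dataset.dataset_id
#     CLOUD_SQL_INSTANCE_NAME   = local.fqdn_database_name
#     CLOUD_SQL_DATABASE_NAME   = google_sql_database.database_name.name
#     CLOUD_SQL_USER_NAME       = var.database_user_name
#     CLOUD_SQL_PASSWORD_SECRET = google_secret_manager_secret_version.database_password_version.name
#   })
# }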
## Ingest sample data into the database.
resource "null_resource" "ingest_sample_data" {
  depends_on = [
    google_sql_database_instance.db-archival_sql_instance,
    google_project_iam_member.assign_db_sa_roles,
    google_storage_bucket_object.upload_sample_data,
    google_project_service.enable_required_services,
  ]
  provisioner "local-exec" {
    command = <<-EOT
      gcloud sql import sql ${google_sql_database_instance.db-archival_sql_instance.name} \
        gs://${google_storage_bucket.db-archival-main-bucket.name}/mysql_sqldump.sql.gz \
        --database=${google_sql_database.database_name.name} \
        --quiet
    EOT
  }
}

## Create Cloud Function (pruning_function).
locals {
  files_to_copy = {
    "../src/database_archival/common/" : "database_archival/",
    "../src/database_archival/pruning_function/" : "database_archival/",
    "../src/database_archival/pruning_function/main.py" : "main.py",
    "../src/database_archival/pruning_function/requirements.in" : "requirements.txt"
  }

  # Temporary directory to store copied files and the packaged zip.
  temp_dir           = "${path.module}/temp"
  cf_zip_output_path = "${path.module}/output"
}

# Create the temporary directories if they don't exist.
resource "null_resource" "create_dirs" {
  for_each = { "dummy" : 1 } # Workaround for using file functions with null_resource
  provisioner "local-exec" {
    command = <<-EOF
      mkdir -p ${abspath(local.temp_dir)}/database_archival/common
      mkdir -p ${abspath(local.temp_dir)}/database_archival/pruning_function
      mkdir -p ${abspath(local.cf_zip_output_path)}
    EOF
  }
}

## Copy files locally.
resource "null_resource" "copy_files_locally" {
  for_each   = local.files_to_copy
  depends_on = [null_resource.create_dirs]
  provisioner "local-exec" {
    command = "cp -r ${abspath(each.key)} ${local.temp_dir}/${each.value}"
  }
}

data "archive_file" "local_source" {
  depends_on  = [null_resource.copy_files_locally]
  type        = "zip"
  source_dir  = abspath("${abspath(local.temp_dir)}")
  output_path = "${local.cf_zip_output_path}/db_archival.zip"
  excludes = [
    ".git",
    ".github",
    ".DS_Store",
    ".vscode",
    "kubernetes",
    "node_modules",
    "resources",
    "terraform"
  ]
}

resource "google_storage_bucket_object" "db_archival_source_code_bucket" {
  depends_on = [
    data.archive_file.local_source,
    google_storage_bucket.db-archival-main-bucket
  ]
  content_type = "application/zip"
  name         = "db_archival.zip"
  bucket       = google_storage_bucket.db-archival-main-bucket.name
  source       = "${local.cf_zip_output_path}/db_archival.zip"
}

# IAM entry for the Database Archival service account to invoke the function.
resource "google_cloudfunctions2_function_iam_member" "invoker" {
  project        = google_cloudfunctions2_function.prune_function.project
  location       = google_cloudfunctions2_function.prune_function.location
  cloud_function = google_cloudfunctions2_function.prune_function.name
  role           = "roles/cloudfunctions.invoker"
  member         = "serviceAccount:${google_service_account.db_archival_sa.email}"
}
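# Optional variant (not part of the original configuration): embedding the
# archive hash in the object name forces Cloud Functions to pick up a new
# source bundle whenever the packaged code changes, because the storage_source
# object reference changes with it. Sketch only; the resource name
# "db_archival_source_code_hashed" is hypothetical.
#
# resource "google_storage_bucket_object" "db_archival_source_code_hashed" {
#   bucket       = google_storage_bucket.db-archival-main-bucket.name
#   name         = "db_archival-${data.archive_file.local_source.output_md5}.zip"
#   source       = data.archive_file.local_source.output_path
#   content_type = "application/zip"
# }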
## Serverless VPC connector to allow access to Cloud SQL - Cloud Function.
locals {
  vpc_connector_cidr = "10.8.1.0/28"
}

resource "google_vpc_access_connector" "vpc_connector" {
  depends_on = [
    google_compute_network.db_archival_network,
  ]
  name          = "db-archival-vpc-connector"
  region        = var.region
  project       = var.project_id
  network       = google_compute_network.db_archival_network.name
  ip_cidr_range = local.vpc_connector_cidr
  min_instances = 2
  max_instances = 5
}

resource "google_cloudfunctions2_function" "prune_function" {
  depends_on = [
    google_vpc_access_connector.vpc_connector,
    google_project_service.enable_required_services,
    google_service_account.db_archival_sa,
    google_storage_bucket_object.db_archival_source_code_bucket,
    google_storage_bucket.db-archival-main-bucket
  ]
  name     = "prune-data"
  project  = var.project_id
  location = var.region
  build_config {
    runtime         = "python39"
    entry_point     = "request_handler"
    service_account = "projects/${var.project_id}/serviceAccounts/${google_service_account.db_archival_sa.email}"
    source {
      storage_source {
        bucket = google_storage_bucket.db-archival-main-bucket.name
        object = google_storage_bucket_object.db_archival_source_code_bucket.name
      }
    }
  }
  service_config {
    max_instance_count    = 100
    available_memory      = "2048M"
    timeout_seconds       = 3600
    ingress_settings      = "ALLOW_ALL"
    service_account_email = google_service_account.db_archival_sa.email
    vpc_connector         = google_vpc_access_connector.vpc_connector.name
  }
}

## Configure Composer's DAG.
resource "google_composer_environment" "db_archival_composer" {
  provider = google-beta
  depends_on = [
    google_storage_bucket.db-archival-main-bucket,
    google_compute_subnetwork.composer_subnet,
    google_service_account.db_archival_sa,
    google_cloudfunctions2_function.prune_function,
  ]
  timeouts {
    create = "120m"
    update = "120m"
    delete = "120m"
  }
  name    = "db-archival-composer-env"
  project = var.project_id
  region  = var.region
  config {
    enable_private_environment = true
    node_config {
      network         = google_compute_network.db_archival_network.id
      subnetwork      = google_compute_subnetwork.composer_subnet.id
      service_account = google_service_account.db_archival_sa.email
    }
    software_config {
      image_version = "composer-3-airflow-2"
      env_variables = {
        DATA_ARCHIVAL_CONFIG_PATH        = "gs://${google_storage_bucket.db-archival-main-bucket.name}/config.json"
        CLOUD_FUNCTION_URL_DATA_DELETION = google_cloudfunctions2_function.prune_function.service_config[0].uri
      }
    }
  }
}

# Upload DAGs and common files to the Composer GCS bucket.
resource "google_storage_bucket_object" "upload_dags_files" {
  for_each     = fileset(abspath("../src/database_archival"), "**/*.py")
  source       = abspath("../src/database_archival/${each.key}")
  name         = "dags/database_archival/${each.value}"
  bucket       = replace(replace(google_composer_environment.db_archival_composer.config.0.dag_gcs_prefix, "gs://", ""), "/dags", "")
  content_type = "text/x-python"
  depends_on   = [google_composer_environment.db_archival_composer]
}
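# Illustrative additions (not part of the original file): convenient outputs
# for operating the deployment. Both attributes are already referenced above;
# the output names are assumptions.
output "pruning_function_uri" {
  description = "HTTPS endpoint of the prune-data Cloud Function."
  value       = google_cloudfunctions2_function.prune_function.service_config[0].uri
}

output "composer_dag_gcs_prefix" {
  description = "GCS prefix where DAG files are uploaded for the Composer environment."
  value       = google_composer_environment.db_archival_composer.config[0].dag_gcs_prefix
}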
## Set up Datastream.
resource "google_datastream_private_connection" "private_connection" {
  display_name          = "Datastream private connection"
  location              = var.region
  private_connection_id = "datastream-db-archival-private-connection"
  labels = {
    key = "value"
  }
  vpc_peering_config {
    vpc    = google_compute_network.db_archival_network.id
    subnet = local.datastream_subnet_cidr
  }
}

resource "google_datastream_connection_profile" "source_connection_profile" {
  depends_on = [
    google_datastream_private_connection.private_connection,
    google_project_service.enable_required_services,
  ]
  display_name          = "source-connection-profile"
  location              = var.region
  connection_profile_id = "db-archival-source-connection-profile"
  mysql_profile {
    hostname = google_compute_instance.nat-vm.network_interface[0].network_ip
    port     = 3306
    username = google_sql_user.db_archival_user.name
    password = google_sql_user.db_archival_user.password
  }
  private_connectivity {
    private_connection = google_datastream_private_connection.private_connection.id
  }
}

resource "google_datastream_connection_profile" "destination_connection_profile" {
  depends_on            = [google_project_service.enable_required_services]
  display_name          = "destination-connection-profile"
  location              = var.region
  connection_profile_id = "db-archival-destination-connection-profile"
  bigquery_profile {}
}

resource "google_datastream_stream" "db-archival-datastream" {
  depends_on = [
    google_bigquery_dataset.db_archival_bq_dataset,
    google_sql_database_instance.db-archival_sql_instance,
  ]
  stream_id     = "db-archival-datastream"
  location      = var.region
  display_name  = "db-archival-datastream"
  desired_state = "RUNNING"
  source_config {
    source_connection_profile = google_datastream_connection_profile.source_connection_profile.id
    mysql_source_config {
      include_objects {
        mysql_databases {
          database = google_sql_database.database_name.name
          mysql_tables {
            table = "User"
          }
          mysql_tables {
            table = "Transaction"
          }
        }
      }
    }
  }
  destination_config {
    destination_connection_profile = google_datastream_connection_profile.destination_connection_profile.id
    bigquery_destination_config {
      single_target_dataset {
        dataset_id = "projects/${var.project_id}/datasets/${google_bigquery_dataset.db_archival_bq_dataset.dataset_id}"
      }
    }
  }
  backfill_all {
  }
}
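# Illustrative addition (not part of the original file): the NAT VM's internal
# IP is the hostname the Datastream source connection profile connects to, so
# surfacing it helps when debugging connectivity. The output name is an
# assumption.
output "nat_vm_internal_ip" {
  description = "Internal IP of the NAT VM used as the Datastream MySQL hostname."
  value       = google_compute_instance.nat-vm.network_interface[0].network_ip
}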