terraform/etl_integration/main.tf
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
locals {
  spanner_instance         = "test-spanner-instance"
  spanner_database         = "taxis_database"
  spanner_table            = "events"
  spanner_change_stream    = "events_stream"
  spanner_metadata_db      = "metadata"
  spanner_configuration    = "regional-${var.region}"
  spanner_name             = "Spanner instance managed by TF"
  bigquery_dataset         = "replica"
  dataflow_service_account = "my-dataflow-sa"
  worker_type              = "n2-standard-4"
  max_dataflow_workers     = 10
}
resource "google_project_service" "crm" {
project = var.project_id
service = "cloudresourcemanager.googleapis.com"
disable_dependent_services = true
}
// Project
module "google_cloud_project" {
depends_on = [google_project_service.crm]
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/project?ref=v38.0.0"
billing_account = var.billing_account
project_reuse = var.project_create ? null : {}
name = var.project_id
parent = var.organization
services = [
"iam.googleapis.com",
"dataflow.googleapis.com",
"monitoring.googleapis.com",
"pubsub.googleapis.com",
"autoscaling.googleapis.com",
"spanner.googleapis.com",
"bigquery.googleapis.com"
]
service_config = {
disable_on_destroy = true
disable_dependent_services = true
}
}
// Bucket for staging data, scripts, etc.
module "buckets" {
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v38.0.0"
project_id = module.google_cloud_project.project_id
name = module.google_cloud_project.project_id
location = var.region
storage_class = "STANDARD"
force_destroy = var.destroy_all_resources
}
// BigQuery dataset for final destination
module "dataset" {
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/bigquery-dataset?ref=v38.0.0"
project_id = module.google_cloud_project.project_id
id = local.bigquery_dataset
access = {
dataflow-writer = { role = "OWNER", type = "user" }
}
access_identities = {
dataflow-writer = module.dataflow_sa.email
}
options = {
delete_contents_on_destroy = true
}
}
// Spanner instance for change streams / CDC, kept small for demo purposes
resource "google_spanner_instance" "spanner_instance" {
config = local.spanner_configuration
name = local.spanner_instance
project = module.google_cloud_project.project_id
display_name = local.spanner_name
processing_units = 1000
force_destroy = var.destroy_all_resources
}
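// For longer-lived deployments, the fixed processing_units above could be
// replaced with provider-managed autoscaling. A minimal sketch, assuming the
// limits and targets below (illustrative values, not part of this guide):
//
//   autoscaling_config {
//     autoscaling_limits {
//       min_processing_units = 1000
//       max_processing_units = 3000
//     }
//     autoscaling_targets {
//       high_priority_cpu_utilization_percent = 65
//       storage_utilization_percent           = 90
//     }
//   }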
resource "google_spanner_database" "taxis" {
instance = google_spanner_instance.spanner_instance.name
project = module.google_cloud_project.project_id
name = local.spanner_database
ddl = [
<<DDL1
CREATE TABLE ${local.spanner_table} (
ride_id STRING(64),
point_idx INT64,
latitude FLOAT64,
longitude FLOAT64,
timestamp TIMESTAMP,
meter_reading FLOAT64,
meter_increment FLOAT64,
ride_status STRING(64),
passenger_count INT64,
) PRIMARY KEY(ride_id, point_idx)
DDL1
,
<<DDL2
CREATE CHANGE STREAM ${local.spanner_change_stream} FOR ${local.spanner_table}
OPTIONS(value_capture_type = 'NEW_ROW_AND_OLD_VALUES')
DDL2
]
deletion_protection = !var.destroy_all_resources
}
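// With 'NEW_ROW_AND_OLD_VALUES', each change record carries the full new row
// plus the old values of the modified columns; the other capture types
// ('OLD_AND_NEW_VALUES', 'NEW_ROW', 'NEW_VALUES') trade completeness for
// smaller change records.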
resource "google_spanner_database_iam_binding" "read_write_taxis" {
project = module.google_cloud_project.project_id
instance = google_spanner_instance.spanner_instance.name
database = google_spanner_database.taxis.name
role = "roles/spanner.databaseUser"
members = [
module.dataflow_sa.iam_email
]
}
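// Separate database where the change streams Dataflow connector keeps its
// partition metadata while reading the stream.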
resource "google_spanner_database" "metadata" {
instance = google_spanner_instance.spanner_instance.name
project = module.google_cloud_project.project_id
name = local.spanner_metadata_db
deletion_protection = !var.destroy_all_resources
}
resource "google_spanner_database_iam_binding" "read_write_metadata" {
project = module.google_cloud_project.project_id
instance = google_spanner_instance.spanner_instance.name
database = google_spanner_database.metadata.name
role = "roles/spanner.databaseUser"
members = [
module.dataflow_sa.iam_email
]
}
// Service account used by the Dataflow worker VMs
module "dataflow_sa" {
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/iam-service-account?ref=v38.0.0"
project_id = module.google_cloud_project.project_id
name = local.dataflow_service_account
iam_project_roles = {
(module.google_cloud_project.project_id) = [
"roles/storage.admin",
"roles/dataflow.worker",
"roles/monitoring.metricWriter",
"roles/pubsub.editor"
]
}
}
// Network
module "vpc_network" {
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/net-vpc?ref=v38.0.0"
project_id = module.google_cloud_project.project_id
name = "${var.network_prefix}-net"
subnets = [
{
ip_cidr_range = "10.1.0.0/16"
name = "${var.network_prefix}-subnet"
region = var.region
enable_private_access = true
secondary_ip_ranges = {
pods = "10.16.0.0/14"
services = "10.20.0.0/24"
}
}
]
}
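// Note: the pods/services secondary ranges follow GKE naming conventions and
// are not required by the Dataflow jobs themselves.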
module "firewall_rules" {
// Default rules for internal traffic + SSH access via IAP
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/net-vpc-firewall?ref=v38.0.0"
project_id = module.google_cloud_project.project_id
network = module.vpc_network.name
default_rules_config = {
admin_ranges = [
module.vpc_network.subnet_ips["${var.region}/${var.network_prefix}-subnet"],
]
}
egress_rules = {
allow-egress-dataflow = {
deny = false
description = "Dataflow firewall rule egress"
targets = ["dataflow"]
rules = [{ protocol = "tcp", ports = [12345, 12346] }]
}
}
ingress_rules = {
allow-ingress-dataflow = {
description = "Dataflow firewall rule ingress"
targets = ["dataflow"]
rules = [{ protocol = "tcp", ports = [12345, 12346] }]
}
}
}
// Cloud NAT so that workers can reach the Internet if necessary (from the Dataflow region)
module "regional_nat" {
count = var.internet_access ? 1 : 0
source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/net-cloudnat?ref=v38.0.0"
project_id = module.google_cloud_project.project_id
region = var.region
name = "${var.network_prefix}-nat"
router_network = module.vpc_network.self_link
}
// Script with variables to launch the Dataflow jobs
resource "local_file" "variables_script" {
filename = "${path.module}/../../pipelines/etl_integration_java/scripts/01_set_variables.sh"
file_permission = "0644"
content = <<FILE
# This file is generated by the Terraform code of this Solution Guide.
# We recommend that you modify this file only through the Terraform deployment.
export PROJECT=${module.google_cloud_project.project_id}
export REGION=${var.region}
export NETWORK=regions/${var.region}/subnetworks/${var.network_prefix}-subnet
export TEMP_LOCATION=gs://$PROJECT/tmp
export SERVICE_ACCOUNT=${module.dataflow_sa.email}
export TOPIC=projects/pubsub-public-data/topics/taxirides-realtime
export SPANNER_INSTANCE=${google_spanner_instance.spanner_instance.name}
export SPANNER_DATABASE=${local.spanner_database}
export SPANNER_METADATA_DB=${local.spanner_metadata_db}
export SPANNER_TABLE=${local.spanner_table}
export SPANNER_CHANGE_STREAM=${local.spanner_change_stream}
export BIGQUERY_DATASET=${local.bigquery_dataset}
export MAX_DATAFLOW_WORKERS=${local.max_dataflow_workers}
export WORKER_TYPE=${local.worker_type}
FILE
}
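// Example usage of the generated script (illustrative; the launch script name
// below is hypothetical and depends on the pipeline's repo layout):
//
//   cd pipelines/etl_integration_java/scripts
//   source ./01_set_variables.sh
//   ./02_run_pipeline.sh   # would read the variables exported above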