# terraform-modules/resources/tf-resources.tf
####################################################################################
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
####################################################################################
####################################################################################
# Create the GCP resources
#
# Author: Adam Paternostro
####################################################################################
terraform {
required_providers {
google = {
source = "hashicorp/google-beta"
version = "5.35.0"
}
}
}
####################################################################################
# Variables
####################################################################################
variable "gcp_account_name" {}
variable "project_id" {}
variable "composer_region" {}
variable "dataform_region" {}
variable "dataplex_region" {}
variable "dataproc_region" {}
variable "dataflow_region" {}
variable "bigquery_region" {}
variable "bigquery_non_multi_region" {}
variable "spanner_region" {}
variable "datafusion_region" {}
variable "vertex_ai_region" {}
variable "cloud_function_region" {}
variable "data_catalog_region" {}
variable "dataproc_serverless_region" {}
variable "cloud_sql_region" {}
variable "cloud_sql_zone" {}
variable "datastream_region" {}
variable "colab_enterprise_region" {}
variable "storage_bucket" {}
variable "spanner_config" {}
variable "random_extension" {}
variable "project_number" {}
variable "deployment_service_account_name" {}
variable "curl_impersonation" {}
variable "aws_omni_biglake_dataset_region" {}
variable "aws_omni_biglake_dataset_name" {}
variable "azure_omni_biglake_dataset_name" {}
variable "azure_omni_biglake_dataset_region" {}
variable "terraform_service_account" {}
# Hardcoded
variable "bigquery_taxi_dataset" {
type = string
default = "taxi_dataset"
}
variable "bigquery_thelook_ecommerce_dataset" {
type = string
default = "thelook_ecommerce"
}
variable "bigquery_rideshare_lakehouse_raw_dataset" {
type = string
default = "rideshare_lakehouse_raw"
}
variable "bigquery_rideshare_lakehouse_enriched_dataset" {
type = string
default = "rideshare_lakehouse_enriched"
}
variable "bigquery_rideshare_lakehouse_curated_dataset" {
type = string
default = "rideshare_lakehouse_curated"
}
variable "bigquery_rideshare_llm_raw_dataset" {
type = string
default = "rideshare_llm_raw"
}
variable "bigquery_rideshare_llm_enriched_dataset" {
type = string
default = "rideshare_llm_enriched"
}
variable "bigquery_rideshare_llm_curated_dataset" {
type = string
default = "rideshare_llm_curated"
}
variable "bigquery_cleanroom_dataset" {
type = string
default = "nyc_rideshare_data_nyc_rideshare_tables"
}
####################################################################################
# Bucket for all data (BigQuery, Spark, etc...)
# This is your "Data Lake" bucket
# If you are using Dataplex, you should create a bucket per data lake zone (bronze, silver, gold, etc.); see the commented sketch after these bucket definitions
####################################################################################
resource "google_storage_bucket" "raw_bucket" {
project = var.project_id
name = "raw-${var.storage_bucket}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "processed_bucket" {
project = var.project_id
name = "processed-${var.storage_bucket}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "code_bucket" {
project = var.project_id
name = "code-${var.storage_bucket}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "rideshare_lakehouse_raw" {
project = var.project_id
name = "rideshare-lakehouse-raw-${var.random_extension}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "rideshare_lakehouse_enriched" {
project = var.project_id
name = "rideshare-lakehouse-enriched-${var.random_extension}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "rideshare_lakehouse_curated" {
project = var.project_id
name = "rideshare-lakehouse-curated-${var.random_extension}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "iceberg_catalog_bucket" {
project = var.project_id
name = "iceberg-catalog-${var.random_extension}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "iceberg_catalog_source_data_bucket" {
project = var.project_id
name = "iceberg-source-data-${var.random_extension}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
resource "google_storage_bucket" "biglake_managed_table_bucket" {
project = var.project_id
name = "mt-${var.storage_bucket}"
location = var.bigquery_region
force_destroy = true
uniform_bucket_level_access = true
}
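# As noted above, a Dataplex-style layout would use one bucket per data lake zone.
# A minimal commented-out sketch (the zone names and the "dataplex-zone-" prefix are assumptions,
# not part of this demo's deployment):
/*
resource "google_storage_bucket" "dataplex_zone_bucket" {
  for_each                    = toset(["bronze", "silver", "gold"])
  project                     = var.project_id
  name                        = "dataplex-zone-${each.key}-${var.random_extension}"
  location                    = var.bigquery_region
  force_destroy               = true
  uniform_bucket_level_access = true
}
*/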
####################################################################################
# Custom Roles
####################################################################################
# Required since we are setting BigLake permissions with BigSpark
resource "google_project_iam_custom_role" "customconnectiondelegate" {
project = var.project_id
role_id = "CustomConnectionDelegate"
title = "Custom Connection Delegate"
description = "Used for BQ connections"
permissions = ["biglake.tables.create", "biglake.tables.delete", "biglake.tables.get",
"biglake.tables.list", "biglake.tables.lock", "biglake.tables.update",
"bigquery.connections.delegate"]
}
resource "google_project_iam_custom_role" "custom-role-custom-delegate" {
project = var.project_id
role_id = "CustomDelegate"
title = "Custom Delegate"
description = "Used for BLMS connections"
permissions = ["bigquery.connections.delegate"]
}
####################################################################################
# Default Network
# The project was not created with the default network.
# This creates just the network/subnets we need.
####################################################################################
resource "google_compute_network" "default_network" {
project = var.project_id
name = "vpc-main"
description = "Default network"
auto_create_subnetworks = false
mtu = 1460
}
resource "google_compute_subnetwork" "compute_subnet" {
project = var.project_id
name = "compute-subnet"
ip_cidr_range = "10.1.0.0/16"
region = var.cloud_sql_region
network = google_compute_network.default_network.id
private_ip_google_access = true
depends_on = [
google_compute_network.default_network
]
}
# Firewall for NAT Router
resource "google_compute_firewall" "subnet_firewall_rule" {
project = var.project_id
name = "subnet-nat-firewall"
network = google_compute_network.default_network.id
allow {
protocol = "icmp"
}
allow {
protocol = "tcp"
}
allow {
protocol = "udp"
}
source_ranges = ["10.1.0.0/16", "10.2.0.0/16", "10.3.0.0/16", "10.4.0.0/16", "10.5.0.0/16"]
depends_on = [
google_compute_subnetwork.compute_subnet,
google_compute_subnetwork.composer_subnet,
google_compute_subnetwork.dataproc_subnet,
google_compute_subnetwork.dataproc_serverless_subnet,
google_compute_subnetwork.dataflow_subnet
]
}
# We want a NAT for every region
locals {
distinctRegions = distinct([var.composer_region,
var.dataform_region,
var.dataplex_region,
var.dataproc_region,
var.dataflow_region,
var.bigquery_non_multi_region,
var.spanner_region,
var.datafusion_region,
var.vertex_ai_region,
var.cloud_function_region,
var.data_catalog_region,
var.dataproc_serverless_region,
var.cloud_sql_region,
var.datastream_region
])
}
resource "google_compute_router" "nat-router-distinct-regions" {
project = var.project_id
count = length(local.distinctRegions)
name = "nat-router-${local.distinctRegions[count.index]}"
region = local.distinctRegions[count.index]
network = google_compute_network.default_network.id
depends_on = [
google_compute_firewall.subnet_firewall_rule
]
}
resource "google_compute_router_nat" "nat-config-distinct-regions" {
project = var.project_id
count = length(local.distinctRegions)
name = "nat-config-${local.distinctRegions[count.index]}"
router = google_compute_router.nat-router-distinct-regions[count.index].name
region = local.distinctRegions[count.index]
nat_ip_allocate_option = "AUTO_ONLY"
source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES"
depends_on = [
google_compute_router.nat-router-distinct-regions
]
}
####################################################################################
# Datastream
####################################################################################
# Firewall rule for Cloud Shell to SSH into Compute VMs
# A compute VM will be deployed as a SQL Reverse Proxy for Datastream private connectivity
resource "google_compute_firewall" "cloud_shell_ssh_firewall_rule" {
project = var.project_id
name = "cloud-shell-ssh-firewall-rule"
network = google_compute_network.default_network.id
allow {
protocol = "tcp"
ports = ["22"]
}
direction = "INGRESS"
target_tags = ["ssh-firewall-tag"]
source_ranges = ["35.235.240.0/20"]
depends_on = [
google_compute_network.default_network
]
}
# Datastream ingress rules for SQL Reverse Proxy communication
resource "google_compute_firewall" "datastream_ingress_rule_firewall_rule" {
project = var.project_id
name = "datastream-ingress-rule"
network = google_compute_network.default_network.id
allow {
protocol = "tcp"
ports = ["5432"]
}
direction = "INGRESS"
source_ranges = ["10.6.0.0/16", "10.7.0.0/29"]
depends_on = [
google_compute_network.default_network
]
}
# Datastream egress rules for SQL Reverse Proxy communication
resource "google_compute_firewall" "datastream_egress_rule_firewall_rule" {
project = var.project_id
name = "datastream-egress-rule"
network = google_compute_network.default_network.id
allow {
protocol = "tcp"
ports = ["5432"]
}
direction = "EGRESS"
destination_ranges = ["10.6.0.0/16", "10.7.0.0/29"]
depends_on = [
google_compute_network.default_network
]
}
/*
# Create the Datastream Private Connection (takes a while so it is done here and not created on the fly in Airflow)
resource "google_datastream_private_connection" "datastream_cloud-sql-private-connect" {
project = var.project_id
display_name = "cloud-sql-private-connect"
location = var.datastream_region
private_connection_id = "cloud-sql-private-connect"
vpc_peering_config {
vpc = google_compute_network.default_network.id
subnet = "10.7.0.0/29"
}
depends_on = [
google_compute_network.default_network
]
}
*/
# For Cloud SQL / Datastream demo
# Allocate an IP address range
# https://cloud.google.com/sql/docs/mysql/configure-private-services-access#allocate-ip-address-range
resource "google_compute_global_address" "google_compute_global_address_vpc_main" {
project = var.project_id
name = "google-managed-services-vpc-main"
purpose = "VPC_PEERING"
address_type = "INTERNAL"
prefix_length = 16
network = google_compute_network.default_network.id
depends_on = [
google_compute_network.default_network
]
}
# Create a private connection
# https://cloud.google.com/sql/docs/mysql/configure-private-services-access#create_a_private_connection
resource "google_service_networking_connection" "google_service_networking_connection_default" {
# project = var.project_id
network = google_compute_network.default_network.id
service = "servicenetworking.googleapis.com"
reserved_peering_ranges = [google_compute_global_address.google_compute_global_address_vpc_main.name]
depends_on = [
google_compute_network.default_network,
google_compute_global_address.google_compute_global_address_vpc_main
]
}
# Force the service account to get created so we can grant permissions
resource "google_project_service_identity" "service_identity_servicenetworking" {
project = var.project_id
service = "servicenetworking.googleapis.com"
depends_on = [
google_compute_network.default_network,
google_service_networking_connection.google_service_networking_connection_default
]
}
resource "time_sleep" "service_identity_servicenetworking_time_delay" {
depends_on = [google_project_service_identity.service_identity_servicenetworking]
create_duration = "30s"
}
# Add permissions for the database to get created
resource "google_project_iam_member" "iam_service_networking" {
project = var.project_id
role = "roles/servicenetworking.serviceAgent"
member = "serviceAccount:${google_project_service_identity.service_identity_servicenetworking.email}"
#member = "serviceAccount:service-${var.project_number}@service-networking.iam.gserviceaccount.com "
depends_on = [
google_compute_network.default_network,
google_service_networking_connection.google_service_networking_connection_default,
time_sleep.service_identity_servicenetworking_time_delay
]
}
####################################################################################
# Dataproc
####################################################################################
# Subnet for dataproc cluster
resource "google_compute_subnetwork" "dataproc_subnet" {
project = var.project_id
name = "dataproc-subnet"
ip_cidr_range = "10.3.0.0/16"
region = var.dataproc_region
network = google_compute_network.default_network.id
private_ip_google_access = true
depends_on = [
google_compute_network.default_network,
]
}
# Firewall rule for dataproc cluster
resource "google_compute_firewall" "dataproc_subnet_firewall_rule" {
project = var.project_id
name = "dataproc-firewall"
network = google_compute_network.default_network.id
allow {
protocol = "icmp"
}
allow {
protocol = "tcp"
}
allow {
protocol = "udp"
}
source_ranges = ["10.3.0.0/16"]
depends_on = [
google_compute_subnetwork.dataproc_subnet
]
}
# Temp work bucket for dataproc cluster
# If you do not have a permanent temp bucket, random ones will be created (which is messy since you do not know what they are being used for)
resource "google_storage_bucket" "dataproc_bucket" {
project = var.project_id
name = "dataproc-${var.storage_bucket}"
location = var.dataproc_region
force_destroy = true
uniform_bucket_level_access = true
}
# Service account for dataproc cluster
resource "google_service_account" "dataproc_service_account" {
project = var.project_id
account_id = "dataproc-service-account"
display_name = "Service Account for Dataproc Environment"
}
# Grant the required worker role
resource "google_project_iam_member" "dataproc_service_account_worker_role" {
project = var.project_id
role = "roles/dataproc.worker"
member = "serviceAccount:${google_service_account.dataproc_service_account.email}"
depends_on = [
google_service_account.dataproc_service_account
]
}
# Grant Editor (too broad for production; demo only) to the service account
resource "google_project_iam_member" "dataproc_service_account_editor_role" {
project = var.project_id
role = "roles/editor"
member = "serviceAccount:${google_service_account.dataproc_service_account.email}"
depends_on = [
google_project_iam_member.dataproc_service_account_worker_role
]
}
# So Dataproc can call the BigLake connection in BigQuery
resource "google_project_iam_member" "dataproc_customconnectiondelegate" {
project = var.project_id
role = google_project_iam_custom_role.customconnectiondelegate.id
member = "serviceAccount:${google_service_account.dataproc_service_account.email}"
depends_on = [
google_project_iam_member.dataproc_service_account_editor_role
]
}
# Create the cluster
# NOTE: This is now done in Airflow, but is kept here for reference
/*
resource "google_dataproc_cluster" "mycluster" {
name = "testcluster"
project = var.project_id
region = var.dataproc_region
graceful_decommission_timeout = "120s"
cluster_config {
staging_bucket = "dataproc-${var.storage_bucket}"
master_config {
num_instances = 1
machine_type = "n1-standard-8"
disk_config {
boot_disk_type = "pd-ssd"
boot_disk_size_gb = 30
}
}
worker_config {
num_instances = 4
machine_type = "n1-standard-8"
disk_config {
boot_disk_size_gb = 30
num_local_ssds = 1
}
}
preemptible_worker_config {
num_instances = 0
}
# Override or set some custom properties
#software_config {
# image_version = "2.0.28-debian10"
# override_properties = {
# "dataproc:dataproc.allow.zero.workers" = "true"
# }
#}
gce_cluster_config {
zone = var.zone
subnetwork = google_compute_subnetwork.dataproc_subnet.id
service_account = google_service_account.dataproc_service_account.email
service_account_scopes = ["cloud-platform"]
}
}
depends_on = [
google_compute_subnetwork.dataproc_subnet,
google_storage_bucket.dataproc_bucket,
google_project_iam_member.dataproc_service_account_editor_role,
google_compute_firewall.dataproc_subnet_firewall_rule
]
}
*/
####################################################################################
# Composer 2
####################################################################################
# Cloud Composer v2 API Service Agent Extension
# Unlike the GCP docs example, the grant below does not overwrite the policy at the Org level: https://cloud.google.com/composer/docs/composer-2/create-environments#terraform
resource "google_project_iam_member" "cloudcomposer_account_service_agent_v2_ext" {
project = var.project_id
role = "roles/composer.ServiceAgentV2Ext"
member = "serviceAccount:service-${var.project_number}@cloudcomposer-accounts.iam.gserviceaccount.com"
}
# Cloud Composer API Service Agent
resource "google_project_iam_member" "cloudcomposer_account_service_agent" {
project = var.project_id
role = "roles/composer.serviceAgent"
member = "serviceAccount:service-${var.project_number}@cloudcomposer-accounts.iam.gserviceaccount.com"
depends_on = [
google_project_iam_member.cloudcomposer_account_service_agent_v2_ext
]
}
resource "google_compute_subnetwork" "composer_subnet" {
project = var.project_id
name = "composer-subnet"
ip_cidr_range = "10.2.0.0/16"
region = var.composer_region
network = google_compute_network.default_network.id
private_ip_google_access = true
depends_on = [
google_compute_network.default_network
]
}
resource "google_service_account" "composer_service_account" {
project = var.project_id
account_id = "composer-service-account"
display_name = "Service Account for Composer Environment"
}
resource "google_project_iam_member" "composer_service_account_worker_role" {
project = var.project_id
role = "roles/composer.worker"
member = "serviceAccount:${google_service_account.composer_service_account.email}"
depends_on = [
google_service_account.composer_service_account
]
}
# The DAGs will be doing a lot of BQ automation
# This role can be scaled down once the DAGs are created (the DAGs perform high-level Owner automation, just for demo purposes)
resource "google_project_iam_member" "composer_service_account_bq_admin_role" {
# provider= google.service_principal_impersonation
project = var.project_id
role = "roles/owner"
member = "serviceAccount:${google_service_account.composer_service_account.email}"
depends_on = [
google_project_iam_member.composer_service_account_worker_role
]
}
# Let Composer impersonate the service account that can change org policies (for demo purposes)
# Composer will also run Terraform scripts by impersonating this account
resource "google_service_account_iam_member" "cloudcomposer_service_account_impersonation" {
service_account_id = "projects/${var.project_id}/serviceAccounts/${var.project_id}@${var.project_id}.iam.gserviceaccount.com"
role = "roles/iam.serviceAccountTokenCreator"
member = "serviceAccount:${google_service_account.composer_service_account.email}"
depends_on = [google_project_iam_member.composer_service_account_bq_admin_role]
}
# ActAs role
resource "google_project_iam_member" "cloudcomposer_act_as" {
project = var.project_id
role = "roles/iam.serviceAccountUser"
member = "serviceAccount:${google_service_account.composer_service_account.email}"
depends_on = [
google_service_account_iam_member.cloudcomposer_service_account_impersonation
]
}
# Fetch the latest available Cloud Composer versions in a region for a given project
data "google_composer_image_versions" "latest_image" {
region = var.composer_region
}
resource "google_composer_environment" "composer_env" {
project = var.project_id
name = "data-analytics-demo-composer-2"
region = var.composer_region
config {
software_config {
image_version = "composer-2.10.2-airflow-2.10.2" # the latest version is broke
#image_version = data.google_composer_image_versions.latest_image.image_versions[0].image_version_id
# Upgrading this failed as well with the same unhealthy error
pypi_packages = {
psycopg2-binary = "==2.9.10" # You need to upgrade this as you upgrade the composer version
}
env_variables = {
ENV_PROJECT_ID = var.project_id,
ENV_PROJECT_NUMBER = var.project_number,
ENV_RAW_BUCKET = "raw-${var.storage_bucket}",
ENV_PROCESSED_BUCKET = "processed-${var.storage_bucket}",
ENV_CODE_BUCKET = "code-${var.storage_bucket}",
ENV_COMPOSER_REGION = var.composer_region
ENV_DATAFORM_REGION = var.dataform_region
ENV_DATAPLEX_REGION = var.dataplex_region
ENV_DATAPROC_REGION = var.dataproc_region
ENV_DATAFLOW_REGION = var.dataflow_region
ENV_BIGQUERY_REGION = var.bigquery_region
ENV_BIGQUERY_NON_MULTI_REGION = var.bigquery_non_multi_region
ENV_SPANNER_REGION = var.spanner_region
ENV_DATAFUSION_REGION = var.datafusion_region
ENV_VERTEX_AI_REGION = var.vertex_ai_region
ENV_CLOUD_FUNCTION_REGION = var.cloud_function_region
ENV_DATA_CATALOG_REGION = var.data_catalog_region
ENV_DATAPROC_SERVERLESS_REGION = var.dataproc_serverless_region
ENV_DATAPROC_SERVERLESS_SUBNET = "projects/${var.project_id}/regions/${var.dataproc_serverless_region}/subnetworks/dataproc-serverless-subnet",
ENV_DATAPROC_SERVERLESS_SUBNET_NAME = google_compute_subnetwork.dataproc_serverless_subnet.name,
ENV_CLOUD_SQL_REGION = var.cloud_sql_region,
ENV_CLOUD_SQL_ZONE = var.cloud_sql_zone,
ENV_DATASTREAM_REGION = var.datastream_region,
ENV_DATAPROC_BUCKET = "dataproc-${var.storage_bucket}",
ENV_DATAPROC_SUBNET = "projects/${var.project_id}/regions/${var.dataproc_region}/subnetworks/dataproc-subnet",
ENV_DATAPROC_SERVICE_ACCOUNT = "dataproc-service-account@${var.project_id}.iam.gserviceaccount.com",
ENV_GCP_ACCOUNT_NAME = "${var.gcp_account_name}",
ENV_TAXI_DATASET_ID = google_bigquery_dataset.taxi_dataset.dataset_id,
ENV_THELOOK_DATASET_ID = google_bigquery_dataset.thelook_ecommerce_dataset.dataset_id,
ENV_SPANNER_INSTANCE_ID = "spanner-${var.random_extension}" //google_spanner_instance.spanner_instance.name,
ENV_DATAFLOW_SUBNET = "regions/${var.dataflow_region}/subnetworks/dataflow-subnet",
ENV_DATAFLOW_SERVICE_ACCOUNT = "dataflow-service-account@${var.project_id}.iam.gserviceaccount.com",
ENV_RANDOM_EXTENSION = var.random_extension
ENV_SPANNER_CONFIG = var.spanner_config
ENV_RIDESHARE_LAKEHOUSE_RAW_BUCKET = google_storage_bucket.rideshare_lakehouse_raw.name
ENV_RIDESHARE_LAKEHOUSE_ENRICHED_BUCKET = google_storage_bucket.rideshare_lakehouse_enriched.name
ENV_RIDESHARE_LAKEHOUSE_CURATED_BUCKET = google_storage_bucket.rideshare_lakehouse_curated.name
ENV_RIDESHARE_LAKEHOUSE_RAW_DATASET = var.bigquery_rideshare_lakehouse_raw_dataset
ENV_RIDESHARE_LAKEHOUSE_ENRICHED_DATASET = var.bigquery_rideshare_lakehouse_enriched_dataset
ENV_RIDESHARE_LAKEHOUSE_CURATED_DATASET = var.bigquery_rideshare_lakehouse_curated_dataset
ENV_RIDESHARE_LLM_RAW_DATASET = var.bigquery_rideshare_llm_raw_dataset
ENV_RIDESHARE_LLM_ENRICHED_DATASET = var.bigquery_rideshare_llm_enriched_dataset
ENV_RIDESHARE_LLM_CURATED_DATASET = var.bigquery_rideshare_llm_curated_dataset
ENV_TERRAFORM_SERVICE_ACCOUNT = var.terraform_service_account,
ENV_RIDESHARE_PLUS_SERVICE_ACCOUNT = google_service_account.cloud_run_rideshare_plus_service_account.email
}
}
# This is designed to be the smallest, cheapest Composer environment for demo purposes
# In your TF script, increase the web server memory to at least 2 GB (4 GB per CPU core)
workloads_config {
scheduler {
cpu = 1
memory_gb = 1
storage_gb = 1
count = 1
}
web_server {
cpu = 0.5
memory_gb = 2
storage_gb = 1
}
worker {
cpu = 2
memory_gb = 10
storage_gb = 10
min_count = 1
max_count = 4
}
}
environment_size = "ENVIRONMENT_SIZE_SMALL"
node_config {
network = google_compute_network.default_network.id
subnetwork = google_compute_subnetwork.composer_subnet.id
service_account = google_service_account.composer_service_account.name
}
private_environment_config {
enable_private_endpoint = true
}
}
depends_on = [
google_project_iam_member.cloudcomposer_account_service_agent_v2_ext,
google_project_iam_member.cloudcomposer_account_service_agent,
google_compute_subnetwork.composer_subnet,
google_service_account.composer_service_account,
google_project_iam_member.composer_service_account_worker_role,
google_project_iam_member.composer_service_account_bq_admin_role,
google_compute_router_nat.nat-config-distinct-regions,
google_service_account.cloud_run_rideshare_plus_service_account
]
timeouts {
create = "90m"
}
}
####################################################################################
# BigQuery Datasets
####################################################################################
resource "google_bigquery_dataset" "taxi_dataset" {
project = var.project_id
dataset_id = var.bigquery_taxi_dataset
friendly_name = var.bigquery_taxi_dataset
description = "This contains the NYC taxi data"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "thelook_ecommerce_dataset" {
project = var.project_id
dataset_id = var.bigquery_thelook_ecommerce_dataset
friendly_name = var.bigquery_thelook_ecommerce_dataset
description = "This contains the Looker eCommerce data"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "rideshare_lakehouse_raw_dataset" {
project = var.project_id
dataset_id = var.bigquery_rideshare_lakehouse_raw_dataset
friendly_name = var.bigquery_rideshare_lakehouse_raw_dataset
description = "This contains the Rideshare Plus Analytics Raw Zone"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "rideshare_lakehouse_enriched_dataset" {
project = var.project_id
dataset_id = var.bigquery_rideshare_lakehouse_enriched_dataset
friendly_name = var.bigquery_rideshare_lakehouse_enriched_dataset
description = "This contains the Rideshare Plus Analytics Curated Zone"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "rideshare_lakehouse_curated_dataset" {
project = var.project_id
dataset_id = var.bigquery_rideshare_lakehouse_curated_dataset
friendly_name = var.bigquery_rideshare_lakehouse_curated_dataset
description = "This contains the Rideshare Plus Analytics Curated Zone"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "rideshare_llm_raw_dataset" {
project = var.project_id
dataset_id = var.bigquery_rideshare_llm_raw_dataset
friendly_name = var.bigquery_rideshare_llm_raw_dataset
description = "This contains the Rideshare Plus LLM Raw Zone"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "rideshare_llm_enriched_dataset" {
project = var.project_id
dataset_id = var.bigquery_rideshare_llm_enriched_dataset
friendly_name = var.bigquery_rideshare_llm_enriched_dataset
description = "This contains the Rideshare Plus LLM Enriched Zone"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "ideshare_llm_curated_dataset" {
project = var.project_id
dataset_id = var.bigquery_rideshare_llm_curated_dataset
friendly_name = var.bigquery_rideshare_llm_curated_dataset
description = "This contains the Rideshare Plus LLM Curated Zone"
location = var.bigquery_region
}
resource "google_bigquery_dataset" "aws_omni_biglake_dataset" {
project = var.project_id
dataset_id = var.aws_omni_biglake_dataset_name
friendly_name = var.aws_omni_biglake_dataset_name
description = "This contains the AWS OMNI NYC taxi data"
location = var.aws_omni_biglake_dataset_region
}
resource "google_bigquery_dataset" "azure_omni_biglake_dataset" {
project = var.project_id
dataset_id = var.azure_omni_biglake_dataset_name
friendly_name = var.azure_omni_biglake_dataset_name
description = "This contains the Azure OMNI NYC taxi data"
location = var.azure_omni_biglake_dataset_region
}
# Subnet for bigspark / central region
resource "google_compute_subnetwork" "dataproc_serverless_subnet" {
project = var.project_id
name = "dataproc-serverless-subnet"
ip_cidr_range = "10.5.0.0/16"
region = var.bigquery_non_multi_region
network = google_compute_network.default_network.id
private_ip_google_access = true
depends_on = [
google_compute_network.default_network,
]
}
# Needed for BigSpark to Dataproc
resource "google_compute_firewall" "dataproc_serverless_subnet_firewall_rule" {
project = var.project_id
name = "dataproc-serverless-firewall"
network = google_compute_network.default_network.id
allow {
protocol = "all"
}
source_ranges = ["10.5.0.0/16"]
depends_on = [
google_compute_subnetwork.dataproc_serverless_subnet
]
}
####################################################################################
# Data Catalog Taxonomy
# AWS Region
####################################################################################
resource "google_data_catalog_taxonomy" "business_critical_taxonomy_aws" {
project = var.project_id
region = var.aws_omni_biglake_dataset_region
# Must be unique across your Org
display_name = "Business-Critical-AWS-${var.random_extension}"
description = "A collection of policy tags (AWS)"
activated_policy_types = ["FINE_GRAINED_ACCESS_CONTROL"]
}
resource "google_data_catalog_policy_tag" "low_security_policy_tag_aws" {
taxonomy = google_data_catalog_taxonomy.business_critical_taxonomy_aws.id
display_name = "AWS Low security"
description = "A policy tag normally associated with low security items (AWS)"
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy_aws,
]
}
resource "google_data_catalog_policy_tag" "high_security_policy_tag_aws" {
taxonomy = google_data_catalog_taxonomy.business_critical_taxonomy_aws.id
display_name = "AWS High security"
description = "A policy tag normally associated with high security items (AWS)"
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy_aws
]
}
resource "google_data_catalog_policy_tag_iam_member" "member_aws" {
policy_tag = google_data_catalog_policy_tag.low_security_policy_tag_aws.name
role = "roles/datacatalog.categoryFineGrainedReader"
member = "user:${var.gcp_account_name}"
depends_on = [
google_data_catalog_policy_tag.low_security_policy_tag_aws,
]
}
####################################################################################
# Data Catalog Taxonomy
# Azure Region
####################################################################################
resource "google_data_catalog_taxonomy" "business_critical_taxonomy_azure" {
project = var.project_id
region = var.azure_omni_biglake_dataset_region
# Must be unique across your Org
display_name = "Business-Critical-Azure-${var.random_extension}"
description = "A collection of policy tags (Azure)"
activated_policy_types = ["FINE_GRAINED_ACCESS_CONTROL"]
}
resource "google_data_catalog_policy_tag" "low_security_policy_tag_azure" {
taxonomy = google_data_catalog_taxonomy.business_critical_taxonomy_azure.id
display_name = "Azure Low security"
description = "A policy tag normally associated with low security items (Azure)"
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy_azure,
]
}
resource "google_data_catalog_policy_tag" "high_security_policy_tag_azure" {
taxonomy = google_data_catalog_taxonomy.business_critical_taxonomy_azure.id
display_name = "Azure High security"
description = "A policy tag normally associated with high security items (Azure)"
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy_azure
]
}
resource "google_data_catalog_policy_tag_iam_member" "member_azure" {
policy_tag = google_data_catalog_policy_tag.low_security_policy_tag_azure.name
role = "roles/datacatalog.categoryFineGrainedReader"
member = "user:${var.gcp_account_name}"
depends_on = [
google_data_catalog_policy_tag.low_security_policy_tag_azure,
]
}
####################################################################################
# Dataplex / Data Lineage
####################################################################################
resource "google_project_iam_member" "gcp_roles_datalineage_admin" {
project = var.project_id
role = "roles/datalineage.admin"
member = "user:${var.gcp_account_name}"
}
####################################################################################
# Bring in Analytics Hub reference
####################################################################################
# https://cloud.google.com/bigquery/docs/reference/analytics-hub/rest/v1/projects.locations.dataExchanges.listings/subscribe
/*
# https://cloud.google.com/bigquery/docs/reference/analytics-hub/rest/v1/projects.locations.dataExchanges.listings/subscribe
curl --request POST \
'https://analyticshub.googleapis.com/v1/projects/1057666841514/locations/us/dataExchanges/google_cloud_public_datasets_17e74966199/listings/ghcn_daily_17ee6ceb8e9:subscribe' \
--header "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
--header 'Accept: application/json' \
--header 'Content-Type: application/json' \
--data '{"destinationDataset":{"datasetReference":{"datasetId":"ghcn_daily","projectId":"data-analytics-demo-5qiz4e36kf"},"friendlyName":"ghcn_daily","location":"us","description":"ghcn_daily"}}' \
--compressed
*/
resource "null_resource" "analyticshub_daily_weather_data" {
provisioner "local-exec" {
when = create
command = <<EOF
curl --request POST \
"https://analyticshub.googleapis.com/v1/projects/1057666841514/locations/us/dataExchanges/google_cloud_public_datasets_17e74966199/listings/ghcn_daily_17ee6ceb8e9:subscribe" \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--data '{"destinationDataset":{"datasetReference":{"datasetId":"ghcn_daily","projectId":"${var.project_id}"},"friendlyName":"ghcn_daily","location":"us","description":"ghcn_daily"}}' \
--compressed
EOF
}
depends_on = [
]
}
####################################################################################
# Data Catalog Taxonomy
# Taxi US Region
####################################################################################
resource "google_data_catalog_taxonomy" "business_critical_taxonomy" {
project = var.project_id
region = var.bigquery_region
# Must be unique across your Org
display_name = "Business-Critical-${var.random_extension}"
description = "A collection of policy tags"
activated_policy_types = ["FINE_GRAINED_ACCESS_CONTROL"]
}
resource "google_data_catalog_policy_tag" "low_security_policy_tag" {
taxonomy = google_data_catalog_taxonomy.business_critical_taxonomy.id
display_name = "Low security"
description = "A policy tag normally associated with low security items"
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy,
]
}
resource "google_data_catalog_policy_tag" "high_security_policy_tag" {
taxonomy = google_data_catalog_taxonomy.business_critical_taxonomy.id
display_name = "High security"
description = "A policy tag normally associated with high security items"
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy
]
}
resource "google_data_catalog_policy_tag_iam_member" "member" {
policy_tag = google_data_catalog_policy_tag.low_security_policy_tag.name
role = "roles/datacatalog.categoryFineGrainedReader"
member = "user:${var.gcp_account_name}"
depends_on = [
google_data_catalog_policy_tag.low_security_policy_tag,
]
}
# Data Masking
resource "google_data_catalog_policy_tag" "data_masking_policy_tag" {
taxonomy = google_data_catalog_taxonomy.business_critical_taxonomy.id
display_name = "Data Masking security"
description = "A policy tag that will apply data masking"
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy
]
}
# REST API (no gcloud or Terraform yet; a commented Terraform sketch using the newer data policy resource follows this section)
# https://cloud.google.com/bigquery/docs/reference/bigquerydatapolicy/rest/v1beta1/projects.locations.dataPolicies#datamaskingpolicy
# Create a Hash Rule
resource "null_resource" "deploy_data_masking_sha256" {
provisioner "local-exec" {
when = create
command = <<EOF
curl \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
-X POST \
https://bigquerydatapolicy.googleapis.com/v1beta1/projects/${var.project_id}/locations/us/dataPolicies?prettyPrint=true \
--data \ '{"dataMaskingPolicy":{"predefinedExpression":"SHA256"},"dataPolicyId":"Hash_Rule","dataPolicyType":"DATA_MASKING_POLICY","policyTag":"${google_data_catalog_policy_tag.data_masking_policy_tag.id}"}'
EOF
}
depends_on = [
google_data_catalog_policy_tag.data_masking_policy_tag
]
}
# Create a Nullify Rule
resource "null_resource" "deploy_data_masking_nullify" {
provisioner "local-exec" {
when = create
command = <<EOF
curl \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
-X POST \
https://bigquerydatapolicy.googleapis.com/v1beta1/projects/${var.project_id}/locations/us/dataPolicies?prettyPrint=true \
--data \ '{"dataMaskingPolicy":{"predefinedExpression":"ALWAYS_NULL"},"dataPolicyId":"Nullify_Rule","dataPolicyType":"DATA_MASKING_POLICY","policyTag":"${google_data_catalog_policy_tag.data_masking_policy_tag.id}"}'
EOF
}
depends_on = [
null_resource.deploy_data_masking_sha256
]
}
# Create a Default-Value Rule
resource "null_resource" "deploy_data_masking_default_value" {
provisioner "local-exec" {
when = create
command = <<EOF
curl \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
-X POST \
https://bigquerydatapolicy.googleapis.com/v1beta1/projects/${var.project_id}/locations/us/dataPolicies?prettyPrint=true \
--data \ '{"dataMaskingPolicy":{"predefinedExpression":"DEFAULT_MASKING_VALUE"},"dataPolicyId":"DefaultValue_Rule","dataPolicyType":"DATA_MASKING_POLICY","policyTag":"${google_data_catalog_policy_tag.data_masking_policy_tag.id}"}'
EOF
}
depends_on = [
null_resource.deploy_data_masking_nullify
]
}
# Grant the user access to the Nullify rule (you can change this during the demo)
resource "null_resource" "deploy_data_masking_iam_permissions" {
provisioner "local-exec" {
when = create
command = <<EOT
curl \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
-X POST \
https://bigquerydatapolicy.googleapis.com/v1beta1/projects/${var.project_id}/locations/us/dataPolicies/Nullify_Rule:setIamPolicy?prettyPrint=true \
--data \ '{"policy":{"bindings":[{"members":["user:${var.gcp_account_name}"],"role":"roles/bigquerydatapolicy.maskedReader"}]}}'
EOT
}
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy,
google_data_catalog_policy_tag.data_masking_policy_tag,
null_resource.deploy_data_masking_sha256,
null_resource.deploy_data_masking_nullify,
null_resource.deploy_data_masking_default_value,
]
}
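# As referenced above: newer provider versions expose google_bigquery_datapolicy_data_policy,
# so the masking rules created via curl could also be declared natively. A minimal commented-out
# sketch for the Hash rule (the resource labels and the hardcoded "us" location mirror the curl
# calls above and are assumptions, not part of this demo's deployment):
/*
resource "google_bigquery_datapolicy_data_policy" "hash_rule" {
  project          = var.project_id
  location         = "us"
  data_policy_id   = "Hash_Rule"
  policy_tag       = google_data_catalog_policy_tag.data_masking_policy_tag.name
  data_policy_type = "DATA_MASKING_POLICY"
  data_masking_policy {
    predefined_expression = "SHA256"
  }
}
resource "google_bigquery_datapolicy_data_policy_iam_member" "hash_rule_masked_reader" {
  project        = var.project_id
  location       = "us"
  data_policy_id = google_bigquery_datapolicy_data_policy.hash_rule.data_policy_id
  role           = "roles/bigquerydatapolicy.maskedReader"
  member         = "user:${var.gcp_account_name}"
}
*/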
####################################################################################
# Cloud Function (BigQuery)
####################################################################################
# Zip the source code
data "archive_file" "bigquery_external_function_zip" {
type = "zip"
source_dir = "../cloud-functions/bigquery-external-function"
output_path = "../cloud-functions/bigquery-external-function.zip"
depends_on = [
google_storage_bucket.code_bucket
]
}
# Upload code
resource "google_storage_bucket_object" "bigquery_external_function_zip_upload" {
name = "cloud-functions/bigquery-external-function/bigquery-external-function.zip"
bucket = google_storage_bucket.code_bucket.name
source = data.archive_file.bigquery_external_function_zip.output_path
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.bigquery_external_function_zip
]
}
# Deploy the function
resource "google_cloudfunctions_function" "bigquery_external_function" {
project = var.project_id
region = var.cloud_function_region
name = "bigquery_external_function"
description = "bigquery_external_function"
runtime = "python310"
available_memory_mb = 256
source_archive_bucket = google_storage_bucket.code_bucket.name
source_archive_object = google_storage_bucket_object.bigquery_external_function_zip_upload.name
trigger_http = true
ingress_settings = "ALLOW_ALL"
https_trigger_security_level = "SECURE_ALWAYS"
entry_point = "bigquery_external_function"
environment_variables = {
PROJECT_ID = var.project_id,
ENV_CLOUD_FUNCTION_REGION = var.cloud_function_region
}
# no-allow-unauthenticated ???
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.bigquery_external_function_zip,
google_storage_bucket_object.bigquery_external_function_zip_upload
]
}
####################################################################################
# Cloud Function (Rideshare Plus)
####################################################################################
# Zip the source code
data "archive_file" "rideshare_plus_function_zip" {
type = "zip"
source_dir = "../cloud-functions/rideshare-plus-rest-api"
output_path = "../cloud-functions/rideshare-plus-rest-api.zip"
depends_on = [
google_storage_bucket.code_bucket
]
}
# Upload code
resource "google_storage_bucket_object" "rideshare_plus_function_zip_upload" {
name = "cloud-functions/rideshare-plus-rest-api/rideshare-plus-rest-api.zip"
bucket = google_storage_bucket.code_bucket.name
source = data.archive_file.rideshare_plus_function_zip.output_path
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.rideshare_plus_function_zip
]
}
# Deploy the function V2
resource "google_cloudfunctions2_function" "rideshare_plus_function" {
project = var.project_id
location = var.cloud_function_region
name = "demo-rest-api-service"
description = "demo-rest-api-service"
build_config {
runtime = "python310"
entry_point = "entrypoint" # Set the entry point
source {
storage_source {
bucket = google_storage_bucket.code_bucket.name
object = google_storage_bucket_object.rideshare_plus_function_zip_upload.name
}
}
}
service_config {
max_instance_count = 10
min_instance_count = 1
available_memory = "256M"
timeout_seconds = 60
ingress_settings = "ALLOW_ALL"
all_traffic_on_latest_revision = true
environment_variables = {
PROJECT_ID = var.project_id,
ENV_CODE_BUCKET = "code-${var.storage_bucket}"
}
}
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload
]
}
# IAM entry for all users to invoke the function
resource "google_cloudfunctions2_function_iam_member" "rideshare_plus_function_invoker" {
project = google_cloudfunctions2_function.rideshare_plus_function.project
location = google_cloudfunctions2_function.rideshare_plus_function.location
cloud_function = google_cloudfunctions2_function.rideshare_plus_function.name
role = "roles/cloudfunctions.invoker"
member = "allUsers"
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
google_cloudfunctions2_function.rideshare_plus_function
]
}
# Update the underlying Cloud Run service to allow allUsers (used by Cloud Functions V2)
resource "google_cloud_run_service_iam_binding" "rideshare_plus_function_cloudrun" {
project = google_cloudfunctions2_function.rideshare_plus_function.project
location = google_cloudfunctions2_function.rideshare_plus_function.location
service = google_cloudfunctions2_function.rideshare_plus_function.name
role = "roles/run.invoker"
members = ["allUsers"]
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
google_cloudfunctions2_function.rideshare_plus_function
]
}
# Deploy the function (V1)
/*
resource "google_cloudfunctions_function" "rideshare_plus_function" {
project = var.project_id
region = var.cloud_function_region
name = "demo-rest-api-service"
description = "demo-rest-api-service"
runtime = "python310"
available_memory_mb = 256
source_archive_bucket = google_storage_bucket.code_bucket.name
source_archive_object = google_storage_bucket_object.rideshare_plus_function_zip_upload.name
trigger_http = true
ingress_settings = "ALLOW_ALL"
https_trigger_security_level = "SECURE_ALWAYS"
entry_point = "entrypoint"
environment_variables = {
PROJECT_ID = var.project_id
}
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload
]
}
# IAM entry for all users to invoke the function
resource "google_cloudfunctions_function_iam_member" "rideshare_plus_function_invoker" {
project = var.project_id
region = var.cloud_function_region
cloud_function = google_cloudfunctions_function.rideshare_plus_function.name
role = "roles/cloudfunctions.invoker"
member = "allUsers"
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
google_cloudfunctions_function.rideshare_plus_function
]
}
*/
####################################################################################
# BigQuery - Connections (BigLake, Functions, etc)
####################################################################################
# Cloud Function connection
# https://cloud.google.com/bigquery/docs/biglake-quickstart#terraform
resource "google_bigquery_connection" "cloud_function_connection" {
project = var.project_id
connection_id = "cloud-function"
location = var.bigquery_region
friendly_name = "cloud-function"
description = "cloud-function"
cloud_resource {}
}
# Allow service account to invoke the cloud function
resource "google_cloudfunctions_function_iam_member" "invoker" {
project = google_cloudfunctions_function.bigquery_external_function.project
region = google_cloudfunctions_function.bigquery_external_function.region
cloud_function = google_cloudfunctions_function.bigquery_external_function.name
role = "roles/cloudfunctions.invoker"
member = "serviceAccount:${google_bigquery_connection.cloud_function_connection.cloud_resource[0].service_account_id}"
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.bigquery_external_function_zip,
google_storage_bucket_object.bigquery_external_function_zip_upload,
google_cloudfunctions_function.bigquery_external_function,
google_bigquery_connection.cloud_function_connection
]
}
# Allow cloud function service account to read storage [V1 Function]
resource "google_project_iam_member" "bq_connection_iam_cloud_invoker" {
project = var.project_id
role = "roles/storage.objectViewer"
member = "serviceAccount:${var.project_id}@appspot.gserviceaccount.com"
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.bigquery_external_function_zip,
google_storage_bucket_object.bigquery_external_function_zip_upload,
google_cloudfunctions_function.bigquery_external_function,
google_bigquery_connection.cloud_function_connection
]
}
# Allow cloud function service account to call the STT API
resource "google_project_iam_member" "stt_iam_cloud_invoker" {
project = var.project_id
role = "roles/speech.client"
member = "serviceAccount:${var.project_id}@appspot.gserviceaccount.com"
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.bigquery_external_function_zip,
google_storage_bucket_object.bigquery_external_function_zip_upload,
google_cloudfunctions_function.bigquery_external_function,
google_bigquery_connection.cloud_function_connection
]
}
# Needed per https://cloud.google.com/build/docs/cloud-build-service-account-updates
resource "google_project_iam_member" "cloudfunction_builder" {
project = var.project_id
role = "roles/cloudbuild.builds.builder"
member = "serviceAccount:${var.project_number}-compute@developer.gserviceaccount.com"
}
# Needed per https://cloud.google.com/build/docs/cloud-build-service-account-updates
# Allow cloud function service account to read storage [V2 Function]
resource "google_project_iam_member" "cloudfunction_objectViewer" {
project = var.project_id
role = "roles/storage.objectViewer"
member = "serviceAccount:${var.project_number}-compute@developer.gserviceaccount.com"
depends_on = [
google_project_iam_member.cloudfunction_builder
]
}
# Allow cloud function service account to run BQ jobs
resource "google_project_iam_member" "cloud_function_bq_job_user" {
project = var.project_id
role = "roles/bigquery.jobUser"
member = "serviceAccount:${var.project_number}-compute@developer.gserviceaccount.com"
depends_on = [
google_project_iam_member.cloudfunction_objectViewer
]
}
# The cloud function needs to read/write to this bucket (code bucket)
resource "google_storage_bucket_iam_member" "function_code_bucket_storage_admin" {
bucket = google_storage_bucket.code_bucket.name
role = "roles/storage.admin"
member = "serviceAccount:${var.project_number}-compute@developer.gserviceaccount.com"
depends_on = [
google_storage_bucket.code_bucket
]
}
# Allow cloud function to access Rideshare BQ Datasets
resource "google_bigquery_dataset_access" "cloud_function_access_bq_rideshare_curated" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_curated_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = "${var.project_number}-compute@developer.gserviceaccount.com"
depends_on = [
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
google_cloudfunctions2_function.rideshare_plus_function,
google_bigquery_dataset.rideshare_lakehouse_curated_dataset
]
}
# For streaming data / view
resource "google_bigquery_dataset_access" "cloud_function_access_bq_rideshare_raw" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_raw_dataset.dataset_id
role = "roles/bigquery.dataViewer"
user_by_email = "${var.project_number}-compute@developer.gserviceaccount.com"
depends_on = [
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
google_cloudfunctions2_function.rideshare_plus_function,
google_bigquery_dataset.rideshare_lakehouse_raw_dataset
]
}
# For streaming data / view [V2 function]
resource "google_bigquery_dataset_access" "cloud_function_access_bq_taxi_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
role = "roles/bigquery.dataViewer"
user_by_email = "${var.project_number}-compute@developer.gserviceaccount.com"
depends_on = [
data.archive_file.rideshare_plus_function_zip,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
google_cloudfunctions2_function.rideshare_plus_function,
google_bigquery_dataset.taxi_dataset
]
}
# BigLake connection
resource "google_bigquery_connection" "biglake_connection" {
project = var.project_id
connection_id = "biglake-connection"
location = var.bigquery_region
friendly_name = "biglake-connection"
description = "biglake-connection"
cloud_resource {}
depends_on = [
google_bigquery_connection.cloud_function_connection
]
}
resource "time_sleep" "biglake_connection_time_delay" {
depends_on = [google_bigquery_connection.biglake_connection]
create_duration = "30s"
}
# Allow BigLake to read storage
resource "google_project_iam_member" "bq_connection_iam_object_viewer" {
project = var.project_id
role = "roles/storage.objectViewer"
member = "serviceAccount:${google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id}"
depends_on = [
time_sleep.biglake_connection_time_delay
]
}
# Allow BigLake connection to call STT
resource "google_project_iam_member" "bq_connection_iam_stt_client" {
project = var.project_id
role = "roles/speech.client"
member = "serviceAccount:${google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id}"
depends_on = [
time_sleep.biglake_connection_time_delay
]
}
# BigLake Managed Tables
resource "google_storage_bucket_iam_member" "bq_connection_mt_iam_object_owner" {
bucket = google_storage_bucket.biglake_managed_table_bucket.name
role = "roles/storage.objectAdmin"
member = "serviceAccount:${google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id}"
depends_on = [
time_sleep.biglake_connection_time_delay
]
}
# Allow BigLake to custom role
resource "google_project_iam_member" "biglake_customconnectiondelegate" {
project = var.project_id
role = google_project_iam_custom_role.customconnectiondelegate.id
member = "serviceAccount:${google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id}"
depends_on = [
time_sleep.biglake_connection_time_delay,
google_project_iam_custom_role.customconnectiondelegate
]
}
# In IAM add roles/biglake.admin to the us.biglake-connection service account
# To create the tables in BigQuery linked to BLMS
resource "google_project_iam_member" "biglake_connection_biglake_admin" {
project = var.project_id
role = "roles/biglake.admin"
member = "serviceAccount:${google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id}"
depends_on = [
time_sleep.biglake_connection_time_delay
]
}
resource "google_bigquery_dataset_access" "biglake_connection_taxi_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id
depends_on = [
time_sleep.biglake_connection_time_delay,
google_bigquery_dataset.taxi_dataset
]
}
resource "google_bigquery_dataset_access" "biglake_connection_rideshare_lakehouse_raw_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_raw_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id
depends_on = [
time_sleep.biglake_connection_time_delay,
google_bigquery_dataset.rideshare_lakehouse_raw_dataset
]
}
resource "google_bigquery_dataset_access" "biglake_connection_rideshare_lakehouse_enriched_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_enriched_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_bigquery_connection.biglake_connection.cloud_resource[0].service_account_id
depends_on = [
time_sleep.biglake_connection_time_delay,
google_bigquery_dataset.rideshare_lakehouse_enriched_dataset
]
}
# Vertex AI connection
resource "google_bigquery_connection" "vertex_ai_connection" {
project = var.project_id
connection_id = "vertex-ai"
location = var.bigquery_region
friendly_name = "vertex-ai"
description = "vertex-ai"
cloud_resource {}
depends_on = [
google_bigquery_connection.biglake_connection
]
}
resource "time_sleep" "vertex_ai_connection_time_delay" {
depends_on = [google_bigquery_connection.vertex_ai_connection]
create_duration = "30s"
}
# Allow Vertex AI connection to Vertex User
resource "google_project_iam_member" "vertex_ai_connection_vertex_user_role" {
project = var.project_id
role = "roles/aiplatform.user"
member = "serviceAccount:${google_bigquery_connection.vertex_ai_connection.cloud_resource[0].service_account_id}"
depends_on = [
time_sleep.vertex_ai_connection_time_delay
]
}
# Spark connection
resource "google_bigquery_connection" "spark_connection" {
project = var.project_id
connection_id = "spark-connection"
location = var.bigquery_region
friendly_name = "spark-connection"
description = "spark-connection"
spark {}
depends_on = [
google_bigquery_connection.vertex_ai_connection
]
}
resource "time_sleep" "spark_connection_time_delay" {
depends_on = [google_bigquery_connection.spark_connection]
create_duration = "30s"
}
# Set bucket object admin on Dataproc temp bucket
resource "google_storage_bucket_iam_member" "spark_connection_object_admin_dataproc_bucket" {
bucket = google_storage_bucket.dataproc_bucket.name
role = "roles/storage.objectAdmin"
member = "serviceAccount:${google_bigquery_connection.spark_connection.spark[0].service_account_id}"
depends_on = [
time_sleep.spark_connection_time_delay,
google_storage_bucket.dataproc_bucket
]
}
# Set bucket object admin on the Raw bucket
resource "google_storage_bucket_iam_member" "spark_connection_object_admin_raw_bucket" {
bucket = google_storage_bucket.raw_bucket.name
role = "roles/storage.objectAdmin"
member = "serviceAccount:${google_bigquery_connection.spark_connection.spark[0].service_account_id}"
depends_on = [
time_sleep.spark_connection_time_delay,
google_storage_bucket.raw_bucket
]
}
resource "google_storage_bucket_iam_member" "spark_connection_object_admin_rideshare_lakehouse_enriched" {
bucket = google_storage_bucket.rideshare_lakehouse_enriched.name
role = "roles/storage.objectAdmin"
member = "serviceAccount:${google_bigquery_connection.spark_connection.spark[0].service_account_id}"
depends_on = [
time_sleep.spark_connection_time_delay,
google_storage_bucket.rideshare_lakehouse_enriched
]
}
resource "google_storage_bucket_iam_member" "spark_connection_object_admin_rideshare_lakehouse_raw" {
bucket = google_storage_bucket.rideshare_lakehouse_raw.name
role = "roles/storage.objectAdmin"
member = "serviceAccount:${google_bigquery_connection.spark_connection.spark[0].service_account_id}"
depends_on = [
time_sleep.spark_connection_time_delay,
google_storage_bucket.rideshare_lakehouse_raw
]
}
resource "google_project_iam_member" "spark_connection_connection_admin" {
project = var.project_id
role = "roles/bigquery.connectionAdmin"
member = "serviceAccount:${google_bigquery_connection.spark_connection.spark[0].service_account_id}"
depends_on = [
time_sleep.spark_connection_time_delay,
]
}
# In IAM, grant roles/biglake.admin to the spark-connection service account
# so it can create the Iceberg catalog in BigLake Metastore (BLMS)
resource "google_project_iam_member" "spark_connection_biglake_admin" {
project = var.project_id
role = "roles/biglake.admin"
member = "serviceAccount:${google_bigquery_connection.spark_connection.spark[0].service_account_id}"
depends_on = [
time_sleep.spark_connection_time_delay,
]
}
# In IAM, grant roles/bigquery.user to the spark-connection service account
# so it can create BigQuery jobs
resource "google_project_iam_member" "spark_connection_bigquery_user" {
project = var.project_id
role = "roles/bigquery.user"
member = "serviceAccount:${google_bigquery_connection.spark_connection.spark[0].service_account_id}"
depends_on = [
time_sleep.spark_connection_time_delay,
]
}
resource "google_bigquery_dataset_access" "spark_connection_taxi_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_bigquery_connection.spark_connection.spark[0].service_account_id
depends_on = [
time_sleep.spark_connection_time_delay,
google_bigquery_dataset.taxi_dataset
]
}
resource "google_bigquery_dataset_access" "spark_connection_rideshare_lakehouse_raw_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_raw_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_bigquery_connection.spark_connection.spark[0].service_account_id
depends_on = [
time_sleep.spark_connection_time_delay,
google_bigquery_dataset.rideshare_lakehouse_raw_dataset
]
}
resource "google_bigquery_dataset_access" "spark_connection_rideshare_lakehouse_enriched_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_enriched_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_bigquery_connection.spark_connection.spark[0].service_account_id
depends_on = [
time_sleep.spark_connection_time_delay,
google_bigquery_dataset.rideshare_lakehouse_enriched_dataset
]
}
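# Illustrative only (not deployed): a minimal BigQuery stored procedure for Apache Spark
# that runs through the spark-connection. The routine id, runtime version, and PySpark body
# below are placeholders; adjust before enabling.
/*
resource "google_bigquery_routine" "spark_procedure_example" {
  project         = var.project_id
  dataset_id      = google_bigquery_dataset.taxi_dataset.dataset_id
  routine_id      = "spark_example_procedure"
  routine_type    = "PROCEDURE"
  language        = "PYTHON"
  definition_body = "print('hello from the spark-connection')"
  spark_options {
    connection      = google_bigquery_connection.spark_connection.name
    runtime_version = "2.1"
  }
  depends_on = [
    time_sleep.spark_connection_time_delay
  ]
}
*/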
####################################################################################
# BigQuery Table with Column Level Security
####################################################################################
resource "google_bigquery_table" "default" {
project = var.project_id
dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
table_id = "taxi_trips_with_col_sec"
clustering = ["Pickup_DateTime"]
schema = <<EOF
[
{
"name": "Vendor_Id",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "Pickup_DateTime",
"type": "TIMESTAMP",
"mode": "NULLABLE"
},
{
"name": "Dropoff_DateTime",
"type": "TIMESTAMP",
"mode": "NULLABLE"
},
{
"name": "Passenger_Count",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "Trip_Distance",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "Rate_Code_Id",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "Store_And_Forward",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "PULocationID",
"type": "INTEGER",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.data_masking_policy_tag.id}"]
}
},
{
"name": "DOLocationID",
"type": "INTEGER",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.data_masking_policy_tag.id}"]
}
},
{
"name": "Payment_Type_Id",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "Fare_Amount",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.low_security_policy_tag.id}"]
}
},
{
"name": "Surcharge",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.low_security_policy_tag.id}"]
}
},
{
"name": "MTA_Tax",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.low_security_policy_tag.id}"]
}
},
{
"name": "Tip_Amount",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.high_security_policy_tag.id}"]
}
},
{
"name": "Tolls_Amount",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.low_security_policy_tag.id}"]
}
},
{
"name": "Improvement_Surcharge",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.low_security_policy_tag.id}"]
}
},
{
"name": "Total_Amount",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.high_security_policy_tag.id}"]
}
},
{
"name": "Congestion_Surcharge",
"type": "FLOAT64",
"mode": "NULLABLE",
"policyTags": {
"names": ["${google_data_catalog_policy_tag.low_security_policy_tag.id}"]
}
}
]
EOF
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy,
google_data_catalog_policy_tag.low_security_policy_tag,
google_data_catalog_policy_tag.high_security_policy_tag,
]
}
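# Illustrative only (not deployed): reading the policy-tagged columns above additionally
# requires the Fine-Grained Reader role on the policy tag. A minimal sketch, assuming the
# demo user should be able to see the high-security columns:
/*
resource "google_data_catalog_policy_tag_iam_member" "high_security_fine_grained_reader_example" {
  policy_tag = google_data_catalog_policy_tag.high_security_policy_tag.name
  role       = "roles/datacatalog.categoryFineGrainedReader"
  member     = "user:${var.gcp_account_name}"
}
*/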
resource "google_bigquery_table" "taxi_trips_streaming" {
project = var.project_id
dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
table_id = "taxi_trips_streaming"
time_partitioning {
field = "timestamp"
type = "HOUR"
}
clustering = ["ride_id"]
schema = <<EOF
[
{
"name": "ride_id",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "point_idx",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "latitude",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "longitude",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "timestamp",
"type": "TIMESTAMP",
"mode": "NULLABLE"
},
{
"name": "meter_reading",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "meter_increment",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "ride_status",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "passenger_count",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "product_id",
"type": "INTEGER",
"mode": "NULLABLE"
}
]
EOF
depends_on = [
google_data_catalog_taxonomy.business_critical_taxonomy,
google_data_catalog_policy_tag.low_security_policy_tag,
google_data_catalog_policy_tag.high_security_policy_tag,
]
}
####################################################################################
# Spanner
####################################################################################
/* This is now part of a DAG
resource "google_spanner_instance" "spanner_instance" {
project = var.project_id
config = var.spanner_config
display_name = "main-instance"
processing_units = 100
}
resource "google_spanner_database" "spanner_weather_database" {
project = var.project_id
instance = google_spanner_instance.spanner_instance.name
name = "weather"
ddl = [
"CREATE TABLE weather (station_id STRING(100), station_date DATE, snow_mm_amt FLOAT64, precipitation_tenth_mm_amt FLOAT64, min_celsius_temp FLOAT64, max_celsius_temp FLOAT64) PRIMARY KEY(station_date,station_id)",
]
deletion_protection = false
depends_on = [
google_spanner_instance.spanner_instance
]
}
*/
####################################################################################
# DataFlow
####################################################################################
# Subnet for dataflow cluster
resource "google_compute_subnetwork" "dataflow_subnet" {
project = var.project_id
name = "dataflow-subnet"
ip_cidr_range = "10.4.0.0/16"
region = var.dataflow_region
network = google_compute_network.default_network.id
private_ip_google_access = true
depends_on = [
google_compute_network.default_network,
]
}
# Firewall rule for dataflow cluster
resource "google_compute_firewall" "dataflow_subnet_firewall_rule" {
project = var.project_id
name = "dataflow-firewall"
network = google_compute_network.default_network.id
allow {
protocol = "icmp"
}
allow {
protocol = "tcp"
}
allow {
protocol = "udp"
}
source_ranges = ["10.4.0.0/16"]
depends_on = [
google_compute_subnetwork.dataflow_subnet
]
}
# Service account for dataflow cluster
resource "google_service_account" "dataflow_service_account" {
project = var.project_id
account_id = "dataflow-service-account"
display_name = "Service Account for Dataflow Environment"
}
# Grant the Editor role to the Dataflow service account (broader than necessary; a narrower, commented-out alternative is sketched below)
resource "google_project_iam_member" "dataflow_service_account_editor_role" {
project = var.project_id
role = "roles/editor"
member = "serviceAccount:${google_service_account.dataflow_service_account.email}"
depends_on = [
google_service_account.dataflow_service_account
]
}
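# Illustrative only (not deployed): a narrower alternative to roles/editor. The exact set of
# roles depends on what the Dataflow jobs touch; roles/dataflow.worker plus per-bucket and
# per-dataset grants is a common starting point.
/*
resource "google_project_iam_member" "dataflow_service_account_worker_role" {
  project = var.project_id
  role    = "roles/dataflow.worker"
  member  = "serviceAccount:${google_service_account.dataflow_service_account.email}"
}
resource "google_storage_bucket_iam_member" "dataflow_service_account_raw_bucket" {
  bucket = google_storage_bucket.raw_bucket.name
  role   = "roles/storage.objectAdmin"
  member = "serviceAccount:${google_service_account.dataflow_service_account.email}"
}
*/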
####################################################################################
# Set Impersonation for BigQuery Data Transfer Service for Composer
####################################################################################
# NOTE: In order for the BigQuery Data Transfer Service agent ("service-<project-number>@gcp-sa-bigquerydatatransfer.iam.gserviceaccount.com")
# to be created, a call must be made to the service. The commented-out block below makes a "list" call that
# returns nothing, but triggers Google Cloud to create the service account, after which IAM permissions
# can be granted to it. The google_project_service_identity resource further below now creates the identity directly.
# resource "null_resource" "trigger_data_transfer_service_account_create" {
# provisioner "local-exec" {
# interpreter = ["/bin/bash","-c"]
# command = <<EOF
# if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
# then
# echo "We are not running in a local docker container. No need to login."
# else
# echo "We are running in local docker container. Logging in."
# gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
# gcloud config set account "${var.deployment_service_account_name}"
# fi
# curl "https://bigquerydatatransfer.googleapis.com/v1/projects/${var.project_id}/locations/${var.bigquery_non_multi_region}/transferConfigs" \
# --header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
# --header "Accept: application/json" \
# --compressed
# EOF
# }
# depends_on = [
# google_project_iam_member.cloudcomposer_account_service_agent_v2_ext,
# google_project_iam_member.cloudcomposer_account_service_agent,
# google_service_account.composer_service_account
# ]
# }
# Grant the Service Account Token Creator role (roles/iam.serviceAccountTokenCreator) on the Composer service account to the Google-managed service agent used by the BigQuery Data Transfer Service
resource "google_project_service_identity" "service_identity_bigquery_data_transfer" {
project = var.project_id
service = "bigquerydatatransfer.googleapis.com"
depends_on = [
google_project_iam_member.cloudcomposer_account_service_agent_v2_ext,
google_project_iam_member.cloudcomposer_account_service_agent,
google_service_account.composer_service_account
]
}
resource "time_sleep" "create_bigquerydatatransfer_account_time_delay" {
depends_on = [google_project_service_identity.service_identity_bigquery_data_transfer]
create_duration = "30s"
}
resource "google_service_account_iam_member" "service_account_impersonation" {
service_account_id = google_service_account.composer_service_account.name
role = "roles/iam.serviceAccountTokenCreator"
member = "serviceAccount:${google_project_service_identity.service_identity_bigquery_data_transfer.email}"
# "serviceAccount:service-${var.project_number}@gcp-sa-bigquerydatatransfer.iam.gserviceaccount.com"
depends_on = [time_sleep.create_bigquerydatatransfer_account_time_delay]
}
resource "google_project_iam_member" "iam_member_bigquerydatatransfer_serviceAgent" {
project = var.project_id
role = "roles/bigquerydatatransfer.serviceAgent"
member = "serviceAccount:${google_project_service_identity.service_identity_bigquery_data_transfer.email}"
depends_on = [google_service_account_iam_member.service_account_impersonation]
}
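# Illustrative only (not deployed): the token-creator grant above is what lets the BigQuery Data
# Transfer Service run transfers as the Composer service account, e.g. a scheduled query created
# by a DAG. The schedule, query, and destination below are placeholders.
/*
resource "google_bigquery_data_transfer_config" "scheduled_query_example" {
  project                = var.project_id
  display_name           = "example-scheduled-query"
  location               = var.bigquery_region
  data_source_id         = "scheduled_query"
  schedule               = "every 24 hours"
  destination_dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
  service_account_name   = google_service_account.composer_service_account.email
  params = {
    destination_table_name_template = "example_scheduled_query_output"
    write_disposition               = "WRITE_TRUNCATE"
    query                           = "SELECT CURRENT_TIMESTAMP() AS run_ts"
  }
  depends_on = [
    google_service_account_iam_member.service_account_impersonation
  ]
}
*/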
####################################################################################
# Dataplex (Tag Templates)
####################################################################################
# https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/data_catalog_tag_template
resource "google_data_catalog_tag_template" "table_dq_tag_template" {
project = var.project_id
tag_template_id = "table_dq_tag_template"
region = var.data_catalog_region
display_name = "Data-Quality-Table"
fields {
field_id = "table_name"
display_name = "Table Name"
type {
primitive_type = "STRING"
}
is_required = true
}
fields {
field_id = "record_count"
display_name = "Number of rows in the data asset"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "latest_execution_ts"
display_name = "Last Data Quality Run Date"
type {
primitive_type = "TIMESTAMP"
}
}
fields {
field_id = "columns_validated"
display_name = "Number of columns validated"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "columns_count"
display_name = "Number of columns in data asset"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "success_pct"
display_name = "Success Percentage"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "failed_pct"
display_name = "Failed Percentage"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "invocation_id"
display_name = "Data Quality Invocation Id"
type {
primitive_type = "STRING"
}
is_required = true
}
force_delete = "false"
}
resource "google_data_catalog_tag_template" "column_dq_tag_template" {
project = var.project_id
tag_template_id = "column_dq_tag_template"
region = var.data_catalog_region
display_name = "Data-Quality-Column"
fields {
field_id = "table_name"
display_name = "Table Name"
type {
primitive_type = "STRING"
}
is_required = true
}
fields {
field_id = "column_id"
display_name = "Column Name"
type {
primitive_type = "STRING"
}
}
fields {
field_id = "execution_ts"
display_name = "Last Run Date"
type {
primitive_type = "TIMESTAMP"
}
}
fields {
field_id = "rule_binding_id"
display_name = "Rule Binding"
type {
primitive_type = "STRING"
}
}
fields {
field_id = "rule_id"
display_name = "Rule Id"
type {
primitive_type = "STRING"
}
}
fields {
field_id = "dimension"
display_name = "Dimension"
type {
primitive_type = "STRING"
}
}
fields {
field_id = "rows_validated"
display_name = "Rows Validated"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "success_count"
display_name = "Success Count"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "success_pct"
display_name = "Success Percentage"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "failed_count"
display_name = "Failed Count"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "failed_pct"
display_name = "Failed Percentage"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "null_count"
display_name = "Null Count"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "null_pct"
display_name = "Null Percentage"
type {
primitive_type = "DOUBLE"
}
}
fields {
field_id = "invocation_id"
display_name = "Invocation Id"
type {
primitive_type = "STRING"
}
is_required = true
}
force_delete = "false"
depends_on = [google_data_catalog_tag_template.table_dq_tag_template]
}
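# Illustrative only (not deployed): tags built from these templates are attached to Data Catalog
# entries at runtime (for example by a data-quality job). The parent entry name and field values
# below are placeholders.
/*
resource "google_data_catalog_tag" "table_dq_tag_example" {
  parent   = "projects/<project>/locations/<region>/entryGroups/@bigquery/entries/<entry-id>"
  template = google_data_catalog_tag_template.table_dq_tag_template.id
  fields {
    field_name   = "table_name"
    string_value = "taxi_trips"
  }
  fields {
    field_name   = "invocation_id"
    string_value = "example-invocation"
  }
  fields {
    field_name   = "success_pct"
    double_value = 100
  }
}
*/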
####################################################################################
# Pub/Sub
####################################################################################
resource "google_project_service_identity" "service_identity_pub_sub" {
project = var.project_id
service = "pubsub.googleapis.com"
depends_on = [
]
}
resource "time_sleep" "create_pubsub_account_time_delay" {
depends_on = [google_project_service_identity.service_identity_pub_sub]
create_duration = "30s"
}
# Grant the Pub/Sub service agent the required role to write to the taxi dataset in BigQuery
resource "google_bigquery_dataset_access" "pubsub_access_bq_taxi_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_project_service_identity.service_identity_pub_sub.email
depends_on = [
time_sleep.create_pubsub_account_time_delay
]
}
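# Illustrative only (not deployed): the dataOwner grant above is what allows a Pub/Sub BigQuery
# subscription to write into the taxi dataset, e.g. streaming into taxi_trips_streaming. The topic
# and subscription names are placeholders, and schema/format handling (e.g. use_topic_schema) is
# omitted and must be configured to match the destination table.
/*
resource "google_pubsub_topic" "taxi_trips_topic_example" {
  project = var.project_id
  name    = "taxi-trips"
}
resource "google_pubsub_subscription" "taxi_trips_bigquery_subscription_example" {
  project = var.project_id
  name    = "taxi-trips-bigquery-subscription"
  topic   = google_pubsub_topic.taxi_trips_topic_example.id
  bigquery_config {
    table = "${var.project_id}.${google_bigquery_dataset.taxi_dataset.dataset_id}.${google_bigquery_table.taxi_trips_streaming.table_id}"
  }
  depends_on = [
    google_bigquery_dataset_access.pubsub_access_bq_taxi_dataset
  ]
}
*/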
####################################################################################
# Colab Enterprise
####################################################################################
# Subnet for colab enterprise
resource "google_compute_subnetwork" "colab_subnet" {
project = var.project_id
name = "colab-subnet"
ip_cidr_range = "10.8.0.0/16"
region = var.colab_enterprise_region
network = google_compute_network.default_network.id
private_ip_google_access = true
depends_on = [
google_compute_network.default_network,
]
}
# https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.notebookRuntimeTemplates
# NOTE: If you want a "when = destroy" example TF please see:
# https://github.com/GoogleCloudPlatform/data-analytics-golden-demo/blob/main/cloud-composer/data/terraform/dataplex/terraform.tf#L147
resource "null_resource" "colab_runtime_template" {
provisioner "local-exec" {
when = create
command = <<EOF
curl -X POST \
https://${var.colab_enterprise_region}-aiplatform.googleapis.com/ui/projects/${var.project_id}/locations/${var.colab_enterprise_region}/notebookRuntimeTemplates?notebookRuntimeTemplateId=colab-enterprise-template \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Content-Type: application/json" \
--data '{
displayName: "colab-enterprise-template",
description: "colab-enterprise-template",
isDefault: true,
machineSpec: {
machineType: "e2-highmem-4"
},
dataPersistentDiskSpec: {
diskType: "pd-standard",
diskSizeGb: 500,
},
networkSpec: {
enableInternetAccess: false,
network: "projects/${var.project_id}/global/networks/vpc-main",
subnetwork: "projects/${var.project_id}/regions/${var.colab_enterprise_region}/subnetworks/colab-subnet"
},
shieldedVmConfig: {
enableSecureBoot: true
}
}'
EOF
}
depends_on = [
google_compute_subnetwork.colab_subnet
]
}
# https://cloud.google.com/vertex-ai/docs/reference/rest/v1beta1/projects.locations.notebookRuntimes
resource "null_resource" "colab_runtime" {
provisioner "local-exec" {
when = create
command = <<EOF
curl -X POST \
https://${var.colab_enterprise_region}-aiplatform.googleapis.com/ui/projects/${var.project_id}/locations/${var.colab_enterprise_region}/notebookRuntimes:assign \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Content-Type: application/json" \
--data '{
notebookRuntimeTemplate: "projects/${var.project_number}/locations/${var.colab_enterprise_region}/notebookRuntimeTemplates/colab-enterprise-template",
notebookRuntime: {
displayName: "colab-enterprise-runtime",
description: "colab-enterprise-runtime",
runtimeUser: "${var.gcp_account_name}"
}
}'
EOF
}
depends_on = [
google_compute_subnetwork.colab_subnet,
null_resource.colab_runtime_template
]
}
####################################################################################
# Cloud Run Rideshare Plus Website
####################################################################################
# Service account
# Permissions to storage
# Permissions to BigQuery (JobUser and Datasets)
# Zip up a website
resource "google_service_account" "cloud_run_rideshare_plus_service_account" {
project = var.project_id
account_id = "rideshare-plus-service-account"
display_name = "Service Account for Rideshare Plus website"
}
# Grant access to run BigQuery Jobs
resource "google_project_iam_member" "cloud_run_rideshare_plus_service_account_jobuser" {
project = var.project_id
role = "roles/bigquery.jobUser"
member = "serviceAccount:${google_service_account.cloud_run_rideshare_plus_service_account.email}"
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account
]
}
# Allow access to read/write storage
/*
resource "google_project_iam_member" "cloud_run_rideshare_plus_service_account_objectadmin" {
project = var.project_id
role = "roles/storage.objectAdmin"
member = "serviceAccount:${google_service_account.cloud_run_rideshare_plus_service_account.email}"
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_project_iam_member.cloud_run_rideshare_plus_service_account_jobuser
]
}
*/
# The Rideshare Plus website service account needs to read/write this bucket (code bucket)
resource "google_storage_bucket_iam_member" "cloud_run_rideshare_plus_service_account_objectadmin" {
bucket = google_storage_bucket.code_bucket.name
role = "roles/storage.objectAdmin"
member = "serviceAccount:${google_service_account.cloud_run_rideshare_plus_service_account.email}"
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_storage_bucket.code_bucket,
google_project_iam_member.cloud_run_rideshare_plus_service_account_jobuser
]
}
resource "google_bigquery_dataset_access" "cloud_run_rideshare_lakehouse_curated_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_curated_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_service_account.cloud_run_rideshare_plus_service_account.email
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_bigquery_dataset.rideshare_lakehouse_curated_dataset,
google_storage_bucket_iam_member.cloud_run_rideshare_plus_service_account_objectadmin
]
}
resource "google_bigquery_dataset_access" "cloud_run_rideshare_lakehouse_enriched_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_enriched_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_service_account.cloud_run_rideshare_plus_service_account.email
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_bigquery_dataset.rideshare_lakehouse_enriched_dataset,
google_bigquery_dataset_access.cloud_run_rideshare_lakehouse_curated_dataset
]
}
resource "google_bigquery_dataset_access" "cloud_run_rideshare_lakehouse_raw_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_lakehouse_raw_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_service_account.cloud_run_rideshare_plus_service_account.email
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_bigquery_dataset.rideshare_lakehouse_raw_dataset,
google_bigquery_dataset_access.cloud_run_rideshare_lakehouse_enriched_dataset
]
}
resource "google_bigquery_dataset_access" "cloud_run_rideshare_llm_curated_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_llm_curated_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_service_account.cloud_run_rideshare_plus_service_account.email
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_bigquery_dataset.rideshare_llm_curated_dataset,
google_bigquery_dataset_access.cloud_run_rideshare_lakehouse_raw_dataset
]
}
resource "google_bigquery_dataset_access" "cloud_run_rideshare_llm_enriched_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_llm_enriched_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_service_account.cloud_run_rideshare_plus_service_account.email
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_bigquery_dataset.rideshare_llm_enriched_dataset,
google_bigquery_dataset_access.cloud_run_rideshare_llm_curated_dataset
]
}
resource "google_bigquery_dataset_access" "cloud_run_rideshare_llm_raw_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.rideshare_llm_raw_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_service_account.cloud_run_rideshare_plus_service_account.email
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_bigquery_dataset.rideshare_llm_raw_dataset,
google_bigquery_dataset_access.cloud_run_rideshare_llm_enriched_dataset
]
}
resource "google_bigquery_dataset_access" "cloud_run_taxi_dataset" {
project = var.project_id
dataset_id = google_bigquery_dataset.taxi_dataset.dataset_id
role = "roles/bigquery.dataOwner"
user_by_email = google_service_account.cloud_run_rideshare_plus_service_account.email
depends_on = [
google_service_account.cloud_run_rideshare_plus_service_account,
google_bigquery_dataset.taxi_dataset,
google_bigquery_dataset_access.cloud_run_rideshare_llm_raw_dataset
]
}
# Zip the source code
data "archive_file" "cloud_run_rideshare_website_archive_file" {
type = "zip"
source_dir = "../cloud-run/rideshare-plus-website"
output_path = "../cloud-run/rideshare-plus-website.zip"
depends_on = [
google_storage_bucket.code_bucket
]
}
# Upload code
resource "google_storage_bucket_object" "cloud_run_rideshare_website_archive_upload" {
name = "cloud-run/rideshare-plus-website/rideshare-plus-website.zip"
bucket = google_storage_bucket.code_bucket.name
source = data.archive_file.cloud_run_rideshare_website_archive_file.output_path
depends_on = [
google_storage_bucket.code_bucket,
data.archive_file.cloud_run_rideshare_website_archive_file
]
}
# Repo for Docker Image
resource "google_artifact_registry_repository" "artifact_registry_cloud_run_deploy" {
project = var.project_id
location = var.cloud_function_region
repository_id = "cloud-run-source-deploy"
description = "cloud-run-source-deploy"
format = "DOCKER"
}
# Deploy the Cloud Run web app
# This is a C# (.NET Core) MVC application
# We want Cloud Build to build an image and deploy it to Cloud Run
/*
gcloud_make = f"gcloud builds submit " + \
f"--project=\"{project_id}\" " + \
f"--pack image=\"{cloud_function_region}-docker.pkg.dev/{project_id}/cloud-run-source-deploy/rideshareplus\" " + \
f"gs://{code_bucket_name}/cloud-run/rideshare-plus-website/rideshare-plus-website.zip"
gcloud_deploy = f"gcloud run deploy demo-rideshare-plus-website " + \
f"--project=\"{project_id}\" " + \
f"--image \"{cloud_function_region}-docker.pkg.dev/{project_id}/cloud-run-source-deploy/rideshareplus\" " + \
f"--region=\"{cloud_function_region}\" " + \
f"--cpu=1 " + \
f"--allow-unauthenticated " + \
f"--service-account=\"{rideshare_plus_service_account}\" " + \
f"--set-env-vars \"ENV_PROJECT_ID={project_id}\" " + \
f"--set-env-vars \"ENV_RIDESHARE_LAKEHOUSE_CURATED_DATASET={rideshare_lakehouse_curated_dataset}\" " + \
f"--set-env-vars \"ENV_CODE_BUCKET={code_bucket_name}\" " + \
f"--set-env-vars \"ENV_RIDESHARE_LLM_CURATED_DATASET={rideshare_llm_curated_dataset}\""
*/
/*
resource "null_resource" "cloudbuild_buildpack_rideshare_plus_image" {
provisioner "local-exec" {
when = create
command = <<EOF
gcloud builds submit \
--project="${var.project_id}" \
--pack image="${var.cloud_function_region}-docker.pkg.dev/${var.project_id}/cloud-run-source-deploy/rideshareplus" \
"gs://code-${var.storage_bucket}/cloud-run/rideshare-plus-website/rideshare-plus-website.zip"
EOF
}
depends_on = [
google_artifact_registry_repository.artifact_registry_cloud_run_deploy,
google_storage_bucket.code_bucket,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
]
}
*/
# This executes a Cloud Build job via the REST API (there does not appear to be a Terraform resource for submitting this build)
# Cloud Build builds a Docker image from the .NET Core code, pushes it to our Artifact Registry repo,
# and reads the source code from GCS
# Logic (requires "jq"):
# 1. Kick off the build
# 2. Poll in a loop until the build completes
resource "null_resource" "cloudbuild_rideshareplus_docker_image" {
provisioner "local-exec" {
when = create
command = <<EOF
json=$(curl --request POST \
"https://cloudbuild.googleapis.com/v1/projects/${var.project_id}/builds" \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--data '{"source":{"storageSource":{"bucket":"${google_storage_bucket.code_bucket.name}","object":"cloud-run/rideshare-plus-website/rideshare-plus-website.zip"}},"steps":[{"name":"gcr.io/cloud-builders/docker","args":["build","-t","${var.cloud_function_region}-docker.pkg.dev/${var.project_id}/cloud-run-source-deploy/rideshareplus","."]},{"name":"gcr.io/cloud-builders/docker","args":["push","${var.cloud_function_region}-docker.pkg.dev/${var.project_id}/cloud-run-source-deploy/rideshareplus"]}]}' \
--compressed)
build_id=$(echo $${json} | jq .metadata.build.id --raw-output)
echo "build_id: $${build_id}"
# Loop while it creates
build_status_id="PENDING"
while [[ "$${build_status_id}" == "PENDING" || "$${build_status_id}" == "QUEUED" || "$${build_status_id}" == "WORKING" ]]
do
sleep 5
build_status_json=$(curl \
"https://cloudbuild.googleapis.com/v1/projects/${var.project_id}/builds/$${build_id}" \
--header "Authorization: Bearer $(gcloud auth print-access-token ${var.curl_impersonation})" \
--header "Accept: application/json" \
--compressed)
build_status_id=$(echo $${build_status_json} | jq .status --raw-output)
echo "build_status_id: $${build_status_id}"
done
if [[ "$${build_status_id}" != "SUCCESS" ]];
then
echo "Could not build the RidesharePlus Docker image with Cloud Build"
exit 1;
else
echo "Cloud Build Successful"
# For new projects you need to wait up to 240 seconds after your cloud build.
# The Cloud Run Terraform task is placed after the deployment of Composer which takes 15+ minutes to deploy.
# sleep 240
fi
EOF
}
depends_on = [
google_artifact_registry_repository.artifact_registry_cloud_run_deploy,
google_storage_bucket.code_bucket,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
]
}
resource "google_cloud_run_service" "cloud_run_service_rideshare_plus_website" {
project = var.project_id
name = "demo-rideshare-plus-website"
location = var.cloud_function_region
template {
spec {
timeout_seconds = 120
service_account_name = google_service_account.cloud_run_rideshare_plus_service_account.email
containers {
image = "${var.cloud_function_region}-docker.pkg.dev/${var.project_id}/cloud-run-source-deploy/rideshareplus"
env {
name = "ENV_PROJECT_ID"
value = var.project_id
}
env {
name = "ENV_CODE_BUCKET"
value = "code-${var.storage_bucket}"
}
env {
name = "ENV_RIDESHARE_LAKEHOUSE_CURATED_DATASET"
value = var.bigquery_rideshare_lakehouse_curated_dataset
}
env {
name = "ENV_RIDESHARE_LLM_CURATED_DATASET"
value = var.bigquery_rideshare_llm_curated_dataset
}
}
}
}
traffic {
percent = 100
latest_revision = true
}
depends_on = [
#null_resource.cloudbuild_buildpack_rideshare_plus_image,
null_resource.cloudbuild_rideshareplus_docker_image,
google_service_account.cloud_run_rideshare_plus_service_account,
google_artifact_registry_repository.artifact_registry_cloud_run_deploy,
google_storage_bucket.code_bucket,
google_storage_bucket_object.rideshare_plus_function_zip_upload,
google_composer_environment.composer_env,
]
}
output "cloud_run_service_rideshare_plus_website_url" {
value = google_cloud_run_service.cloud_run_service_rideshare_plus_website.status[0].url
}
data "google_iam_policy" "cloud_run_service_rideshare_plus_website_noauth" {
binding {
role = "roles/run.invoker"
members = [
"allUsers",
]
}
}
# Set the cloud run to allow anonymous access
resource "google_cloud_run_service_iam_policy" "google_cloud_run_service_iam_policy_noauth" {
location = google_cloud_run_service.cloud_run_service_rideshare_plus_website.location
project = google_cloud_run_service.cloud_run_service_rideshare_plus_website.project
service = google_cloud_run_service.cloud_run_service_rideshare_plus_website.name
policy_data = data.google_iam_policy.cloud_run_service_rideshare_plus_website_noauth.policy_data
depends_on = [
google_cloud_run_service.cloud_run_service_rideshare_plus_website
]
}
####################################################################################
# Outputs
####################################################################################
output "gcs_raw_bucket" {
value = google_storage_bucket.raw_bucket.name
}
output "gcs_processed_bucket" {
value = google_storage_bucket.processed_bucket.name
}
output "gcs_code_bucket" {
value = google_storage_bucket.code_bucket.name
}
output "default_network" {
value = google_compute_network.default_network.name
}
#output "nat-router" {
# value = google_compute_router.nat-router.name
#}
output "dataproc_subnet_name" {
value = google_compute_subnetwork.dataproc_subnet.name
}
output "dataproc_subnet_name_ip_cidr_range" {
value = google_compute_subnetwork.dataproc_subnet.ip_cidr_range
}
output "gcs_dataproc_bucket" {
value = google_storage_bucket.dataproc_bucket.name
}
output "dataproc_service_account" {
value = google_service_account.dataproc_service_account.email
}
output "cloudcomposer_account_service_agent_v2_ext" {
value = google_project_iam_member.cloudcomposer_account_service_agent_v2_ext.member
}
output "composer_subnet" {
value = google_compute_subnetwork.composer_subnet.name
}
output "composer_subnet_ip_cidr_range" {
value = google_compute_subnetwork.composer_subnet.ip_cidr_range
}
output "composer_service_account" {
value = google_service_account.composer_service_account.email
}
output "composer_env_name" {
value = google_composer_environment.composer_env.name
}
output "composer_env_dag_bucket" {
value = google_composer_environment.composer_env.config.0.dag_gcs_prefix
}
output "dataproc_serverless_subnet_name" {
value = google_compute_subnetwork.dataproc_serverless_subnet.name
}
output "dataproc_serverless_ip_cidr_range" {
value = google_compute_subnetwork.dataproc_serverless_subnet.ip_cidr_range
}
output "business_critical_taxonomy_aws_id" {
value = google_data_catalog_taxonomy.business_critical_taxonomy_aws.id
}
output "business_critical_taxonomy_azure_id" {
value = google_data_catalog_taxonomy.business_critical_taxonomy_azure.id
}
output "business_critical_taxonomy_id" {
value = google_data_catalog_taxonomy.business_critical_taxonomy.id
}
output "bigquery_external_function" {
value = google_cloudfunctions_function.bigquery_external_function.name
}
output "cloud_function_connection" {
value = google_bigquery_connection.cloud_function_connection.connection_id
}
output "biglake_connection" {
value = google_bigquery_connection.biglake_connection.connection_id
}
output "dataflow_subnet_name" {
value = google_compute_subnetwork.dataflow_subnet.name
}
output "dataflow_subnet_ip_cidr_range" {
value = google_compute_subnetwork.dataflow_subnet.ip_cidr_range
}
output "dataflow_service_account" {
value = google_service_account.dataflow_service_account.email
}
output "bigquery_taxi_dataset" {
value = var.bigquery_taxi_dataset
}
output "bigquery_thelook_ecommerce_dataset" {
value = var.bigquery_thelook_ecommerce_dataset
}
output "bigquery_rideshare_lakehouse_raw_dataset" {
value = var.bigquery_rideshare_lakehouse_raw_dataset
}
output "bigquery_rideshare_lakehouse_enriched_dataset" {
value = var.bigquery_rideshare_lakehouse_enriched_dataset
}
output "bigquery_rideshare_lakehouse_curated_dataset" {
value = var.bigquery_rideshare_lakehouse_curated_dataset
}
output "gcs_rideshare_lakehouse_raw_bucket" {
value = google_storage_bucket.rideshare_lakehouse_raw.name
}
output "gcs_rideshare_lakehouse_enriched_bucket" {
value = google_storage_bucket.rideshare_lakehouse_enriched.name
}
output "gcs_rideshare_lakehouse_curated_bucket" {
value = google_storage_bucket.rideshare_lakehouse_curated.name
}
output "demo_rest_api_service_uri" {
value = google_cloudfunctions2_function.rideshare_plus_function.service_config[0].uri
}
output "bigquery_rideshare_llm_raw_dataset" {
value = var.bigquery_rideshare_llm_raw_dataset
}
output "bigquery_rideshare_llm_enriched_dataset" {
value = var.bigquery_rideshare_llm_enriched_dataset
}
output "bigquery_rideshare_llm_curated_dataset" {
value = var.bigquery_rideshare_llm_curated_dataset
}
output "bigquery_cleanroom_dataset" {
value = var.bigquery_cleanroom_dataset
}