data-analytics-demos/bigquery-data-governance/terraform-modules/sql-scripts/tf-sql-scripts.tf (272 lines of code) (raw):

#################################################################################### # Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #################################################################################### #################################################################################### # Create the GCP resources # # Author: Adam Paternostro #################################################################################### terraform { required_providers { google = { source = "hashicorp/google-beta" version = "5.35.0" } } } #################################################################################### # Variables #################################################################################### variable "gcp_account_name" {} variable "project_id" {} variable "dataplex_region" {} variable "multi_region" {} variable "bigquery_non_multi_region" {} variable "vertex_ai_region" {} variable "data_catalog_region" {} variable "appengine_region" {} variable "colab_enterprise_region" {} variable "random_extension" {} variable "project_number" {} variable "deployment_service_account_name" {} variable "terraform_service_account" {} variable "bigquery_governed_data_raw_dataset" {} variable "bigquery_governed_data_enriched_dataset" {} variable "bigquery_governed_data_curated_dataset" {} variable "bigquery_analytics_hub_publisher_dataset" {} variable "governed_data_raw_bucket" {} variable "governed_data_enriched_bucket" {} variable "governed_data_curated_bucket" {} variable "governed_data_code_bucket" {} variable "governed_data_scan_bucket" {} data "google_client_config" "current" { } #################################################################################### # UDFs #################################################################################### resource "google_bigquery_routine" "clean_llmgemini_model_result_as_json_json" { project = var.project_id dataset_id = var.bigquery_governed_data_raw_dataset routine_id = "gemini_model_result_as_json" routine_type = "SCALAR_FUNCTION" language = "SQL" definition_body = templatefile("../sql-scripts/governed_data_raw/gemini_model_result_as_json.sql", { project_id = var.project_id bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset }) arguments { name = "input" argument_kind = "FIXED_TYPE" data_type = jsonencode({ "typeKind" : "JSON" }) } return_type = "{\"typeKind\" : \"JSON\"}" } resource "google_bigquery_routine" "gemini_model_result_as_string" { project = var.project_id dataset_id = var.bigquery_governed_data_raw_dataset routine_id = "gemini_model_result_as_string" routine_type = "SCALAR_FUNCTION" language = "SQL" definition_body = templatefile("../sql-scripts/governed_data_raw/gemini_model_result_as_string.sql", { project_id = var.project_id bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset }) arguments { name = "input" argument_kind = "FIXED_TYPE" data_type = jsonencode({ "typeKind" : "JSON" }) } return_type = "{\"typeKind\" : \"STRING\"}" } #################################################################################### # RAW: Stored Procedures #################################################################################### resource "google_bigquery_routine" "initialize_raw" { project = var.project_id dataset_id = var.bigquery_governed_data_raw_dataset routine_id = "initialize" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("../sql-scripts/${var.bigquery_governed_data_raw_dataset}/initialize.sql", { project_id = var.project_id multi_region = var.multi_region bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) } #################################################################################### # ENRICHED: Stored Procedures #################################################################################### resource "google_bigquery_routine" "initialize_enriched" { project = var.project_id dataset_id = var.bigquery_governed_data_enriched_dataset routine_id = "initialize" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("../sql-scripts/${var.bigquery_governed_data_enriched_dataset}/initialize.sql", { project_id = var.project_id multi_region = var.multi_region bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) depends_on = [ google_bigquery_routine.initialize_raw ] } resource "google_bigquery_routine" "transform_customer" { project = var.project_id dataset_id = var.bigquery_governed_data_enriched_dataset routine_id = "transform_customer" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("../sql-scripts/${var.bigquery_governed_data_enriched_dataset}/transform_customer.sql", { project_id = var.project_id bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) } resource "google_bigquery_routine" "transform_product_category" { project = var.project_id dataset_id = var.bigquery_governed_data_enriched_dataset routine_id = "transform_product_category" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("../sql-scripts/${var.bigquery_governed_data_enriched_dataset}/transform_product_category.sql", { project_id = var.project_id bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) } resource "google_bigquery_routine" "transform_product" { project = var.project_id dataset_id = var.bigquery_governed_data_enriched_dataset routine_id = "transform_product" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("../sql-scripts/${var.bigquery_governed_data_enriched_dataset}/transform_product.sql", { project_id = var.project_id bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) } #################################################################################### # ENRICHED: Spark Transformation #################################################################################### resource "google_bigquery_routine" "transform_order_pyspark" { project = var.project_id dataset_id = var.bigquery_governed_data_enriched_dataset routine_id = "transform_order_pyspark" routine_type = "PROCEDURE" language = "PYTHON" definition_body = templatefile("../sql-scripts/${var.bigquery_governed_data_enriched_dataset}/transform_order_pyspark.py", { project_id = var.project_id bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) spark_options { connection = "projects/${var.project_id}/locations/${var.multi_region}/connections/spark-connection" runtime_version = "2.1" #properties = { # "dataproc:dataproc.lineage.enabled" : "true" # https://cloud.google.com/dataproc/docs/guides/lineage#enable-data-lineage-at-the-cluster-level #} properties = { "spark.openlineage.namespace" : "${var.project_id}" "spark.openlineage.appName": "transform_order_pyspark" } } } # Spark code for dataproc cluster resource "google_storage_bucket_object" "dataproc_transform_order_pyspark" { name = "dataproc/transform_order_pyspark.py" content = templatefile("../dataproc/transform_order_pyspark.py", { project_id = var.project_id bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) bucket = var.governed_data_code_bucket } #################################################################################### # CURATED: Stored Procedures #################################################################################### resource "google_bigquery_routine" "initialize_curated" { project = var.project_id dataset_id = var.bigquery_governed_data_curated_dataset routine_id = "initialize" routine_type = "PROCEDURE" language = "SQL" definition_body = templatefile("../sql-scripts/${var.bigquery_governed_data_curated_dataset}/initialize.sql", { project_id = var.project_id multi_region = var.multi_region bigquery_governed_data_raw_dataset = var.bigquery_governed_data_raw_dataset bigquery_governed_data_enriched_dataset = var.bigquery_governed_data_enriched_dataset bigquery_governed_data_curated_dataset = var.bigquery_governed_data_curated_dataset bigquery_analytics_hub_publisher_dataset = var.bigquery_analytics_hub_publisher_dataset governed_data_raw_bucket = var.governed_data_raw_bucket governed_data_enriched_bucket = var.governed_data_enriched_bucket governed_data_curated_bucket = var.governed_data_curated_bucket governed_data_code_bucket = var.governed_data_code_bucket governed_data_scan_bucket = var.governed_data_scan_bucket }) depends_on = [ google_bigquery_routine.initialize_raw, google_bigquery_routine.initialize_enriched ] } #################################################################################### # Invoke Initalize SP #################################################################################### /* THIS RUNS OVER AND OVER AGAIN (for each TF execution) WHICH WILL OVERWRITE THE DATA data "google_client_config" "current" { } # Call the BigQuery initialize stored procedure to initialize the system data "http" "call_sp_initialize" { url = "https://bigquery.googleapis.com/bigquery/v2/projects/${var.project_id}/jobs" method = "POST" request_headers = { Accept = "application/json" Authorization = "Bearer ${data.google_client_config.current.access_token}" } request_body = "{\"configuration\":{\"query\":{\"query\":\"CALL `${var.project_id}.${var.bigquery_governed_data_raw_dataset}.initialize`();\",\"useLegacySql\":false}}}" depends_on = [ google_bigquery_routine.initialize ] } */ resource "null_resource" "call_sp_initialize" { provisioner "local-exec" { when = create command = <<EOF curl -X POST \ https://bigquery.googleapis.com/bigquery/v2/projects/${var.project_id}/jobs \ --header "Authorization: Bearer ${data.google_client_config.current.access_token}" \ --header "Content-Type: application/json" \ --data '{ "configuration" : { "query" : { "query" : "CALL `${var.project_id}.${var.bigquery_governed_data_raw_dataset}.initialize`();", "useLegacySql" : false } } }' EOF } depends_on = [ google_bigquery_routine.initialize_raw, google_bigquery_routine.initialize_enriched, google_bigquery_routine.transform_product, google_bigquery_routine.transform_product_category, google_bigquery_routine.transform_customer, google_bigquery_routine.transform_order_pyspark, google_bigquery_routine.initialize_curated, ] }