infra-as-code/modules/ingest-pipeline/main.tf (520 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Ingest pipeline: an Eventarc trigger on the formatted-audio bucket starts a
# Workflows orchestration that is handed the URIs of the Cloud Functions
# declared below, plus the buckets, DLP templates and secret they rely on.

locals {
  # Timeout (seconds) applied to every Cloud Function declared in this file.
  timeout_seconds = 3600

  # Not referenced in this file; kept because other files of the module may use it.
  scheduler_timeout = 1800

  # Built-in DLP infoTypes shared by the inspect and de-identify templates.
  # Keeping a single list guarantees both templates stay in sync.
  dlp_builtin_info_types = [
    "AGE",
    "BLOOD_TYPE",
    "CREDIT_CARD_NUMBER",
    "DATE_OF_BIRTH",
    "EMAIL_ADDRESS",
    "FEMALE_NAME",
    "FINANCIAL_ACCOUNT_NUMBER",
    "FIRST_NAME",
    "GENDER",
    "LAST_NAME",
    "LOCATION_COORDINATES",
    "MALE_NAME",
    "MARITAL_STATUS",
    "MEDICAL_RECORD_NUMBER",
    "PERSON_NAME",
    "PHONE_NUMBER",
    "STREET_ADDRESS",
    "US_HEALTHCARE_NPI",
    "US_MEDICARE_BENEFICIARY_ID_NUMBER",
    "US_SOCIAL_SECURITY_NUMBER",
    "US_STATE",
    "US_TOLLFREE_PHONE_NUMBER",
  ]
}

# Workflow definition shipped alongside this module.
data "local_file" "orchestration" {
  filename = "${path.module}/workflow/orchestration.yaml"
}

# Pre-existing service account used by the workflow and all Cloud Functions.
data "google_service_account" "ccai_insights_sa" {
  account_id = var.service_account_id
}

# Starts the orchestration workflow whenever an object is finalized in the
# formatted-audio bucket.
resource "google_eventarc_trigger" "primary" {
  name            = var.pipeline_name
  location        = var.region
  service_account = var.service_account_email

  matching_criteria {
    attribute = "type"
    value     = "google.cloud.storage.object.v1.finalized"
  }

  matching_criteria {
    attribute = "bucket"
    value     = module.formatted_bucket.name
  }

  destination {
    workflow = google_workflows_workflow.orchestration.id
  }
}

# Orchestration workflow; the Cloud Function URIs are injected as environment
# variables so the YAML definition does not hard-code them.
resource "google_workflows_workflow" "orchestration" {
  project         = var.project_id
  name            = var.pipeline_name
  region          = var.region
  service_account = data.google_service_account.ccai_insights_sa.id
  source_contents = data.local_file.orchestration.content

  user_env_vars = {
    cf_ccai_conversation_upload_url = module.cf_conversation_upload.uri
    cf_genai_url                    = module.cf_genai_transcript_fix.uri
    cf_stt_url                      = module.cf_stt_transcript.uri
    cf_feedback_generator_url       = module.cf_feedback_generator.uri
    cf_audio_redaction_url          = module.cf_audio_redaction.uri
    insights_endpoint               = var.insights_endpoint
  }

  depends_on = [
    module.cf_conversation_upload
  ]
}

# Suffix for the conversation-upload bundle bucket name.
resource "random_string" "random" {
  length  = 5
  special = false
  lower   = true
}

# CCAI conversation upload Cloud Function
module "cf_bundle_bucket" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id = var.project_id
  name       = "cf-ccai-conversation-upload-bucket-${random_string.random.result}"
  location   = "US"
  versioning = true
}

module "cf_conversation_upload" {
  source      = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/cloud-function-v2?ref=v31.1.0&depth=1"
  project_id  = var.project_id
  region      = var.region
  name        = var.pipeline_name
  bucket_name = module.cf_bundle_bucket.name

  bundle_config = {
    source_dir  = "${path.module}/cf-ccai-conversation-upload"
    output_path = "${path.module}/cf-ccai-conversation-upload/bundle.zip"
  }

  service_account = data.google_service_account.ccai_insights_sa.email

  function_config = {
    timeout_seconds = local.timeout_seconds
    instance_count  = 250
    memory_mb       = 2048
    cpu             = "1"
  }

  environment_variables = {
    PROJECT_ID                  = var.project_id
    INSIGHTS_ENDPOINT           = var.insights_endpoint
    INSIGHTS_API_VERSION        = var.insights_api_version
    CCAI_INSIGHTS_PROJECT_ID    = var.ccai_insights_project_id
    CCAI_INSIGHTS_LOCATION_ID   = var.ccai_insights_location_id
    INGEST_RECORD_BUCKET_ID     = module.ingest_record_bucket.name
    REDACTED_AUDIOS_BUCKET_NAME = module.redacted_audio_bucket.name
  }
}

# DLP inspection template: the built-in infoTypes from
# local.dlp_builtin_info_types plus three regex-based custom infoTypes.
resource "google_data_loss_prevention_inspect_template" "custom" {
  parent       = "projects/${var.project_id}/locations/${var.region}"
  description  = "DLP inspection template"
  display_name = "inspect_template"

  inspect_config {
    dynamic "info_types" {
      for_each = local.dlp_builtin_info_types
      content {
        name = info_types.value
      }
    }

    rule_set {
      info_types {
        name = "PERSON_NAME"
      }
      info_types {
        name = "FIRST_NAME"
      }
      info_types {
        name = "LAST_NAME"
      }

      rules {
        exclusion_rule {
          dictionary {
            word_list {
              # NOTE(review): the only entry is an empty string. The DLP API
              # documents that each dictionary phrase needs at least two
              # letters or digits - confirm this placeholder is accepted, or
              # replace it with the real exclusion words.
              words = [""]
            }
          }
          matching_type = "MATCHING_TYPE_FULL_MATCH"
        }
      }
    }

    custom_info_types {
      info_type {
        name = "SPELLED_NAME"
      }
      regex {
        pattern = "[A-Z][A-Z-]+"
      }
    }

    custom_info_types {
      info_type {
        name = "EMAIL_AT"
      }
      regex {
        pattern = "[\\w.-]+ ?(at) ?[\\w.-]+\\.[a-z]+"
      }
    }

    custom_info_types {
      info_type {
        name = "NUMBERS_SEPARATED_BY_SLASH"
      }
      regex {
        pattern = "\\d+(?:/\\d+)+"
      }
    }

    include_quote = true
  }
}

# DLP de-identification template: replaces every finding of the built-in and
# custom infoTypes with the infoType name.
resource "google_data_loss_prevention_deidentify_template" "basic" {
  parent       = "projects/${var.project_id}/locations/${var.region}"
  description  = "DLP de-identification template"
  display_name = "deidentification_template"

  deidentify_config {
    info_type_transformations {
      transformations {
        dynamic "info_types" {
          # Built-in infoTypes first, then the custom ones declared in the
          # inspect template, preserving the original ordering.
          for_each = concat(
            local.dlp_builtin_info_types,
            ["SPELLED_NAME", "EMAIL_AT", "NUMBERS_SEPARATED_BY_SLASH"],
          )
          content {
            name = info_types.value
          }
        }

        primitive_transformation {
          replace_with_info_type_config = true
        }
      }
    }
  }
}

# STT Transcript Cloud Function
# This random suffix is shared by several bucket names further down the file.
resource "random_id" "bucket_ext" {
  byte_length = 4
}

module "cf_stt_bundle_bucket" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id = var.project_id
  name       = "cf-stt-bucket-${random_id.bucket_ext.id}"
  location   = "US"
  versioning = true
}

module "cf_stt_transcript" {
  source      = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/cloud-function-v2?ref=v31.1.0&depth=1"
  project_id  = var.project_id
  region      = var.region
  name        = var.stt_function_name
  bucket_name = module.cf_stt_bundle_bucket.name

  bundle_config = {
    source_dir  = "${path.module}/cf-stt-transcript"
    output_path = "${path.module}/cf-stt-transcript/bundle.zip"
  }

  service_account = data.google_service_account.ccai_insights_sa.email

  function_config = {
    timeout_seconds = local.timeout_seconds
    instance_count  = 250
    runtime         = "python312"
    memory_mb       = 2048
    cpu             = "1"
  }

  environment_variables = {
    PROJECT_ID              = var.project_id
    TRANSCRIPT_BUCKET_ID    = module.transcript_bucket.name
    RECOGNIZER_PATH         = var.recognizer_path
    INGEST_RECORD_BUCKET_ID = module.ingest_record_bucket.name
  }
}

# GenAI Cloud function business key word fix
resource "random_id" "genai_bucket_ext" {
  byte_length = 4
}

module "cf_genai_bundle_bucket" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id = var.project_id
  name       = "cf-genai-bucket-${random_id.genai_bucket_ext.id}"
  location   = "US"
  versioning = true
}

module "cf_genai_transcript_fix" {
  source      = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/cloud-function-v2?ref=v31.1.0&depth=1"
  project_id  = var.project_id
  region      = var.region
  name        = var.genai_function_name
  bucket_name = module.cf_genai_bundle_bucket.name

  bundle_config = {
    source_dir  = "${path.module}/cf-transcript-correction"
    output_path = "${path.module}/cf-transcript-correction/bundle.zip"
  }

  service_account = data.google_service_account.ccai_insights_sa.email

  function_config = {
    timeout_seconds = local.timeout_seconds
    entry_point     = "main"
    runtime         = "python312"
    memory_mb       = 2048
    cpu             = "1"
    instance_count  = 250
  }

  environment_variables = {
    PROJECT_ID                  = var.project_id
    LOCATION_ID                 = var.region
    MODEL_NAME                  = var.model_name
    INGEST_RECORD_BUCKET_ID     = module.ingest_record_bucket.name
    CLIENT_SPECIFIC_CONSTRAINTS = var.client_specific_constraints
    CLIENT_SPECIFIC_CONTEXT     = var.client_specific_context
    FEW_SHOT_EXAMPLES           = var.few_shot_examples
  }
}

# Feedback generator Cloud Function
module "cf_feedback_generator_bundle_bucket" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id = var.project_id
  name       = "cf-feedback-generator-bucket-${random_id.bucket_ext.id}"
  location   = "US"
  versioning = true
}

module "cf_feedback_generator" {
  source      = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/cloud-function-v2?ref=v31.1.0&depth=1"
  project_id  = var.project_id
  region      = var.region
  name        = var.feedback_generator_function_name
  bucket_name = module.cf_feedback_generator_bundle_bucket.name

  bundle_config = {
    source_dir  = "${path.module}/cf-feedback-generator"
    output_path = "${path.module}/cf-feedback-generator/bundle.zip"
  }

  service_account = data.google_service_account.ccai_insights_sa.email

  function_config = {
    timeout_seconds = local.timeout_seconds
    instance_count  = 250
    memory_mb       = 8192
    cpu             = "2"
  }

  # Entries are newline-separated (no trailing commas) to match the other
  # functions in this file.
  environment_variables = {
    PROJECT_ID                = var.project_id
    INSIGHTS_ENDPOINT         = var.insights_endpoint
    INSIGHTS_API_VERSION      = var.insights_api_version
    CCAI_INSIGHTS_LOCATION_ID = var.ccai_insights_location_id
    LOCATION_ID               = var.region
    MODEL_NAME                = var.model_name
    DATASET_NAME              = var.dataset_name
    FEEDBACK_TABLE_NAME       = var.feedback_table_name
    SCORECARD_ID              = var.scorecard_id
    INGEST_RECORD_BUCKET_ID   = module.ingest_record_bucket.name
    TARGET_TAGS               = var.target_tags
    TARGET_VALUES             = var.target_values
  }
}

# Grants the pipeline service account the Contact Center Insights editor role
# on the Insights project.
resource "google_project_iam_member" "ccai_insights_editor" {
  project = var.ccai_insights_project_id
  role    = "roles/contactcenterinsights.editor"
  member  = "serviceAccount:${data.google_service_account.ccai_insights_sa.email}"
}

# Audio Redaction Cloud Function
resource "random_id" "audio_redaction_bucket_ext" {
  byte_length = 4
}

module "cf_audio_redaction_bundle_bucket" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id = var.project_id
  name       = "cf-audio-redaction-bucket-${random_id.audio_redaction_bucket_ext.id}"
  location   = "US"
  versioning = true
}

module "cf_audio_redaction" {
  source      = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/cloud-function-v2?ref=v31.1.0&depth=1"
  project_id  = var.project_id
  region      = var.region
  name        = var.audio_redaction_function_name
  bucket_name = module.cf_audio_redaction_bundle_bucket.name

  bundle_config = {
    source_dir  = "${path.module}/cf-audio-redaction"
    output_path = "${path.module}/cf-audio-redaction/bundle.zip"
  }

  service_account = data.google_service_account.ccai_insights_sa.email

  function_config = {
    timeout_seconds = local.timeout_seconds
    instance_count  = 250
    runtime         = "python312"
    memory_mb       = 2048
    cpu             = "1"
  }

  environment_variables = {
    PROJECT_ID                  = var.project_id
    TRANSCRIPT_BUCKET_ID        = module.transcript_bucket.name
    REDACTED_AUDIOS_BUCKET_NAME = module.redacted_audio_bucket.name
  }
}

# Bucket for the output of the STT Transcript in json format
module "transcript_bucket" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id = var.project_id
  name       = "stt-transcript-${random_id.bucket_ext.id}-${var.env}"
  location   = "US"
  versioning = true
}

# Buckets for the audio formatting cloud function
module "trigger_bucket" {
  source        = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id    = var.project_id
  name          = "original-audio-files-${random_id.bucket_ext.id}-${var.env}"
  location      = var.region # The trigger must be in the same location as the bucket
  storage_class = "REGIONAL"
  versioning    = true
}

module "formatted_bucket" {
  source        = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id    = var.project_id
  name          = "formatted-audio-files-${random_id.bucket_ext.id}-${var.env}"
  location      = var.region
  storage_class = "REGIONAL"
  versioning    = true
}

module "meta_bucket" {
  source        = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id    = var.project_id
  name          = "formatted-audio-metadata-${random_id.bucket_ext.id}-${var.env}"
  location      = var.region
  storage_class = "REGIONAL"
  versioning    = true
}

module "redacted_audio_bucket" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id = var.project_id
  # GCS bucket names are globally unique, so a fixed name collides between
  # environments and projects. Use the same random/env suffix as the other
  # data buckets. NOTE: on an existing deployment this renames (replaces) the
  # bucket - migrate its contents before applying.
  name          = "redacted-audio-files-${random_id.bucket_ext.id}-${var.env}"
  location      = var.region
  storage_class = "REGIONAL"
  versioning    = true
}

# Secret manager
module "secret_manager_hash_key" {
  source     = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/secret-manager?ref=v31.1.0&depth=1"
  project_id = var.project_id

  secrets = {
    (var.hash_secret_name) = {
      locations = null
      keys      = null
    }
  }

  versions = {
    (var.hash_secret_name) = {
      "latest" = {
        enabled = true
        data    = var.hash_key
      }
    }
  }
}

# Audio format-change function; it receives the secret *name* as hash_key,
# not the secret value.
module "audio_data_format_change" {
  source                    = "../../modules/audio-data-format-change"
  project_id                = var.project_id
  region                    = var.region
  env                       = var.env
  service_account_email     = data.google_service_account.ccai_insights_sa.email
  function_name             = "audio-format-change"
  formatted_audio_bucket_id = module.formatted_bucket.name
  metadata_bucket_id        = module.meta_bucket.name
  ingest_record_bucket_id   = module.ingest_record_bucket.name
  number_of_channels        = 2
  hash_key                  = var.hash_secret_name
  trigger_bucket_name       = module.trigger_bucket.name
}

# Ingest-record bucket referenced by the Cloud Functions above.
module "ingest_record_bucket" {
  source        = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/gcs?ref=v31.1.0&depth=1"
  project_id    = var.project_id
  name          = "ingest-record-bucket-${random_id.bucket_ext.id}-${var.env}"
  location      = var.region
  storage_class = "REGIONAL"
  versioning    = true
}