# deprecated-code/terraform-modules/deploy-files/tf-deploy-files.tf

####################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
####################################################################################

####################################################################################
# Create the GCP resources
#
# Author: Adam Paternostro
####################################################################################

# Need this version to implement
terraform {
  required_providers {
    google = {
      source  = "hashicorp/google-beta"
      version = "5.35.0"
    }
  }
}

####################################################################################
# Variables
####################################################################################
variable "project_id" {}
variable "storage_bucket" {}
variable "random_extension" {}
variable "deployment_service_account_name" {}
variable "composer_name" {}
variable "composer_dag_bucket" {}

####################################################################################
# Deploy "data" and "scripts"
####################################################################################

# Upload the Airflow initial DAGs needed to run the system (dependencies of run-all-dags)
# Uploading all the DAGs at once can cause issues: the Airflow instance is sized so small
# that they cannot all sync before run-all-dags is launched
resource "null_resource" "deploy_initial_airflow_dags" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

gsutil cp ../cloud-composer/dags/step-*.py ${var.composer_dag_bucket}
gsutil cp ../cloud-composer/dags/sample-dataflow-start-streaming-job.py ${var.composer_dag_bucket}
EOF
  }
}
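
# NOTE: The gcloud login block above is repeated verbatim in every local-exec provisioner
# in this file. A possible refactor (a sketch only, not part of the original module, and
# not referenced by anything below) would factor it into a local value and interpolate
# local.gcloud_login_script at the top of each heredoc.
locals {
  # Reusable bash snippet: authenticates the deployment service account when running
  # inside a local docker container (i.e. when GOOGLE_APPLICATION_CREDENTIALS is set).
  gcloud_login_script = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi
EOF
}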

# Upload the Airflow "data/template" files
# The data folder is at the same path as the DAGs, but just has "data" as the folder name
resource "null_resource" "deploy_initial_airflow_dags_data" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

gsutil cp -r ../cloud-composer/data/* ${replace(var.composer_dag_bucket, "/dags", "/data")}
EOF
  }
}


# Upload the PySpark scripts
resource "null_resource" "deploy_dataproc_scripts" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

gsutil cp ../dataproc/* gs://raw-${var.storage_bucket}/pyspark-code/
EOF
  }
}


# Download the BigQuery Spark JAR file
# Download the Iceberg JAR file
resource "null_resource" "deploy_dataproc_jars" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

curl -L https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/0.14.0/iceberg-spark-runtime-3.1_2.12-0.14.0.jar --output iceberg-spark-runtime-3.1_2.12-0.14.0.jar
curl -L https://github.com/GoogleCloudDataproc/spark-bigquery-connector/releases/download/0.26.0/spark-bigquery-with-dependencies_2.12-0.26.0.jar --output spark-bigquery-with-dependencies_2.12-0.26.0.jar
gsutil cp *.jar gs://raw-${var.storage_bucket}/pyspark-code/
EOF
  }
}


# Upload the Dataflow scripts
resource "null_resource" "deploy_dataflow_scripts" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

gsutil cp ../dataflow/* gs://raw-${var.storage_bucket}/dataflow/
EOF
  }
}


# Upload the Dataplex scripts
data "template_file" "dataplex_data_quality_template" {
  template = file("../dataplex/data-quality/dataplex_data_quality_taxi.yaml")

  vars = {
    project_id       = var.project_id
    dataplex_region  = "REPLACE-REGION"
    random_extension = var.random_extension
  }
}

resource "google_storage_bucket_object" "dataplex_data_quality_yaml" {
  name    = "dataplex/data-quality/dataplex_data_quality_taxi.yaml"
  content = data.template_file.dataplex_data_quality_template.rendered
  bucket  = "code-${var.storage_bucket}"
}
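
# NOTE: The hashicorp/template provider behind the data "template_file" block above is
# deprecated. A hedged alternative sketch (not part of the original deployment, commented
# out so it does not run) uses the built-in templatefile() function instead; it assumes
# the placeholders in the YAML match the variable names passed in.
/*
resource "google_storage_bucket_object" "dataplex_data_quality_yaml" {
  name   = "dataplex/data-quality/dataplex_data_quality_taxi.yaml"
  bucket = "code-${var.storage_bucket}"

  # templatefile() renders the YAML at plan time, replacing the data source + rendered attribute.
  content = templatefile("../dataplex/data-quality/dataplex_data_quality_taxi.yaml", {
    project_id       = var.project_id
    dataplex_region  = "REPLACE-REGION"
    random_extension = var.random_extension
  })
}
*/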

# Replace the Bucket Name in the Jupyter notebooks
resource "null_resource" "deploy_vertex_notebooks" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

find ../notebooks -type f -name "*.ipynb" -print0 | while IFS= read -r -d '' file; do
    echo "Notebook Replacing: $${file}"
    searchString="../notebooks/"
    replaceString="../notebooks-with-substitution/"
    destFile=$(echo "$${file//$searchString/$replaceString}")
    echo "destFile: $${destFile}"
    sed "s/REPLACE-BUCKET-NAME/processed-${var.storage_bucket}/g" "$${file}" > "$${destFile}.tmp"
    sed "s/REPLACE-PROJECT-ID/${var.project_id}/g" "$${destFile}.tmp" > "$${destFile}"
done

gsutil cp ../notebooks-with-substitution/*.ipynb gs://processed-${var.storage_bucket}/notebooks/
EOF
  }
}


# Replace the Bucket Name in the BigSpark scripts
resource "null_resource" "deploy_bigspark" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

find ../bigspark -type f -name "*.py" -print0 | while IFS= read -r -d '' file; do
    echo "BigSpark Replacing: $${file}"
    searchString="../bigspark/"
    replaceString="../bigspark-with-substitution/"
    destFile=$(echo "$${file//$searchString/$replaceString}")
    echo "destFile: $${destFile}"
    sed "s/REPLACE-BUCKET-NAME/raw-${var.storage_bucket}/g" "$${file}" > "$${destFile}.tmp"
    sed "s/REPLACE-PROJECT-ID/${var.project_id}/g" "$${destFile}.tmp" > "$${destFile}"
done

gsutil cp ../bigspark-with-substitution/*.py gs://raw-${var.storage_bucket}/bigspark/
gsutil cp ../bigspark/*.csv gs://raw-${var.storage_bucket}/bigspark/
EOF
  }
}
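
# NOTE: The find/sed loops above shell out to rewrite placeholders before uploading.
# A hypothetical, Terraform-native sketch of the same substitution (shown for the
# notebooks only, commented out so it does not run) uses fileset(), for_each and
# replace(); the resource name "notebooks_with_substitution" is an assumption.
/*
resource "google_storage_bucket_object" "notebooks_with_substitution" {
  # One object per notebook found under ../notebooks
  for_each = fileset("../notebooks", "*.ipynb")

  name   = "notebooks/${each.value}"
  bucket = "processed-${var.storage_bucket}"

  # Nested replace() calls swap both placeholders in the notebook contents.
  content = replace(
    replace(file("../notebooks/${each.value}"), "REPLACE-BUCKET-NAME", "processed-${var.storage_bucket}"),
    "REPLACE-PROJECT-ID",
    var.project_id
  )
}
*/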

# Upload the sample Delta.io files
# The manifest files need to have the GCS bucket name updated
resource "null_resource" "deploy_delta_io_files" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

cp -rf ../sample-data/rideshare_trips/* ../sample-data/rideshare_trips-with-substitution

find ../sample-data/rideshare_trips/_symlink_format_manifest -type f -name "*" -print0 | while IFS= read -r -d '' file; do
    echo "Updating Manifest file: $${file}"
    searchString="../sample-data/rideshare_trips"
    replaceString="../sample-data/rideshare_trips-with-substitution"
    destFile=$(echo "$${file//$searchString/$replaceString}")
    echo "destFile: $${destFile}"
    sed "s/REPLACE-BUCKET-NAME/processed-${var.storage_bucket}/g" "$${file}" > "$${destFile}"
done

gsutil cp -r ../sample-data/rideshare_trips-with-substitution/* gs://processed-${var.storage_bucket}/delta_io/rideshare_trips/
gsutil rm gs://processed-${var.storage_bucket}/delta_io/rideshare_trips/README.md
EOF
  }
}


# You need to wait for Airflow to read the DAGs just uploaded
# Only a few DAGs are uploaded so that we can sync quicker
resource "time_sleep" "wait_for_airflow_dag_sync" {
  depends_on = [
    null_resource.deploy_initial_airflow_dags,
    null_resource.deploy_initial_airflow_dags_data
  ]
  # This is just a "guess" and might need to be extended. The Composer (Airflow) cluster is
  # sized very small, so it takes longer to sync the DAG files.
  create_duration = "180s"
}


# Kick off the Airflow DAG
# The run-all-dags DAG has been scheduled for @once.
# The commands below do not work for "No External IPs" deployments.
/*
resource "null_resource" "run_airflow_dag" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

gcloud composer environments run ${var.composer_name} --project ${var.project_id} --location ${var.composer_region} dags trigger -- run-all-dags
EOF
  }
  depends_on = [
    time_sleep.wait_for_airflow_dag_sync
  ]
}
*/


# Deploy all the remaining DAGs (hopefully the initial ones have synced)
# When the run-all-dags DAG deploys, it should run automatically
resource "null_resource" "deploy_all_airflow_dags" {
  provisioner "local-exec" {
    interpreter = ["/bin/bash", "-c"]
    command     = <<EOF
if [ -z "$${GOOGLE_APPLICATION_CREDENTIALS}" ]
then
    echo "We are not running in a local docker container. No need to login."
else
    echo "We are running in a local docker container. Logging in."
    gcloud auth activate-service-account "${var.deployment_service_account_name}" --key-file="$${GOOGLE_APPLICATION_CREDENTIALS}" --project="${var.project_id}"
    gcloud config set account "${var.deployment_service_account_name}"
fi

gsutil cp -n ../cloud-composer/dags/* ${var.composer_dag_bucket}
EOF
  }
  depends_on = [
    time_sleep.wait_for_airflow_dag_sync
  ]
}
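
# NOTE: A hypothetical example of how a root module might call this module. Every value
# below is a placeholder/assumption (project, bucket names, service account, Composer
# environment and DAG bucket), not part of the original deployment, so the block is
# commented out.
/*
module "deploy_files" {
  source = "./deprecated-code/terraform-modules/deploy-files"

  project_id                      = "my-project-id"
  storage_bucket                  = "my-project-id-abc123"
  random_extension                = "abc123"
  deployment_service_account_name = "deploy-sa@my-project-id.iam.gserviceaccount.com"
  composer_name                   = "data-analytics-composer"
  composer_dag_bucket             = "gs://us-central1-composer-bucket/dags"
}
*/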