ai-infrastructure/tpu-training-on-gke/environment/cloudbuild.provision.yaml (58 lines of code) (raw):

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Cloud Build pipeline that provisions the environment in two Terraform
# stages, each preceded by a shell "prepare" step:
#   1-base-infrastructure : base infrastructure
#   2-gke-config          : JobSet/Kueue installation and GKE configuration
#
# Substitutions referenced here but NOT given a default below; they must be
# supplied when the build is submitted:
#   _AUTOMATION_BUCKET  - GCS bucket holding providers, tfvars and settings
#   _ENV_NAME           - environment name; used in the state path and tfvars
#   _AUTOMATION_ACCOUNT - service account that gcloud impersonates
#
# In the inline scripts, `${_NAME}` / `$_NAME` are expanded by Cloud Build
# before bash runs, while `$$NAME` is the escape for a literal shell `$NAME`.

steps:
  # Stage 1 (prepare): fetch provider definitions and bootstrap tfvars from
  # the automation bucket, point the Terraform backend at this environment's
  # state folder, and write the env_name tfvars file.
  - id: Prepare base infrastructure stage
    name: gcr.io/cloud-builders/gke-deploy
    entrypoint: /bin/bash
    dir: 1-base-infrastructure
    args:
      - -c
      - |
        # Abort on the first failing command. Without this the step's exit
        # status is that of the final `echo`, so a failed download would go
        # unnoticed until Terraform runs with missing inputs.
        set -eo pipefail

        gsutil cp gs://${_AUTOMATION_BUCKET}/providers/* .
        gsutil cp gs://${_AUTOMATION_BUCKET}/tfvars/0-bootstrap.auto.tfvars.json .
        sed -i "s/<FOLDER-NAME>/${_ENV_NAME}\/tfstate\/base_infra/g" backend.tf
        # ${_ENV_NAME} is substituted by Cloud Build before bash sees the
        # script, so the single quotes do not suppress it.
        echo '{"env_name":"${_ENV_NAME}"}' > env-name.auto.tfvars.json

  # Stage 1: terraform init in 1-base-infrastructure.
  - id: Provision base infra - tf init
    name: hashicorp/terraform
    dir: 1-base-infrastructure
    args:
      - init

  # Stage 1: terraform apply, non-interactive.
  - id: Provision base infra -tf apply
    name: hashicorp/terraform
    dir: 1-base-infrastructure
    args:
      - apply
      - -auto-approve

  # Stage 2 (prepare): install the JobSet and Kueue manifests with kubectl,
  # then stage providers/tfvars for the gke-config Terraform run.
  - id: Prepare GKE setup stage
    name: gcr.io/cloud-builders/gke-deploy
    entrypoint: /bin/bash
    dir: 2-gke-config
    args:
      - -c
      - |
        # Abort on the first failing command (download, credentials lookup,
        # or kubectl apply) instead of continuing with partial state.
        set -eo pipefail

        # Install JobSet and Kueue
        ## This is a mitigation to address issues with using Terraform to install CRDs.
        # A standalone jq binary is downloaded into the working directory and
        # invoked as ./jq below.
        wget -O jq https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-amd64
        chmod 755 jq

        # Cluster name/location are read from the settings JSON stored under
        # this environment's folder in the automation bucket.
        gsutil cp gs://${_AUTOMATION_BUCKET}/${_ENV_NAME}/settings/1-base-infra-settings.json .
        CLUSTER_NAME=$(./jq -r '.cluster_name' 1-base-infra-settings.json)
        CLUSTER_LOCATION=$(./jq -r '.cluster_location' 1-base-infra-settings.json)

        gcloud config set auth/impersonate_service_account "$_AUTOMATION_ACCOUNT"
        gcloud container clusters get-credentials "$$CLUSTER_NAME" --region "$$CLUSTER_LOCATION"

        kubectl apply --server-side -f \
          "https://github.com/kubernetes-sigs/jobset/releases/download/$_JOBSET_API_VERSION/manifests.yaml"
        kubectl apply --server-side -f \
          "https://github.com/kubernetes-sigs/kueue/releases/download/$_KUEUE_API_VERSION/manifests.yaml"

        # Prepare tfvars and providers for the Terraform gke-config stage
        gsutil cp gs://${_AUTOMATION_BUCKET}/tfvars/0-bootstrap.auto.tfvars.json .
        gsutil cp gs://${_AUTOMATION_BUCKET}/providers/* .
        gsutil cp gs://${_AUTOMATION_BUCKET}/${_ENV_NAME}/tfvars/1-base-infra.auto.tfvars.json .
        sed -i "s/<FOLDER-NAME>/${_ENV_NAME}\/tfstate\/gke_config/g" backend.tf
        echo '{"env_name":"${_ENV_NAME}"}' > env-name.auto.tfvars.json

  # Stage 2: terraform init in 2-gke-config.
  - id: Configure GKE - tf init
    name: hashicorp/terraform
    dir: 2-gke-config
    args:
      - init

  # Stage 2: terraform apply, non-interactive.
  - id: Configure GKE - tf apply
    name: hashicorp/terraform
    dir: 2-gke-config
    args:
      - apply
      - -auto-approve

# Default release tags for the JobSet and Kueue manifests applied in the
# "Prepare GKE setup stage" step.
substitutions:
  _JOBSET_API_VERSION: v0.4.0
  _KUEUE_API_VERSION: v0.5.3