cloudbuild.yaml (126 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# yamllint disable
steps:
# ================================ CANCEL CURRENT BUILD IF ANY ONGOING ==================================================
# Guard step: lists the ongoing builds whose TRIGGER_NAME substitution matches
# this build's trigger and, when more than one is found, cancels this build
# (the current build presumably counts itself, hence the "> 1" threshold).
# $TRIGGER_NAME and $BUILD_ID are filled in by Cloud Build; build_count is a
# plain bash variable.
- name: 'gcr.io/cloud-builders/gcloud'
  entrypoint: 'bash'
  args:
  - '-c'
  - |
    build_count=$(gcloud builds list --ongoing --format='value(id)' --filter="substitutions.TRIGGER_NAME=$TRIGGER_NAME" | wc -l)
    if [ "$build_count" -gt 1 ]; then
      echo "Cancelling current build as build count > 1: $build_count"
      gcloud builds cancel $BUILD_ID
    fi
# ================================ DOCKER CONTAINER BUILDS ==============================================================
# Builds the integration-tests image from docker/integration_tests.Dockerfile,
# tagged with the short commit SHA.
# TODO: we don't necessarily need to build this every time, it will barely ever change
- id: 'integration_tests_build'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'build'
  - '-t'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA'
  - '-f'
  - 'docker/integration_tests.Dockerfile'
  - '.'
  # '-' waits for nothing, so this step starts as soon as the build begins.
  waitFor: ['-']
# Builds the health-runner image from docker/health_runner.Dockerfile, tagged
# with the short commit SHA.
- id: 'health_runner_build'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'build'
  - '-t'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/health-runner:$SHORT_SHA'
  - '-f'
  - 'docker/health_runner.Dockerfile'
  - '.'
  # '-' waits for nothing; without it this step would wait for every step
  # listed before it instead of starting immediately.
  waitFor: ['-']
# Builds the gpu-healthcheck image from docker/gpu_healthcheck.Dockerfile,
# tagged with the short commit SHA.
- id: 'gpu_healthcheck_build'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'build'
  - '-t'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/gpu-healthcheck:$SHORT_SHA'
  - '-f'
  - 'docker/gpu_healthcheck.Dockerfile'
  - '.'
  # Every image build uses waitFor '-' (wait for nothing), which lets them
  # all run in parallel.
  waitFor: ['-']
# Builds the nccl-healthcheck image from docker/nccl_healthcheck.Dockerfile,
# tagged with the short commit SHA.
- id: 'nccl_healthcheck_build'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'build'
  - '-t'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/nccl-healthcheck:$SHORT_SHA'
  - '-f'
  - 'docker/nccl_healthcheck.Dockerfile'
  - '.'
  # '-' waits for nothing, so this step starts as soon as the build begins.
  waitFor: ['-']
# NOTE: each push step below waits only for its own build step; the later steps declare no waitFor, so they run sequentially, each starting only after all earlier steps have finished
# ================================ DOCKER IMAGE UPLOADS ==============================================================
#
# Pushes the integration-tests image once its own build step has finished.
- id: 'integration_tests_push'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'push'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA'
  waitFor: ['integration_tests_build']
# Pushes the health-runner image once its own build step has finished.
- id: 'health_runner_push'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'push'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/health-runner:$SHORT_SHA'
  waitFor: ['health_runner_build']
# Pushes the gpu-healthcheck image once its own build step has finished.
- id: 'gpu_healthcheck_push'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'push'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/gpu-healthcheck:$SHORT_SHA'
  waitFor: ['gpu_healthcheck_build']
# Pushes the nccl-healthcheck image once its own build step has finished.
- id: 'nccl_healthcheck_push'
  name: 'gcr.io/cloud-builders/docker'
  args:
  - 'push'
  - 'us-docker.pkg.dev/$PROJECT_ID/health-check/nccl-healthcheck:$SHORT_SHA'
  waitFor: ['nccl_healthcheck_build']
# ================================== NCCL Healthcheck Tests ===========================================================
# Runs the repository's uninstall_from_cluster.sh with bash inside the helm
# builder image, passing the release name 'healthcheck-test' and two glob
# patterns. The script itself is not part of this file; presumably it removes
# any previous test deployment before the NCCL test starts.
# The helm image is a community image, built manually from instructions at
# https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
- id: 'nccl_healthcheck_uninstall'
  name: 'gcr.io/$PROJECT_ID/helm'
  entrypoint: 'bash'
  args:
  - './uninstall_from_cluster.sh'
  - 'healthcheck-test'
  - '*health-runner*'
  - '*healthcheck*'
  env:
  # TODO: read these from secrets
  # NOTE(review): 'us-east5-a' looks like a zone name rather than a region - confirm.
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'GCLOUD_PROJECT=supercomputer-testing'
# Resets the NCCL test labels on every node: sets
# aiinfra/nccl-healthcheck-test=true and removes the bandwidth, result and
# runtime-sec labels (a trailing '-' tells `kubectl label` to delete the key).
- id: 'nccl_healthcheck_clean'
  name: 'gcr.io/cloud-builders/kubectl'
  args:
  - 'label'
  - 'nodes'
  - '--all'
  - 'aiinfra/nccl-healthcheck-test=true'
  - 'aiinfra/nccl-healthcheck-bandwidth-'
  - 'aiinfra/nccl-healthcheck-result-'
  - 'aiinfra/nccl-healthcheck-runtime-sec-'
  env:
  # TODO: read these from secrets
  # NOTE(review): 'us-east5-a' looks like a zone name rather than a region - confirm.
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'
# Installs or upgrades the 'healthcheck-test' release from the
# deploy/helm/health_runner chart with the NCCL health check enabled and all
# image tags pinned to this commit's short SHA.
# The helm image is a community image, built manually from instructions at
# https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
- id: 'nccl_healthcheck_run'
  name: 'gcr.io/$PROJECT_ID/helm'
  args:
  - 'upgrade'
  - '-i'
  - 'healthcheck-test'
  - 'deploy/helm/health_runner'
  - '--set'
  - 'health_checks.nccl_healthcheck.image.tag=$SHORT_SHA'
  - '--set'
  - 'health_checks.nccl_healthcheck.env.HC_IMAGE_TAG=$SHORT_SHA'
  - '--set'
  - 'health_checks.nccl_healthcheck.run_check=true'
  # NOTE(review): the double quotes are part of the value handed to helm (no
  # shell strips them here) - confirm the chart expects that.
  - '--set'
  - 'health_checks.nccl_healthcheck.DRY_RUN="false"'
  - '--set'
  - 'health_checks.nccl_healthcheck.blast_mode.blast_mode_enabled=true'
  - '--set'
  - 'health_checks.nccl_healthcheck.env.HELM_INSTALL_FLAGS=-f /app/health_checks/nccl_healthcheck/a3plus.yaml --set health_check.image.tag=$SHORT_SHA'
  env:
  # TODO: read these from secrets
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'GCLOUD_PROJECT=supercomputer-testing'
# Runs the integration-tests image built earlier in this build. It is given
# three label=regex pairs via --check and a node-label selector via --filter;
# presumably it verifies that the matching nodes carry labels satisfying each
# regex (the tool's implementation is not part of this file).
- id: 'nccl_healthcheck_check'
  name: 'us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA'
  args:
  - '--check'
  - 'aiinfra/nccl-healthcheck-bandwidth=^\d*$'
  - 'aiinfra/nccl-healthcheck-result=^pass$'
  - 'aiinfra/nccl-healthcheck-runtime-sec=^\d*$'
  - '--filter'
  - 'node.kubernetes.io/instance-type=a3-megagpu'
  env:
  # TODO: read these from secrets
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'
# ================================== GPU Healthcheck Tests ===========================================================
# Runs the repository's uninstall_from_cluster.sh with bash inside the helm
# builder image, passing the release name 'healthcheck-test' and two glob
# patterns. The script itself is not part of this file; presumably it removes
# the previous test deployment before the GPU test starts.
# The helm image is a community image, built manually from instructions at
# https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
- id: 'gpu_healthcheck_uninstall'
  name: 'gcr.io/$PROJECT_ID/helm'
  entrypoint: 'bash'
  args:
  - './uninstall_from_cluster.sh'
  - 'healthcheck-test'
  - '*health-runner*'
  - '*healthcheck*'
  env:
  # TODO: read these from secrets
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'GCLOUD_PROJECT=supercomputer-testing'
# Resets the GPU test labels on every node: sets
# aiinfra/gpu-healthcheck-test=true and removes the result and runtime-sec
# labels (a trailing '-' tells `kubectl label` to delete the key).
- id: 'gpu_healthcheck_clean'
  name: 'gcr.io/cloud-builders/kubectl'
  args:
  - 'label'
  - 'nodes'
  - '--all'
  - 'aiinfra/gpu-healthcheck-test=true'
  - 'aiinfra/gpu-healthcheck-result-'
  - 'aiinfra/gpu-healthcheck-runtime-sec-'
  env:
  # TODO: read these from secrets
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'
# Installs or upgrades the 'healthcheck-test' release from the
# deploy/helm/health_runner chart with the GPU health check enabled and all
# image tags pinned to this commit's short SHA.
# The helm image is a community image, built manually from instructions at
# https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
- id: 'gpu_healthcheck_run'
  name: 'gcr.io/$PROJECT_ID/helm'
  args:
  - 'upgrade'
  - '-i'
  - 'healthcheck-test'
  - 'deploy/helm/health_runner'
  - '--set'
  - 'health_checks.gpu_healthcheck.image.tag=$SHORT_SHA'
  - '--set'
  - 'health_checks.gpu_healthcheck.env.HC_IMAGE_TAG=$SHORT_SHA'
  - '--set'
  - 'health_checks.gpu_healthcheck.run_check=true'
  # NOTE(review): the double quotes are part of the value handed to helm (no
  # shell strips them here) - confirm the chart expects that.
  - '--set'
  - 'health_checks.gpu_healthcheck.DRY_RUN="false"'
  - '--set'
  - 'health_checks.gpu_healthcheck.blast_mode.blast_mode_enabled=true'
  # The install flags must be set on gpu_healthcheck like every other override
  # in this step; they previously targeted nccl_healthcheck (copy-paste from
  # the NCCL step), so the GPU check never received them.
  - '--set'
  - 'health_checks.gpu_healthcheck.env.HELM_INSTALL_FLAGS=--set health_check.image.tag=$SHORT_SHA'
  env:
  # TODO: read these from secrets
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'GCLOUD_PROJECT=supercomputer-testing'
# Runs the integration-tests image built earlier in this build. It is given
# two label=regex pairs via --check and a node-label selector via --filter;
# presumably it verifies that the matching nodes carry labels satisfying each
# regex (the tool's implementation is not part of this file).
- id: 'gpu_healthcheck_check'
  name: 'us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA'
  args:
  - '--check'
  - 'aiinfra/gpu-healthcheck-result=^pass$'
  - 'aiinfra/gpu-healthcheck-runtime-sec=^\d*$'
  - '--filter'
  - 'node.kubernetes.io/instance-type=a3-megagpu'
  env:
  # TODO: read these from secrets
  - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
  - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
  - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'
# Images that Cloud Build pushes to the registry and records in the build
# results once all steps have succeeded.
images:
- 'us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA'
- 'us-docker.pkg.dev/$PROJECT_ID/health-check/health-runner:$SHORT_SHA'
- 'us-docker.pkg.dev/$PROJECT_ID/health-check/gpu-healthcheck:$SHORT_SHA'
- 'us-docker.pkg.dev/$PROJECT_ID/health-check/nccl-healthcheck:$SHORT_SHA'