cloudbuild.yaml (126 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# yamllint disable

# Cloud Build pipeline: builds and pushes the health-check images, then runs
# the NCCL and GPU health checks against a test cluster and verifies the
# resulting node labels.
steps:
  # ======================= CANCEL CURRENT BUILD IF ANY ONGOING =======================
  # Counts the ongoing builds whose TRIGGER_NAME substitution matches this
  # build's trigger. If more than one is ongoing, this build cancels itself.
  - name: 'gcr.io/cloud-builders/gcloud'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        build_count=$(gcloud builds list --ongoing --format='value(id)' --filter="substitutions.TRIGGER_NAME=$TRIGGER_NAME" | wc -l)
        if [ "$build_count" -gt 1 ]; then
          echo "Cancelling current build as build count > 1: $build_count"
          gcloud builds cancel $BUILD_ID
        fi

  # ============================= DOCKER CONTAINER BUILDS =============================
  # Every build step uses waitFor: ['-'], so it starts as soon as the build
  # begins instead of waiting for earlier steps; the four image builds
  # therefore run in parallel.

  # TODO: we don't necessarily need to build this every time, it will barely ever change
  - name: 'gcr.io/cloud-builders/docker'
    id: 'integration_tests_build'
    args:
      - 'build'
      - '-t'
      - "us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA"
      - "-f"
      - "docker/integration_tests.Dockerfile"
      - '.'
    waitFor: ['-']

  - name: 'gcr.io/cloud-builders/docker'
    id: 'health_runner_build'
    args:
      - 'build'
      - '-t'
      - "us-docker.pkg.dev/$PROJECT_ID/health-check/health-runner:$SHORT_SHA"
      - "-f"
      - "docker/health_runner.Dockerfile"
      - '.'
    waitFor: ['-']

  - name: 'gcr.io/cloud-builders/docker'
    id: 'gpu_healthcheck_build'
    args:
      - 'build'
      - '-t'
      - "us-docker.pkg.dev/$PROJECT_ID/health-check/gpu-healthcheck:$SHORT_SHA"
      - "-f"
      - "docker/gpu_healthcheck.Dockerfile"
      - '.'
    waitFor: ['-']

  - name: 'gcr.io/cloud-builders/docker'
    id: 'nccl_healthcheck_build'
    args:
      - 'build'
      - '-t'
      - "us-docker.pkg.dev/$PROJECT_ID/health-check/nccl-healthcheck:$SHORT_SHA"
      - "-f"
      - "docker/nccl_healthcheck.Dockerfile"
      - '.'
    waitFor: ['-']

  # ============================== DOCKER IMAGE UPLOADS ===============================
  # Each push waits only on the build of its own image.
  # NOTE: the test steps further down declare no waitFor, so they run
  # sequentially and each one starts only after every step above it (all
  # builds and pushes included) has finished.
  - name: 'gcr.io/cloud-builders/docker'
    id: 'integration_tests_push'
    args: ['push', "us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA"]
    waitFor: ['integration_tests_build']

  - name: 'gcr.io/cloud-builders/docker'
    id: 'health_runner_push'
    args: ['push', "us-docker.pkg.dev/$PROJECT_ID/health-check/health-runner:$SHORT_SHA"]
    waitFor: ['health_runner_build']

  - name: 'gcr.io/cloud-builders/docker'
    id: 'gpu_healthcheck_push'
    args: ['push', "us-docker.pkg.dev/$PROJECT_ID/health-check/gpu-healthcheck:$SHORT_SHA"]
    waitFor: ['gpu_healthcheck_build']

  - name: 'gcr.io/cloud-builders/docker'
    id: 'nccl_healthcheck_push'
    args: ['push', "us-docker.pkg.dev/$PROJECT_ID/health-check/nccl-healthcheck:$SHORT_SHA"]
    waitFor: ['nccl_healthcheck_build']

  # ============================= NCCL Healthcheck Tests ==============================
  # Runs ./uninstall_from_cluster.sh for the 'healthcheck-test' release
  # (presumably clears leftovers of a previous run -- confirm against the script).
  # community image, built manually from instructions at
  # https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
  - name: 'gcr.io/$PROJECT_ID/helm'
    id: 'nccl_healthcheck_uninstall'
    entrypoint: "bash"
    args: ["./uninstall_from_cluster.sh", "healthcheck-test", "*health-runner*", "*healthcheck*"]
    env:
      # NOTE(review): 'us-east5-a' looks like a zone name rather than a region -- confirm.
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'GCLOUD_PROJECT=supercomputer-testing'

  # Labels every node with aiinfra/nccl-healthcheck-test=true and removes the
  # result labels of earlier runs (a trailing '-' deletes a label).
  - name: 'gcr.io/cloud-builders/kubectl'
    id: 'nccl_healthcheck_clean'
    args:
      - 'label'
      - 'nodes'
      - '--all'
      - 'aiinfra/nccl-healthcheck-test=true'
      - 'aiinfra/nccl-healthcheck-bandwidth-'
      - 'aiinfra/nccl-healthcheck-result-'
      - 'aiinfra/nccl-healthcheck-runtime-sec-'
    env:
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'

  # Installs (or upgrades) the 'healthcheck-test' release of the health_runner
  # chart with the NCCL health check enabled and pinned to this commit's images.
  # community image, built manually from instructions at
  # https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
  - name: 'gcr.io/$PROJECT_ID/helm'
    id: 'nccl_healthcheck_run'
    args:
      - 'upgrade'
      - '-i'
      - 'healthcheck-test'
      - 'deploy/helm/health_runner'
      - '--set'
      - 'health_checks.nccl_healthcheck.image.tag=$SHORT_SHA'
      - '--set'
      - 'health_checks.nccl_healthcheck.env.HC_IMAGE_TAG=$SHORT_SHA'
      - '--set'
      - 'health_checks.nccl_healthcheck.run_check=true'
      - '--set'
      - 'health_checks.nccl_healthcheck.DRY_RUN="false"'
      - '--set'
      - 'health_checks.nccl_healthcheck.blast_mode.blast_mode_enabled=true'
      - '--set'
      - 'health_checks.nccl_healthcheck.env.HELM_INSTALL_FLAGS=-f /app/health_checks/nccl_healthcheck/a3plus.yaml --set health_check.image.tag=$SHORT_SHA'
    env:
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'GCLOUD_PROJECT=supercomputer-testing'

  # Runs the integration-tests image built above with '--check' patterns for the
  # NCCL result labels, filtered to a3-megagpu nodes (presumably fails the build
  # when a label does not match -- confirm against the integration-tests image).
  - name: "us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA"
    id: 'nccl_healthcheck_check'
    args:
      - '--check'
      - 'aiinfra/nccl-healthcheck-bandwidth=^\d*$'
      - 'aiinfra/nccl-healthcheck-result=^pass$'
      - 'aiinfra/nccl-healthcheck-runtime-sec=^\d*$'
      - '--filter'
      - 'node.kubernetes.io/instance-type=a3-megagpu'
    env:
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'

  # ============================== GPU Healthcheck Tests ==============================
  # Runs ./uninstall_from_cluster.sh for the 'healthcheck-test' release
  # (presumably clears what the NCCL test left behind -- confirm against the script).
  # community image, built manually from instructions at
  # https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
  - name: 'gcr.io/$PROJECT_ID/helm'
    id: 'gpu_healthcheck_uninstall'
    entrypoint: "bash"
    args: ["./uninstall_from_cluster.sh", "healthcheck-test", "*health-runner*", "*healthcheck*"]
    env:
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'GCLOUD_PROJECT=supercomputer-testing'

  # Labels every node with aiinfra/gpu-healthcheck-test=true and removes the
  # result labels of earlier runs (a trailing '-' deletes a label).
  - name: 'gcr.io/cloud-builders/kubectl'
    id: 'gpu_healthcheck_clean'
    args:
      - 'label'
      - 'nodes'
      - '--all'
      - 'aiinfra/gpu-healthcheck-test=true'
      - 'aiinfra/gpu-healthcheck-result-'
      - 'aiinfra/gpu-healthcheck-runtime-sec-'
    env:
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'

  # Installs (or upgrades) the 'healthcheck-test' release of the health_runner
  # chart with the GPU health check enabled and pinned to this commit's images.
  # community image, built manually from instructions at
  # https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master
  - name: 'gcr.io/$PROJECT_ID/helm'
    id: 'gpu_healthcheck_run'
    args:
      - 'upgrade'
      - '-i'
      - 'healthcheck-test'
      - 'deploy/helm/health_runner'
      - '--set'
      - 'health_checks.gpu_healthcheck.image.tag=$SHORT_SHA'
      - '--set'
      - 'health_checks.gpu_healthcheck.env.HC_IMAGE_TAG=$SHORT_SHA'
      - '--set'
      - 'health_checks.gpu_healthcheck.run_check=true'
      - '--set'
      - 'health_checks.gpu_healthcheck.DRY_RUN="false"'
      - '--set'
      - 'health_checks.gpu_healthcheck.blast_mode.blast_mode_enabled=true'
      # HELM_INSTALL_FLAGS is set on gpu_healthcheck, the check this step
      # enables, so the image tag override applies to that check.
      - '--set'
      - 'health_checks.gpu_healthcheck.env.HELM_INSTALL_FLAGS=--set health_check.image.tag=$SHORT_SHA'
    env:
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'GCLOUD_PROJECT=supercomputer-testing'

  # Runs the integration-tests image built above with '--check' patterns for the
  # GPU result labels, filtered to a3-megagpu nodes (presumably fails the build
  # when a label does not match -- confirm against the integration-tests image).
  - name: "us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA"
    id: 'gpu_healthcheck_check'
    args:
      - '--check'
      - 'aiinfra/gpu-healthcheck-result=^pass$'
      - 'aiinfra/gpu-healthcheck-runtime-sec=^\d*$'
      - '--filter'
      - 'node.kubernetes.io/instance-type=a3-megagpu'
    env:
      - 'CLOUDSDK_COMPUTE_REGION=us-east5-a'
      - 'CLOUDSDK_CONTAINER_CLUSTER=a3plus-chs'
      # TODO: read these from secrets
      - 'CLOUDSDK_CORE_PROJECT=supercomputer-testing'

# Images produced by this build; Cloud Build pushes them to the registry and
# records them in the build results once all steps succeed.
images:
  - "us-docker.pkg.dev/$PROJECT_ID/health-check/integration-tests:$SHORT_SHA"
  - "us-docker.pkg.dev/$PROJECT_ID/health-check/health-runner:$SHORT_SHA"
  - "us-docker.pkg.dev/$PROJECT_ID/health-check/gpu-healthcheck:$SHORT_SHA"
  - "us-docker.pkg.dev/$PROJECT_ID/health-check/nccl-healthcheck:$SHORT_SHA"