dags/infra/clean_up.py (35 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A DAG to clean up idle accelerator resources."""
import datetime
from airflow import models
from dags import composer_env
from dags.common.vm_resource import Project, Zone
from xlml.utils import tpu
# Run every 10min
SCHEDULED_TIME = "*/10 * * * *" if composer_env.is_prod_env() else None
with models.DAG(
dag_id="clean_up",
schedule=SCHEDULED_TIME,
tags=["solutions_team", "clean_up"],
start_date=datetime.datetime(2024, 2, 22),
catchup=False,
) as dag:
# List tpu zones for projects to avoid permission issue
tpu_zones = [
Zone.US_CENTRAL1_A,
Zone.US_CENTRAL1_B,
Zone.US_CENTRAL2_B,
Zone.US_CENTRAL1_C,
Zone.US_EAST1_D,
]
v5_tpu_zones = [
Zone.US_EAST1_C,
Zone.US_EAST5_A,
]
# TPUs
node_cloud_ml_auto_solutions = tpu.clean_up_idle_nodes.override(
task_id="cleanup_nodes_cloud-ml-auto-solutions"
)(Project.CLOUD_ML_AUTO_SOLUTIONS.value, tpu_zones)
node_tpu_prod_env_automated = tpu.clean_up_idle_nodes.override(
task_id="cleanup_nodes_tpu-prod-env-automated"
)(Project.TPU_PROD_ENV_AUTOMATED.value, v5_tpu_zones)
# QRs
# clean up in tpu-prod-env-automated has been handled in script below:
# https://source.corp.google.com/piper///depot/google3/cloud/tpu/tools/multipod/qr_tool/qr_delete.sh;l=32
# No need to handle here to avoid `maximum number of DeleteNode requests per minute` error.
qr_cloud_ml_auto_solutions = tpu.clean_up_idle_queued_resources.override(
task_id="cleanup_qr_cloud-ml-auto-solutions"
)(Project.CLOUD_ML_AUTO_SOLUTIONS.value, tpu_zones)
# Overview dependency
node_cloud_ml_auto_solutions >> qr_cloud_ml_auto_solutions
node_tpu_prod_env_automated