in xlml/utils/tpu.py [0:0]
def clean_up_idle_nodes(project_name: str, zones: Iterable[str]) -> None:
  """Delete TPU nodes that have outlived their time-to-live (TTL) label.

  Lists the TPU nodes in each given zone of the project. A node carrying the
  `TTL` label with a non-zero integer value (interpreted as seconds) is
  deleted once the time elapsed since its creation exceeds that value. Nodes
  without the label, with a TTL of zero, or with a TTL label that is not an
  integer are left alone.

  Args:
    project_name: The project of resources.
    zones: Available zones to clean up for the project. Each item may be a
      plain zone name or an enum-like object exposing the name as `.value`.
  """
  creds, _ = google.auth.default()
  client = tpu_api.TpuClient(credentials=creds)
  logging.info(f'Cleaning up nodes in project {project_name}.')

  for zone in zones:
    # Accept enum-like members (read `.value`) as well as the plain strings
    # that the signature advertises.
    zone_name = getattr(zone, 'value', zone)
    logging.info(f'Checking in zone {zone_name}.')
    parent = f'projects/{project_name}/locations/{zone_name}'
    request = tpu_api.types.ListNodesRequest(parent=parent)

    for node in client.list_nodes(request):
      if TTL not in node.labels:
        continue

      raw_ttl = node.labels[TTL]
      try:
        ttl = int(raw_ttl)
      except ValueError:
        # One badly labelled node must not abort the sweep for the others.
        logging.warning(
            f'Skipping node {node.name}: TTL label {raw_ttl!r} is not an'
            ' integer.'
        )
        continue

      # A TTL of 0 is treated the same as having no TTL at all.
      if not ttl:
        continue

      create_time = node.create_time
      current_time = datetime.datetime.now(datetime.timezone.utc)
      logging.info(
          (
              f'Checking node {node.name}: create_time is {create_time},'
              f' and current_time is {current_time}.'
          )
      )

      # Seconds the node has been alive beyond its allowance; positive
      # means the TTL has been exceeded.
      active_time = current_time - create_time
      delta = active_time.total_seconds() - ttl
      if delta <= 0:
        continue

      datetime_delta = str(datetime.timedelta(seconds=delta))
      logging.info(
          (
              f'Deleting node {node.name} due to exceeding its time to'
              f' live (TTL) by {datetime_delta}.'
          )
      )
      client.delete_node(name=node.name)