def clean_up_idle_nodes()

in xlml/utils/tpu.py [0:0]


def clean_up_idle_nodes(project_name: str, zones: Iterable[str]) -> None:
  """Delete TPU nodes that have outlived their time-to-live (TTL) label.

  For every zone, lists the TPU nodes under the project and deletes each node
  whose age (current UTC time minus the node's `create_time`) exceeds the
  number of seconds stored in its `TTL` label. Nodes without that label, with
  a TTL of 0, or with a label value that is not an integer are left untouched.

  Args:
    project_name: The project of resources.
    zones: Available zones to clean up for the project. Each item may be a
      plain zone name or an enum-like object exposing the name as `.value`.
  """
  creds, _ = google.auth.default()
  client = tpu_api.TpuClient(credentials=creds)

  logging.info(f'Cleaning up nodes in project {project_name}.')
  for zone in zones:
    # The signature advertises `str`, but callers may pass enum members; use
    # `.value` when present and fall back to the item itself otherwise.
    zone_name = getattr(zone, 'value', zone)
    logging.info(f'Checking in zone {zone_name}.')
    parent = f'projects/{project_name}/locations/{zone_name}'
    request = tpu_api.types.ListNodesRequest(parent=parent)

    for node in client.list_nodes(request):
      if TTL not in node.labels:
        continue
      try:
        ttl = int(node.labels[TTL])
      except ValueError:
        # A single malformed label must not abort the sweep of the remaining
        # nodes and zones.
        logging.warning(
            f'Skipping node {node.name}: TTL label'
            f' {node.labels[TTL]!r} is not an integer.'
        )
        continue
      if not ttl:
        # A TTL of 0 is treated the same as having no TTL at all.
        continue

      create_time = node.create_time
      current_time = datetime.datetime.now(datetime.timezone.utc)
      logging.info(
          f'Checking node {node.name}: create_time is {create_time},'
          f' and current_time is {current_time}.'
      )
      # NOTE(review): the subtraction requires create_time to be
      # timezone-aware, like current_time -- confirm against the client types.
      active_time = current_time - create_time
      # Seconds by which the node has outlived its TTL; <= 0 means it is
      # still within its allowance.
      delta = active_time.total_seconds() - ttl
      if delta <= 0:
        continue

      datetime_delta = str(datetime.timedelta(seconds=delta))
      logging.info(
          f'Deleting node {node.name} due to exceeding its time to'
          f' live (TTL) by {datetime_delta}.'
      )
      # The return value is discarded, so completion of the deletion is not
      # checked here.
      client.delete_node(name=node.name)