ansible/roles/slurm/files/scripts/destroy_resource_policies.py (77 lines of code) (raw):
#!/usr/bin/env python3
# Copyright (C) SchedMD LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from suspend import batch_execute, truncate_iter, wait_for_operations
from util import lkp, compute, config_root_logger, parse_self_link
# The logger is named after this script's file name; the same name is reused
# for the log file when the script is run directly.
logger_name = Path(__file__).name
log = logging.getLogger(logger_name)
def delete_placement_groups(project, region, resourcePolicy):
    """Build a delete request for a single resource policy.

    The request is only constructed here, not executed; the caller decides
    how and when to run it.

    Args:
        project: Project id passed through to the delete call.
        region: Region passed through to the delete call.
        resourcePolicy: Resource policy identifier passed through to the
            delete call.

    Returns:
        The request object produced by ``compute.resourcePolicies().delete``.
    """
    return compute.resourcePolicies().delete(
        project=project,
        region=region,
        resourcePolicy=resourcePolicy,
    )
def delete_policies(policy_list):
    """Delete every resource policy named by the given selfLinks.

    One delete request is built per selfLink, the requests are handed to
    ``batch_execute``, failures are logged, and the successfully submitted
    operations are passed to ``wait_for_operations``.

    Args:
        policy_list: Sized collection of resource policy selfLink strings.
    """
    listing = "\n".join(policy_list)
    log.info(
        "Deleting {0} resource policies:\n{1}".format(len(policy_list), listing)
    )

    def _delete_request(self_link):
        # Build the delete request for one policy from its selfLink.
        parsed = parse_self_link(self_link)
        # NOTE(review): `resourcePolicie` (no trailing "s") looks deliberate --
        # like `project` and `region`, it is the singular form of the URL path
        # segment. Presumably parse_self_link derives its keys that way;
        # confirm against util.parse_self_link before "fixing" the spelling.
        return delete_placement_groups(
            project=parsed.project,
            region=parsed.region,
            resourcePolicy=parsed.resourcePolicie,
        )

    # Requests are keyed by selfLink so failures can be reported per policy.
    requests = {self_link: _delete_request(self_link) for self_link in policy_list}
    done, failed = batch_execute(requests)

    if failed:
        # Each failed value is unpacked as a pair whose second element is
        # reported as the error.
        messages = [f"{name}: {err}" for name, (_, err) in failed.items()]
        # Only a truncated sample (limit argument 5) goes into the log line.
        items_str = "\n".join(map(str, truncate_iter(messages, 5)))
        log.error(f"some policies failed to delete: {items_str}")

    wait_for_operations(done.values())
def main(args):
    """Find and delete the resource policies belonging to a Slurm cluster.

    Policies are selected by name: ``<cluster>-<partition>-*`` when a
    partition name is given, otherwise ``<cluster>-*``. Every page of the
    aggregated listing is consumed, and the collected selfLinks are passed
    to ``delete_policies``.

    Args:
        args: Parsed command-line namespace providing ``slurm_cluster_name``,
            ``partition_name`` and ``project_id``.

    Raises:
        SystemExit: With status 1 when neither ``args.project_id`` nor
            ``lkp.project`` yields a project id.
    """
    # NOTE: Resource policies cannot be labeled
    if args.partition_name:
        name_filter = f"name={args.slurm_cluster_name}-{args.partition_name}-*"
    else:
        name_filter = f"name={args.slurm_cluster_name}-*"
    log.debug(f'filter = "{name_filter}"')

    # An explicit --project_id wins; otherwise fall back to the looked-up one.
    p_id = args.project_id or lkp.project
    if not p_id:
        print("Error: Project id cannot be determined")
        # Raise SystemExit directly: the bare `exit` builtin is injected by the
        # `site` module and is not guaranteed to exist (e.g. under `python -S`).
        raise SystemExit(1)

    policies_api = compute.resourcePolicies()
    request = policies_api.aggregatedList(project=p_id, filter=name_filter)

    policy_list = []
    # aggregatedList is paginated; keep following pages until the client
    # reports there is no next request, so no policy is left behind.
    while request is not None:
        result = request.execute()
        # Tolerate a response without "items" and scopes without policies.
        for scope in result.get("items", {}).values():
            policy_list.extend(
                policy["selfLink"]
                for policy in scope.get("resourcePolicies") or ()
            )
        request = policies_api.aggregatedList_next(
            previous_request=request, previous_response=result
        )

    delete_policies(policy_list)
if __name__ == "__main__":
    # Command-line entry point: parse arguments, configure logging, run main().
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("slurm_cluster_name", help="Slurm cluster name filter")
    parser.add_argument(
        "--partition",
        "-p",
        dest="partition_name",
        help="Slurm partition name filter",
    )
    parser.add_argument(
        "--project_id",
        help="Google cloud project ID",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--debug",
        "-d",
        dest="debug",
        action="store_true",
        help="Enable debugging output",
    )
    args = parser.parse_args()

    # The log file sits next to this script, named after it with a ".log" suffix.
    logfile = Path(__file__).parent.joinpath(logger_name).with_suffix(".log")
    # --debug raises verbosity from INFO to DEBUG.
    log_level = "DEBUG" if args.debug else "INFO"
    config_root_logger(logger_name, level=log_level, logfile=logfile)

    main(args)