in azure-slurm/slurmcc/cli.py [0:0]
def _shutdown(self, config: Dict, node_list: List[str], node_mgr: NodeManager) -> None:
    # Map node name -> CycleCloud node record so KeepAlive can be checked per node.
    by_name = hpcutil.partition_single(node_mgr.get_nodes(), lambda node: node.name)

    node_list_filtered = []
    to_keep_alive = []
    for node_name in node_list:
        if node_name in by_name:
            node = by_name[node_name]
            if node.keep_alive:
                to_keep_alive.append(node_name)
                logging.warning(f"{node_name} has KeepAlive=true in CycleCloud. Cannot terminate.")
            else:
                node_list_filtered.append(node_name)
        else:
            logging.info(f"{node_name} does not exist. Skipping.")

    if to_keep_alive:
        # Mark these nodes down so Slurm does not repeatedly cycle them through
        # resume/resume_fail.
        logging.warning(f"Nodes {to_keep_alive} have KeepAlive=true in CycleCloud. Cannot terminate." +
                        " Setting state to down reason=keep_alive")
        to_keep_alive_str = slutil.to_hostlist(to_keep_alive)
        scontrol(["update", f"nodename={to_keep_alive_str}", "state=down", "reason=keep_alive"])

    if not node_list_filtered:
        logging.warning(f"No nodes out of node list {node_list} could be shut down." +
                        " Post-processing the nodes only.")
    else:
        result = _safe_shutdown(node_list_filtered, node_mgr)
        if not result:
            raise AzureSlurmError(f"Failed to shut down {node_list_filtered} - {result.message}")

    if slutil.is_autoscale_enabled():
        # undo internal DNS
        for node_name in node_list:
            _undo_internal_dns(node_name)
    else:
        # set states back to future and set NodeAddr/NodeHostName to node name
        _update_future_states(self._get_node_manager(config, force=True), node_list)
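
# Illustrative only (not part of cli.py): a minimal, self-contained sketch of the
# keep-alive partitioning performed above, using stand-in types instead of the real
# CycleCloud NodeManager. All names here (FakeNode, split_by_keep_alive) are
# hypothetical and exist purely to show how nodes are split into a "shut down"
# bucket and a "pinned by KeepAlive, mark down instead" bucket.
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass
class FakeNode:
    name: str
    keep_alive: bool


def split_by_keep_alive(
    node_list: List[str], by_name: Dict[str, FakeNode]
) -> Tuple[List[str], List[str]]:
    """Return (nodes safe to shut down, nodes pinned by KeepAlive)."""
    to_shutdown: List[str] = []
    to_keep_alive: List[str] = []
    for node_name in node_list:
        node = by_name.get(node_name)
        if node is None:
            # Unknown to CycleCloud -> skipped, mirroring _shutdown's logging.info branch.
            continue
        (to_keep_alive if node.keep_alive else to_shutdown).append(node_name)
    return to_shutdown, to_keep_alive


if __name__ == "__main__":
    nodes = {n.name: n for n in [FakeNode("hpc-1", False), FakeNode("hpc-2", True)]}
    print(split_by_keep_alive(["hpc-1", "hpc-2", "hpc-3"], nodes))
    # -> (['hpc-1'], ['hpc-2'])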