in scripts/slurmsync.py [0:0]
def do_node_update(status, nodes):
    """Apply the update action that corresponds to a node status.

    Each status is mapped to a handler that either issues an
    ``scontrol update`` command for the whole hostlist or starts/deletes
    the instances backing the nodes.

    Args:
        status: The NodeStatus shared by every node in ``nodes``.
        nodes: Collection of nodes to update; must support ``len()`` and
            iteration.

    Returns:
        None. Nothing is done when ``status`` is ``NodeStatus.unchanged``
        or when ``nodes`` is empty.
    """
    if status == NodeStatus.unchanged:
        return
    count = len(nodes)
    if count == 0:
        # With no nodes there is no hostlist to hand to scontrol and no
        # "first" node to report on in nodes_unknown().
        return
    hostlist = util.to_hostlist(nodes)

    def nodes_down():
        """down nodes"""
        log.info(
            f"{count} nodes set down due to node status '{status.name}' ({hostlist})"
        )
        run(
            f"{lkp.scontrol} update nodename={hostlist} state=down reason='Instance stopped/deleted'"
        )

    def nodes_restart():
        """start instances for nodes"""
        log.info(f"{count} instances restarted ({hostlist})")
        start_instances(nodes)

    def nodes_preempted():
        """down the nodes, then start their instances again"""
        nodes_down()
        nodes_restart()

    def nodes_idle():
        """idle nodes"""
        log.info(f"{count} nodes to idle ({hostlist})")
        run(f"{lkp.scontrol} update nodename={hostlist} state=resume")

    def nodes_resume():
        """resume nodes via scontrol"""
        log.info(f"{count} instances to resume ({hostlist})")
        run(f"{lkp.scontrol} update nodename={hostlist} state=power_up")

    def nodes_delete():
        """delete instances for nodes"""
        log.info(f"{count} instances to delete ({hostlist})")
        delete_instances(nodes)

    def nodes_power_down():
        """power_down node in slurm"""
        log.info(f"{count} instances to power down ({hostlist})")
        run(f"{lkp.scontrol} update nodename={hostlist} state=power_down")

    def nodes_unknown():
        """Error status, nodes shouldn't get in this status"""
        log.error(f"{count} nodes have unexpected status: ({hostlist})")
        # Only the first node is inspected, as a representative sample.
        first = next(iter(nodes))
        state = lkp.slurm_node(first)
        state = "{}+{}".format(state.base, "+".join(state.flags)) if state else "None"
        inst = lkp.instance(first)
        # Guard the instance lookup the same way as the slurm state so that
        # this error-reporting path cannot itself fail on a missing instance.
        inst_status = inst.status if inst else "None"
        log.error(f"{first} state: {state}, instance status:{inst_status}")

    handlers = {
        NodeStatus.orphan: nodes_delete,
        NodeStatus.power_down: nodes_power_down,
        NodeStatus.preempted: nodes_preempted,
        NodeStatus.restore: nodes_idle,
        NodeStatus.resume: nodes_resume,
        NodeStatus.terminated: nodes_down,
        NodeStatus.unbacked: nodes_down,
        NodeStatus.unchanged: lambda: None,
        NodeStatus.unknown: nodes_unknown,
    }
    update = handlers.get(status)
    if update is None:
        # A status without a handler would otherwise end in calling None;
        # report it through the same path as NodeStatus.unknown instead.
        log.error(f"no update handler for node status {status!r}")
        update = nodes_unknown
    update()