in community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py [0:0]
def get_node_action(nodename: str) -> NodeAction:
    """Determine node/instance status that requires action.

    The Slurm-side state of ``nodename`` (``lookup().node_state``) is compared
    with the instance returned by ``lookup().instance`` and the first matching
    rule decides the action.

    Special node kinds are handled first, in this order:

    * future-reservation nodes: the result of ``get_fr_action`` is returned
      when it is truthy, otherwise evaluation continues with the rules below;
    * flex nodes: delegated to ``_find_flex_node_actions``;
    * dynamic nodes: delegated to ``_find_dynamic_node_status``;
    * TPU nodes: delegated to ``_find_tpu_node_action``.

    Args:
        nodename: Name of the node. It may be an FQDN; only the part before
            the first ``.`` is used to look up the instance.

    Returns:
        The ``NodeAction`` selected by the first matching rule, or
        ``NodeActionUnchanged()`` when no rule matches.
    """
    lkp = lookup()
    state = lkp.node_state(nodename)

    if lkp.node_is_fr(nodename):
        fr = lkp.future_reservation(lkp.node_nodeset(nodename))
        assert fr
        # A falsy result means "no reservation-specific action": fall through.
        if action := get_fr_action(fr, state):
            return action

    if lkp.is_flex_node(nodename):
        return _find_flex_node_actions(nodename, state, lkp)

    if lkp.node_is_dyn(nodename):
        return _find_dynamic_node_status()

    if lkp.node_is_tpu(nodename):
        return _find_tpu_node_action(nodename, state)

    # split below is workaround for VMs whose hostname is FQDN
    inst = lkp.instance(nodename.split(".")[0])
    # Power-related flags currently set on the node (empty when state is None).
    power_flags = frozenset(
        ("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN")
    ) & (state.flags if state is not None else set())

    if (state is None) and (inst is None):
        # Should never happen
        return NodeActionUnknown(None, None)

    if inst is None:
        # Slurm has a state for the node but no instance was found for it.
        # The order of the checks below matters: the first match wins, and a
        # node matching none of them ends up at the final "unchanged" return.
        assert state is not None  # to keep type-checker happy
        if "POWERING_UP" in state.flags:
            return NodeActionUnchanged()
        if state.base == "DOWN" and "POWERED_DOWN" in state.flags:
            return NodeActionIdle()
        if "POWERING_DOWN" in state.flags:
            return NodeActionIdle()
        if "COMPLETING" in state.flags:
            return NodeActionDown(reason="Unbacked instance")
        if state.base != "DOWN" and not power_flags:
            return NodeActionDown(reason="Unbacked instance")
        if state.base == "DOWN" and not power_flags:
            return NodeActionPowerDown()
        if "POWERED_DOWN" in state.flags and lkp.is_static_node(nodename):
            return NodeActionPowerUp()
    elif (
        state is not None
        and "POWERED_DOWN" not in state.flags
        and "POWERING_DOWN" not in state.flags
        and inst.status == "TERMINATED"
    ):
        # The instance is TERMINATED while the node is neither powered down
        # nor powering down.
        if inst.scheduling.preemptible:
            return NodeActionPrempt()
        if state.base != "DOWN":
            return NodeActionDown(reason="Instance terminated")
    elif (state is None or "POWERED_DOWN" in state.flags) and inst.status == "RUNNING":
        # A RUNNING instance whose node is unknown to Slurm or POWERED_DOWN.
        log.info("%s is potential orphan node", nodename)
        threshold = timedelta(seconds=90)
        age = util.now() - inst.creation_timestamp
        log.info("%s state: %s, age: %s", nodename, state, age)
        if age < threshold:
            # Instances younger than the threshold are left alone.
            # total_seconds() is used instead of .seconds: .seconds is only
            # the seconds component of the normalized timedelta, so a negative
            # age (creation timestamp ahead of util.now()) would otherwise be
            # reported as roughly 86399s.
            log.info(
                "%s not marked as orphan, it started less than %ds ago (%ds)",
                nodename,
                int(threshold.total_seconds()),
                int(age.total_seconds()),
            )
            return NodeActionUnchanged()
        return NodeActionDelete()
    elif state is None:
        # if state is None here, the instance exists but it's not in Slurm
        return NodeActionUnknown(slurm_state=state, instance_state=inst.status)

    return NodeActionUnchanged()