in ansible/roles/slurm/files/scripts/slurmsync.py [0:0]
def find_node_status(nodename):
    """Determine node/instance status that requires action.

    Looks up the Slurm record for the node (``lkp.slurm_node``) and the
    matching instance (``lkp.instance``) and maps the combination of the two
    onto a ``NodeStatus`` member.

    Args:
        nodename: Name of the node. It may be an FQDN; only the part before
            the first "." is used for the instance lookup.

    Returns:
        A ``NodeStatus`` member. ``NodeStatus.unchanged`` is returned when no
        rule below matches, and also when neither a Slurm record nor an
        instance is found for the node.
    """
    state = lkp.slurm_node(nodename)

    # Dynamic and TPU nodes are classified by their own helpers.
    if lkp.node_is_dyn(nodename):
        return _find_dynamic_node_status()

    if lkp.node_is_tpu(nodename):
        return _find_tpu_node_status(nodename, state)

    # split below is workaround for VMs whose hostname is FQDN
    inst = lkp.instance(nodename.split(".")[0])

    if inst is None and state is None:
        # Neither side knows this node, so there is nothing to compare.
        # Without this guard the branch below would dereference
        # ``state.flags`` on None and raise AttributeError.
        log.warning(
            "%s has neither a Slurm node state nor an instance", nodename
        )
        return NodeStatus.unchanged

    # Subset of the power-related flags currently set on the Slurm node
    # (empty when the node has no Slurm record).
    power_flags = frozenset(
        ("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN")
    ) & (state.flags if state is not None else set())

    if inst is None:
        # No instance behind the node; ``state`` is guaranteed non-None here
        # by the guard above. The checks are order-dependent.
        if "POWERING_UP" in state.flags:
            return NodeStatus.unchanged
        if state.base == "DOWN" and "POWERED_DOWN" in state.flags:
            return NodeStatus.restore
        if "POWERING_DOWN" in state.flags:
            return NodeStatus.restore
        if "COMPLETING" in state.flags:
            return NodeStatus.unbacked
        if state.base != "DOWN" and not power_flags:
            return NodeStatus.unbacked
        if state.base == "DOWN" and not power_flags and allow_power_down(state):
            return NodeStatus.power_down
        if "POWERED_DOWN" in state.flags and lkp.is_static_node(nodename):
            return NodeStatus.resume
    elif (
        state is not None
        and "POWERED_DOWN" not in state.flags
        and "POWERING_DOWN" not in state.flags
        and inst.status == "TERMINATED"
    ):
        # Instance is TERMINATED while Slurm does not consider the node
        # powered down / powering down.
        if inst.scheduling.preemptible:
            return NodeStatus.preempted
        if not state.base.startswith("DOWN"):
            return NodeStatus.terminated
    elif (state is None or "POWERED_DOWN" in state.flags) and inst.status == "RUNNING":
        # Instance is RUNNING but Slurm has no record of the node or marks it
        # POWERED_DOWN. Instances younger than the threshold are left alone.
        log.info("%s is potential orphan node", nodename)
        age_threshold_seconds = 90
        inst_seconds_old = _seconds_since_timestamp(inst.creationTimestamp)
        log.info("%s state: %s, age: %0.1fs", nodename, state, inst_seconds_old)
        if inst_seconds_old < age_threshold_seconds:
            log.info(
                "%s not marked as orphan, it started less than %ds ago (%0.1fs)",
                nodename,
                age_threshold_seconds,
                inst_seconds_old,
            )
            return NodeStatus.unchanged
        return NodeStatus.orphan
    elif state is None:
        # if state is None here, the instance exists but it's not in Slurm
        return NodeStatus.unknown

    return NodeStatus.unchanged