in community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py [0:0]
def _find_tpu_node_action(nodename, state) -> NodeAction:
    """Pick the reconciliation action for a TPU-backed Slurm node.

    The Slurm-side ``state`` of ``nodename`` is compared with the TPU node
    object returned by the nodeset's TPU handle. For TPUs made of several
    VMs, a node that has no TPU object of its own is resolved to the single
    existing TPU node of its topology group before the comparison.

    Args:
        nodename: Slurm node name to evaluate.
        state: Slurm node state exposing ``base`` and ``flags``, or ``None``
            when the node is not known to Slurm.

    Returns:
        The ``NodeAction`` to apply; ``NodeActionUnchanged()`` when no rule
        below matches.
    """
    lkp = lookup()
    tpuobj = tpu.TPU.make(lkp.node_nodeset_name(nodename), lkp)
    inst = tpuobj.get_node(nodename)

    # If we do not find the node but it belongs to a TPU that has multiple
    # VMs, look for the master node of its group instead.
    if inst is None and tpuobj.vmcount > 1:
        # Slurm nodelist of the nodes in the same TPU group as nodename,
        # taken from the Level=0 line of the topology output.
        nodelist = run(
            f"{lkp.scontrol} show topo {nodename}"
            + " | awk -F'=' '/Level=0/ { print $NF }'",
            shell=True,
        ).stdout
        l_nodelist = util.to_hostnames(nodelist)
        group_names = set(l_nodelist)
        # Names of all the TPU nodes that currently exist in the nodeset.
        tpus_list = set(tpuobj.list_node_names())
        # In the intersection there must be only one node: the master.
        tpus_int = list(group_names.intersection(tpus_list))
        if len(tpus_int) > 1:
            log.error(
                f"More than one cloud tpu node for tpu group {nodelist}, there should be only one that should be {l_nodelist[0]}, but we have found {tpus_int}"
            )
            return NodeActionUnknown(slurm_state=state, instance_state=None)
        if len(tpus_int) == 1:
            inst = tpuobj.get_node(tpus_int[0])
        # len(tpus_int) == 0 needs no handling: that is always the case when
        # the TPU group is not running, and inst simply stays None.

    if inst is None:
        if state is None:
            # Neither a Slurm record nor a cloud TPU node exists for this
            # name, so there is nothing to reconcile. Without this guard the
            # attribute reads below would raise AttributeError.
            return NodeActionUnchanged()
        if state.base == "DOWN" and "POWERED_DOWN" in state.flags:
            return NodeActionIdle()
        if "POWERING_DOWN" in state.flags:
            return NodeActionIdle()
        if "COMPLETING" in state.flags:
            return NodeActionDown(reason="Unbacked instance")
        # Slurm considers the node usable and no power transition is
        # flagged, yet no instance backs it.
        if state.base != "DOWN" and not (
            {"POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN"}
            & state.flags
        ):
            return NodeActionDown(reason="Unbacked instance")
        if lkp.is_static_node(nodename):
            return NodeActionPowerUp()
    elif (
        state is not None
        and "POWERED_DOWN" not in state.flags
        and "POWERING_DOWN" not in state.flags
        and inst.state == tpu.TPU.State.STOPPED
    ):
        # The TPU node is stopped while Slurm does not flag the node as
        # powered down or powering down.
        if tpuobj.preemptible:
            return NodeActionPrempt()
        if state.base != "DOWN":
            return NodeActionDown(reason="Instance terminated")
    elif (
        state is None or "POWERED_DOWN" in state.flags
    ) and inst.state == tpu.TPU.State.READY:
        # A ready TPU node that Slurm either does not know about or flags
        # as powered down.
        return NodeActionDelete()
    elif state is None:
        # If state is None here, the instance exists but it's not in Slurm.
        # NOTE(review): this reads inst.status while every other branch reads
        # inst.state on the same object -- confirm which attribute the TPU
        # node object actually exposes.
        return NodeActionUnknown(slurm_state=state, instance_state=inst.status)

    return NodeActionUnchanged()