def _find_tpu_node_action()

in community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py [0:0]


def _find_tpu_node_action(nodename, state) -> NodeAction:
    """Decide which sync action a TPU-backed Slurm node requires.

    The Slurm-side ``state`` is compared with the TPU instance found for
    ``nodename``.  For TPUs made of several VMs, when no instance matches
    ``nodename`` directly, the instance of the single node in the same
    topology group that does have a cloud TPU is used instead.

    Args:
        nodename: Slurm node name to evaluate.
        state: Slurm node state exposing ``base`` and ``flags``, or ``None``
            when the node is not in Slurm.

    Returns:
        The ``NodeAction`` to apply; ``NodeActionUnchanged`` when none of the
        checked conditions match.
    """
    lkp = lookup()
    tpuobj = tpu.TPU.make(lkp.node_nodeset_name(nodename), lkp)
    inst = tpuobj.get_node(nodename)
    # If we do not find the node but it belongs to a TPU that has multiple
    # VMs, look for the master node of its group instead.
    if inst is None and tpuobj.vmcount > 1:
        # Get the Slurm nodelist of the nodes in the same TPU group as nodename
        nodelist = run(
            f"{lkp.scontrol} show topo {nodename}"
            + " | awk -F'=' '/Level=0/ { print $NF }'",
            shell=True,
        ).stdout
        l_nodelist = util.to_hostnames(nodelist)
        group_names = set(l_nodelist)
        # Names of all the TPUs that currently exist in the nodeset
        tpus_list = set(tpuobj.list_node_names())
        # In the intersection there must be only one node, which is the master
        tpus_int = list(group_names.intersection(tpus_list))
        if len(tpus_int) > 1:
            log.error(
                f"More than one cloud tpu node for tpu group {nodelist}, there should be only one that should be {l_nodelist[0]}, but we have found {tpus_int}"
            )
            return NodeActionUnknown(slurm_state=state, instance_state=None)
        if len(tpus_int) == 1:
            inst = tpuobj.get_node(tpus_int[0])
        # An empty intersection needs no handling: that is always the case
        # when the TPU group is not running.

    if inst is None:
        if state is None:
            # No instance was found and the node is not in Slurm either, so
            # there is nothing to reconcile.  The checks below dereference
            # `state`, so bail out here rather than raise AttributeError.
            return NodeActionUnknown(slurm_state=None, instance_state=None)
        if state.base == "DOWN" and "POWERED_DOWN" in state.flags:
            return NodeActionIdle()
        if "POWERING_DOWN" in state.flags:
            return NodeActionIdle()
        if "COMPLETING" in state.flags:
            return NodeActionDown(reason="Unbacked instance")
        # A node that is not DOWN and carries no power-transition flag has no
        # instance behind it.
        if state.base != "DOWN" and not (
            {"POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN"}
            & state.flags
        ):
            return NodeActionDown(reason="Unbacked instance")
        if lkp.is_static_node(nodename):
            return NodeActionPowerUp()
    elif (
        state is not None
        and "POWERED_DOWN" not in state.flags
        and "POWERING_DOWN" not in state.flags
        and inst.state == tpu.TPU.State.STOPPED
    ):
        # The instance is stopped while Slurm is not powering the node down.
        if tpuobj.preemptible:
            return NodeActionPrempt()
        if state.base != "DOWN":
            return NodeActionDown(reason="Instance terminated")
    elif (
        state is None or "POWERED_DOWN" in state.flags
    ) and inst.state == tpu.TPU.State.READY:
        # A READY instance exists for a node that is powered down in Slurm,
        # or that is not in Slurm at all.
        return NodeActionDelete()
    elif state is None:
        # If state is None here, the instance exists but it's not in Slurm.
        # Report `inst.state`, the attribute the branches above compare against.
        # NOTE(review): TPU node objects presumably expose no `status`
        # attribute -- confirm against the TPU API.
        return NodeActionUnknown(slurm_state=state, instance_state=inst.state)

    return NodeActionUnchanged()