in community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tpu.py [0:0]
def delete_node(self, nodename):
    """Delete the TPU node *nodename* under this object's parent resource.

    Returns True when the node is gone: either the delete operation
    completed and a follow-up lookup no longer finds the node, or the API
    reported NotFound (the node is effectively already deleted). Returns
    False when the delete call finished without a usable response.
    """
    req = tpu.DeleteNodeRequest(name=f"{self._parent}/nodes/{nodename}")
    try:
        op_result = self._client.delete_node(request=req).result()
        if not op_result:
            return False
        # Confirm removal: success means the node can no longer be fetched.
        # NOTE: kept inside the try so a NotFound raised here is also
        # treated as "already deleted".
        return self.get_node(nodename=nodename) is None
    except gExceptions.NotFound:
        if self.vmcount == 1:
            # A single-VM TPU always exists as a real node, so failing to
            # find it is a genuine error.
            log.error(f"Tpu single node {nodename} not found")
        else:
            # Multi-VM TPUs only materialize their first VM (the "master")
            # as a real TPU node; the other hostnames are expected
            # "phantom" nodes. Ask scontrol for this node's hostname: a
            # trailing "-0" marks the master, whose absence is an error;
            # anything else is just a phantom being cleaned up.
            scontrol_out = util.run(
                f"{self.lkp.scontrol} --yaml show node {nodename}"
            ).stdout.rstrip()
            hostname = yaml.safe_load(scontrol_out)["nodes"][0]["hostname"]
            if hostname.split("-")[-1] == "0":
                log.error(f"TPU master node {nodename} not found")
            else:
                log.info(f"Deleted TPU 'phantom' node {nodename}")
        # NotFound means the node is technically already deleted — success.
        return True