def find_schedulable_nodes()

in gpudirect-tcpxo/topology-scheduler/schedule-daemon.py [0:0]


def find_schedulable_nodes(nodes, pods, tolerated_taints):
  """Finds nodes that can be scheduled and reports their free resources.

  A node is skipped, with a message printed to stdout, when it:
    * lacks the 'cloud.google.com/gke-placement-group' label,
    * carries a taint whose key is not among `tolerated_taints`, or whose
      matching toleration has operator "Equal" but a different value, or
    * reports a "Ready" condition whose status is not "True".

  Skipping one node never stops the scan; every remaining node is still
  evaluated.

  Note: taints are matched on key (and value for "Equal") only; the taint
  effect is not inspected.

  Args:
    nodes: Iterable of node objects exposing `metadata.name`,
      `metadata.labels`, `spec.taints`, `status.conditions` and
      `status.allocatable` (presumably Kubernetes V1Node objects; confirm
      against the caller).
    pods: Iterable of pod objects exposing `spec.node_name`. Pods whose
      `spec.node_name` equals a node's name count against that node.
    tolerated_taints: Iterable of tolerations exposing `key`, `operator`
      and `value`, or None when nothing is tolerated.

  Returns:
    Dict mapping node name to a dict with keys 'name', 'cpu', 'memory',
    'gpu' (free amounts: allocatable minus what the node's pods use) and
    'node_labels'.
  """
  nodes_info = {}

  # If several tolerations share a key, the last one wins.
  tolerated_taint_dict = {t.key: t for t in tolerated_taints or ()}

  # Group pods by node once so each node does not rescan the full pod list.
  pods_by_node = {}
  for pod in pods or ():
    pods_by_node.setdefault(pod.spec.node_name, []).append(pod)

  for node in nodes:
    node_name = node.metadata.name
    # Labels may be None; fall back to an empty mapping so the membership
    # test below cannot raise.
    node_labels = node.metadata.labels or {}

    if 'cloud.google.com/gke-placement-group' not in node_labels:
      print(
          f'Skipping node {node_name} because it does not have topology'
          ' metadata'
      )
      continue

    # check node taints
    skip_node = False
    for t in node.spec.taints or ():
      if t.key not in tolerated_taint_dict:
        print(f'Skipping node {node_name} because it is tainted with key {t.key}')
        skip_node = True
        break
      tol = tolerated_taint_dict[t.key]
      if tol.operator == "Equal" and tol.value != t.value:
        print(f'Skipping node {node_name} because it is tainted with key {t.key} with value {t.value}')
        skip_node = True
        break
    if skip_node:
      continue

    # check node status
    if any(
        condition.type == "Ready" and condition.status != "True"
        for condition in node.status.conditions or ()
    ):
      print(f'Skipping node {node_name} because it is NotReady')
      # Skip only this node; a `break` here would abandon all later nodes.
      continue

    allocatable = node.status.allocatable

    used_cpu = 0
    used_memory = 0
    used_gpu = 0

    for pod in pods_by_node.get(node_name, ()):
      cpu, mem, gpu = get_pod_used_resources(pod)
      used_cpu += cpu
      used_memory += mem
      used_gpu += gpu

    free_cpu = parse_quantity(allocatable['cpu']) - used_cpu
    free_memory = parse_quantity(allocatable['memory']) - used_memory
    # Nodes that advertise no GPU resource are treated as having zero GPUs.
    free_gpu = int(allocatable.get('nvidia.com/gpu', 0)) - used_gpu

    node_info = {
        'name': node_name,
        'cpu': free_cpu,
        'memory': free_memory,
        'gpu': free_gpu,
        'node_labels': node_labels,
    }
    nodes_info[node_name] = node_info

    print(
        f'Node: {node_name}, CPU: {free_cpu}, Memory: {free_memory}, GPU:'
        f' {free_gpu}, Topology: {node_topology_key(node_info)}'
    )

  return nodes_info