# gpudirect-tcpxo/topology-scheduler/schedule-daemon.py
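# Assumed context from the rest of schedule-daemon.py (this excerpt starts
# mid-file): parse_quantity is presumably kubernetes.utils.parse_quantity,
# and get_pod_used_resources() / node_topology_key() are helpers defined
# elsewhere in this file.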
def find_schedulable_nodes(nodes, pods, tolerated_taints):
  """Finds nodes that pods can be scheduled onto, keyed by node name."""
  nodes_info = {}
  # Index tolerated taints by key for quick lookup in the taint check below.
  if tolerated_taints is not None:
    tolerated_taint_dict = {t.key: t for t in tolerated_taints}
  else:
    tolerated_taint_dict = {}
  for node in nodes:
    node_name = node.metadata.name
    node_labels = node.metadata.labels
    # Only nodes in a placement group carry the topology labels we need.
    if 'cloud.google.com/gke-placement-group' not in node_labels:
      print(
          f'Skipping node {node_name} because it does not have topology'
          ' metadata'
      )
      continue
    skip_node = False
    # check node taints
    if node.spec.taints is not None:
      for t in node.spec.taints:
        if t.key not in tolerated_taint_dict:
          print(f'Skipping node {node_name} because it is tainted with key {t.key}')
          skip_node = True
          break
        else:
          tol = tolerated_taint_dict[t.key]
          if tol.operator == "Equal" and tol.value != t.value:
            print(f'Skipping node {node_name} because it is tainted with key {t.key} with value {t.value}')
            skip_node = True
            break
    # check node status: mark NotReady nodes and fall through to the
    # skip_node check; a 'break' here would abort the whole node loop.
    if any(condition.type == "Ready" and condition.status != "True"
           for condition in node.status.conditions):
      print(f'Skipping node {node_name} because it is NotReady')
      skip_node = True
    if skip_node:
      continue
    allocatable = node.status.allocatable
    # Sum the requests of pods already bound to this node, then subtract
    # them from the node's allocatable capacity.
    used_cpu = 0
    used_memory = 0
    used_gpu = 0
    for pod in pods:
      if pod.spec.node_name == node_name:
        cpu, mem, gpu = get_pod_used_resources(pod)
        used_cpu += cpu
        used_memory += mem
        used_gpu += gpu
    free_cpu = parse_quantity(allocatable['cpu']) - used_cpu
    free_memory = parse_quantity(allocatable['memory']) - used_memory
    free_gpu = int(allocatable.get('nvidia.com/gpu', 0)) - used_gpu
    node_info = {
        'name': node_name,
        'cpu': free_cpu,
        'memory': free_memory,
        'gpu': free_gpu,
        'node_labels': node_labels,
    }
    nodes_info[node_name] = node_info
    print(
        f'Node: {node_name}, CPU: {free_cpu}, Memory: {free_memory}, GPU:'
        f' {free_gpu}, Topology: {node_topology_key(node_info)}'
    )
  return nodes_info
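
# Illustrative usage sketch, not part of the upstream daemon: fetch nodes and
# pods with the standard kubernetes Python client and filter them through
# find_schedulable_nodes. CoreV1Api.list_node() and
# CoreV1Api.list_pod_for_all_namespaces() are real client methods; the
# toleration below is only an example.
def _example_find_schedulable():
  from kubernetes import client, config

  config.load_incluster_config()  # use load_kube_config() when run off-cluster
  v1 = client.CoreV1Api()
  nodes = v1.list_node().items
  pods = v1.list_pod_for_all_namespaces().items
  # Tolerate the standard NVIDIA GPU taint so GPU nodes pass the taint check.
  tolerations = [
      client.V1Toleration(
          key='nvidia.com/gpu', operator='Exists', effect='NoSchedule')
  ]
  return find_schedulable_nodes(nodes, pods, tolerations)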