in gpudirect-tcpxo/topology-scheduler/schedule-daemon.py [0:0]
def find_schedulable_pods(pods, gate_name):
  """Finds pods that can be scheduled.

  A pod is considered schedulable when it carries a scheduling gate whose
  name equals `gate_name`. For each such pod this collects the identity the
  scheduler needs (namespace, job name, completion index, creation time)
  and sums the CPU / memory / GPU requests across all of its containers.

  Missing labels or timestamps are logged but do NOT exclude the pod from
  the result; the corresponding fields are left as None.

  Args:
    pods: Iterable of Kubernetes V1Pod objects to inspect.
    gate_name: Name of the scheduling gate that marks a pod as waiting for
      this scheduler.

  Returns:
    Dict keyed by pod name (note: namespace is not part of the key, so
    same-named pods in different namespaces overwrite each other), mapping
    to a dict with the pod's name, namespace, index, aggregated cpu/memory/
    gpu requests, node_selector, spec, metadata, job_name and creation_time.
  """
  pods_to_schedule = {}
  for pod in pods:
    gates = pod.spec.scheduling_gates
    # Guard clause: skip pods without a matching gate. `any()` also ensures
    # a pod is processed at most once even if the gate name were
    # (unexpectedly) listed multiple times.
    if not gates or not any(gate.name == gate_name for gate in gates):
      continue

    pod_name = pod.metadata.name
    pod_namespace = pod.metadata.namespace

    # Job identity comes from standard Job controller labels; absence is
    # logged because queueing below relies on these fields.
    pod_index = None
    job_name = None
    labels = pod.metadata.labels
    if labels is not None:
      if 'batch.kubernetes.io/job-completion-index' in labels:
        pod_index = labels['batch.kubernetes.io/job-completion-index']
      else:
        print('Unable to find index in metadata. Can not queue jobs')
      if 'job-name' in labels:
        job_name = labels['job-name']
      else:
        print('Unable to find job_name in metadata. Can not queue jobs')
    else:
      print('No labels on pod to extract job metadata from.')

    creation_time = None
    if pod.metadata.creation_timestamp is not None:
      creation_time = pod.metadata.creation_timestamp
    else:
      print(
          'Unable to find creation_time in metadata. Can not queue jobs'
      )

    # Aggregate resource requests over every container in the pod; a
    # container with no requests contributes zero.
    used_cpu = 0
    used_memory = 0
    used_gpu = 0
    for container in pod.spec.containers:
      requests = container.resources.requests or {}
      used_cpu += parse_quantity(requests.get('cpu', 0))
      used_memory += parse_quantity(requests.get('memory', 0))
      used_gpu += int(requests.get('nvidia.com/gpu', 0))

    pods_to_schedule[pod_name] = {
        'name': pod_name,
        'namespace': pod_namespace,
        'index': pod_index,
        'cpu': used_cpu,
        'memory': used_memory,
        'gpu': used_gpu,
        'node_selector': pod.spec.node_selector,
        'spec': pod.spec,
        'metadata': pod.metadata,
        'job_name': job_name,
        'creation_time': creation_time
    }
    print(
        f'Found schedulable pod: {pod_namespace}/{pod_name}, CPU:'
        f' {used_cpu}, Memory: {used_memory}, GPU: {used_gpu}'
        f' Index: {pod_index}'
    )
  return pods_to_schedule