in services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py [0:0]
def determine_scale_down_nodes(nodes_data: List[Dict[str, Any]], instance_uptime: Dict[str, int]) \
        -> Dict[str, List[Dict[str, Any]]]:
    """
    Determine which instances should be shut down due to idle.

    A node becomes a shutdown candidate when it is online, idle, carries a managed label and its
    architecture monitor data has been populated. Afterwards, for every label returned by
    ``_warm_pool_node_counts()``, randomly chosen candidates are taken off the list again so that
    they stay available as warm pool.

    :param nodes_data: Currently connected nodes (List of dicts)
    :param instance_uptime: Duration about how long each node has been running. Not evaluated by this
                            function at the moment; kept for interface compatibility.
    :return: Dict(label, list(nodes_to_disable)). The list entries are the node dicts taken from
             ``nodes_data``. Labels without any node to disable are omitted.
    """
    nodes_to_disable: DefaultDict[str, list] = defaultdict(list)
    considered_nodes: DefaultDict[str, list] = defaultdict(list)

    for node_data in nodes_data:
        # Only nodes that are online and idle are candidates for a shutdown
        if node_data['offline'] or not node_data['idle']:
            continue

        display_name = node_data['displayName']

        label = _managed_node_label(node_data)
        if not label:
            logging.error('Could not extract the managed label for node %s', display_name)
            continue

        # Check if label is managed - otherwise skip
        if label not in _managed_jenkins_node_labels():
            logging.debug('Label %s is not managed, skipping...', label)
            continue

        # TODO: Add a label that marks reserved instances

        if node_data['monitorData']['hudson.node_monitors.ArchitectureMonitor'] is None:
            # Sometimes, the architecture monitor is not set. This is a race condition and can be
            # ignored since the information is available within the next turn
            logging.info('Architecture has not been propagated for %s, ignoring until next scale_down check',
                         display_name)
            continue

        # TODO: Check for how long an instance has been idling. There is no built-in API for now and the
        # only way is to go through the entire Jenkins build history. Save this up for later.
        nodes_to_disable[label].append(node_data)
        considered_nodes[label].append(node_data)

    # Leave some buffer for warm pool. This code makes sure to always leave X instances in idle while scaling down.
    # For example: 5 instances running, 3 in idle, WARM_POOL_SIZE set to 2. This code will remove only 1 instance,
    # leading to 4 instances running and 2 in idle.
    for warm_label, warm_value in _warm_pool_node_counts().items():
        cur_nodes = nodes_to_disable[warm_label]
        cur_considered_nodes = considered_nodes[warm_label]
        if not cur_nodes:
            continue

        # Considered nodes that are not scheduled for shutdown already count towards the warm pool.
        # Both lists are currently filled together, so this correction is zero for now.
        warm_value -= (len(cur_considered_nodes) - len(cur_nodes))

        # Never try to keep more nodes than there are candidates: random.randrange() raises a ValueError
        # on an empty range, which would happen as soon as the warm pool is larger than the number of
        # idle candidates. A non-positive value results in an empty range, so nothing is kept back.
        nb_nodes_to_keep = min(warm_value, len(cur_nodes))
        for _ in range(nb_nodes_to_keep):
            # Pop a random entry. Otherwise, the first node is never going to be shut down
            cur_nodes.pop(random.randrange(0, len(cur_nodes)))

    # Remove empty lists, caused by the usage of defaultdict()
    return {key: val for key, val in nodes_to_disable.items() if val}