def determine_scale_down_nodes()

in services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py [0:0]


def determine_scale_down_nodes(nodes_data: List[Dict[str, Any]], instance_uptime: Dict[str, int]) \
        -> Dict[str, List[str]]:
    """
    Determine which instances should be shut down due to idle.

    A node becomes a scale-down candidate if it is online and idle, a managed label can be extracted for it,
    that label is one of the managed Jenkins node labels and its architecture monitor data is populated.
    Afterwards, for every label with a configured warm pool size, that many randomly chosen candidates are
    taken off the list again so they stay available. If a label has fewer candidates than its warm pool size,
    all of its candidates are kept.

    :param nodes_data: Currently connected nodes (List of dicts)
    :param instance_uptime: Duration about how long each node has been running (currently not used here)
    :return: Dict(label, list(nodes_to_disable)). NOTE: despite the annotation, the lists hold the node dicts
             taken from nodes_data, not node names. Labels without any node to disable are omitted.
    """
    nodes_to_disable: DefaultDict[str, list] = defaultdict(list)
    considered_nodes: DefaultDict[str, list] = defaultdict(list)

    for node_data in nodes_data:
        # Only nodes that are online and idle can be scaled down
        if node_data['offline'] or not node_data['idle']:
            continue

        display_name = node_data['displayName']
        label = _managed_node_label(node_data)
        if not label:
            logging.error('Could not extract the managed label for node %s', display_name)
            continue

        # Check if label is managed - otherwise skip
        if label not in _managed_jenkins_node_labels():
            logging.debug('Label %s is not managed, skipping...', label)
            continue

        # TODO: Add a label that marks reserved instances

        if node_data['monitorData']['hudson.node_monitors.ArchitectureMonitor'] is None:
            # Sometimes, the architecture monitor is not set. This is a race condition and can be
            # ignored since the information is available within the next turn
            logging.info('Architecture has not been propagated for %s, ignoring until next scale_down check',
                         display_name)
            continue

        # TODO: Check for how long an instance has been idling. There is no built-in API for now and the
        # only way is to go through the entire Jenkins build history. Save this up for later.

        nodes_to_disable[label].append(node_data)
        considered_nodes[label].append(node_data)

    # Leave some buffer for warm pool. This code makes sure to always leave X instances in idle while scaling down.
    # For example: 5 instances running, 3 in idle, WARM_POOL_SIZE set to 2. This code will remove only 1 instance,
    # leading to 4 instances running and 2 in idle.
    for warm_label, warm_value in _warm_pool_node_counts().items():
        # .get() avoids inserting empty entries for labels that have no candidates at all
        cur_nodes = nodes_to_disable.get(warm_label)
        if not cur_nodes:
            continue

        # Considered nodes that are no longer scheduled for shutdown already count towards the warm pool
        warm_value -= (len(considered_nodes[warm_label]) - len(cur_nodes))

        # Never try to keep more nodes than there are candidates: once the list is empty,
        # random.randrange() would be called with an empty range and raise ValueError.
        keep_count = min(warm_value, len(cur_nodes))
        for _ in range(keep_count):
            # Pop a random entry. Otherwise, the first node is never going to be shut down
            cur_nodes.pop(random.randrange(len(cur_nodes)))

    # Remove empty lists, caused by the usage of defaultdict() or by keeping every candidate of a label
    return {key: val for key, val in nodes_to_disable.items() if val}