def _determine_faulty_nodes()

in services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
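
The function relies on several module-level helpers and constants defined elsewhere in handler.py. Their names are taken from the call sites below; the bodies, signatures, and values in this sketch are hypothetical stubs added only to make the excerpt self-contained:

import logging
from collections import defaultdict
from typing import Any, DefaultDict, Dict, List, Optional, Tuple

# Illustrative stand-ins: the real values live elsewhere in handler.py
NODE_MONITOR_OFFLINE_CAUSE = 'hudson.node_monitors'
DOWNSCALE_REASON = 'Downscaling'
DOWNSCALE_MANUAL_REASON = 'Manual downscale'


def _maximum_startup_time() -> Dict[str, int]:
    # Hypothetical stub: maximum allowed startup duration (seconds) per label
    return defaultdict(lambda: 600)


def _find_node_by_name(nodes: List[Dict[str, Any]], name: str) -> Optional[Dict[str, Any]]:
    # Hypothetical stub: look up a Jenkins node dict by its displayName
    return next((node for node in nodes if node['displayName'] == name), None)


def _managed_node_label(node: Dict[str, Any]) -> Optional[str]:
    # Hypothetical stub: the autoscaling label of a managed node, else None
    return node.get('autoscalingLabel')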


def _determine_faulty_nodes(nodes: List[Dict[str, Any]], unconnected_instances: Dict[str, List[str]],
                            instance_uptime: Dict[str, int]) -> Tuple[Dict[str, List[Any]], List[str]]:
    """
    Determine all nodes that are in a faulty state and should thus be turned off
    :param unconnected_instances: Names of all nodes that are currently starting up
    :param nodes: Currently connected nodes (List of dicts)
    :param instance_uptime: Duration about how long each node has been running
    :return: (Dict[label, List(node)] containing nodes that are faulty, instances not found in jenkins)

    """
    label2faulty_nodes: DefaultDict[str, List[Any]] = defaultdict(list)
    orphaned_instances = []

    # Determine instances that failed to start up. This sometimes happens with Windows slaves
    for label, instances in unconnected_instances.items():
        maximum_startup_limit = _maximum_startup_time()[label]
        for instance in instances:
            node = _find_node_by_name(nodes, instance)
            if not node:
                # Autoscaling instance not known to jenkins
                logging.error('Could not find node data for %s; marking as orphaned instance for termination', instance)
                orphaned_instances.append(instance)
                continue

            uptime = instance_uptime[instance]
            if uptime > maximum_startup_limit:
                logging.warning('Instance %s failed to start up within the %d second limit (uptime: %d seconds)',
                                instance, maximum_startup_limit, uptime)
                label2faulty_nodes[label].append(node)

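    # Inspect every node currently known to Jenkins and flag faulty ones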
    for node in nodes:
        if node['displayName'] in ['master', 'Built-In Node']:
            # Don't do anything for master
            continue

        label = _managed_node_label(node)

        if not label:
            logging.info('Slave %s is not managed by autoscaling; ignoring it when determining faulty instances',
                         node['displayName'])
        # Turn off slaves that Jenkins has marked offline, e.g. due to low disk space
        elif node['temporarilyOffline'] and node['offlineCause'] and \
                node['offlineCause']['_class'].startswith(NODE_MONITOR_OFFLINE_CAUSE):
            logging.warning('Instance %s has been marked as offline by Jenkins monitoring due to "%s"',
                            node['displayName'], node['offlineCauseReason'])
            label2faulty_nodes[label].append(node)
        # Turn off slaves that have been marked to downscale but have not been downscaled
        elif node['offlineCauseReason'] == DOWNSCALE_REASON or node['offlineCauseReason'] \
                .startswith(DOWNSCALE_MANUAL_REASON):
            logging.warning('Instance %s has been marked to downscale but has not scaled down: "%s"',
                            node['displayName'], node['offlineCauseReason'])
            label2faulty_nodes[label].append(node)
        # Delete node slots that have been created but don't have a matching instance
        elif node['displayName'] not in instance_uptime:
            logging.warning('Slot for %s has been created but instance never started', node['displayName'])
            label2faulty_nodes[label].append(node)

    # Filter out any empty lists (defensive, given the use of defaultdict()) and
    # return the faulty nodes together with the orphaned instances
    return {key: val for key, val in label2faulty_nodes.items() if val}, orphaned_instances
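
A minimal usage sketch, assuming the hypothetical stubs above; the sample node dicts mirror the Jenkins fields the function reads (displayName, temporarilyOffline, offlineCause, offlineCauseReason), and the autoscalingLabel key exists only for the stubbed _managed_node_label:

if __name__ == '__main__':
    # Hypothetical sample data: one node that Jenkins marked offline via its
    # node monitoring (e.g. low disk space), plus the master node
    sample_nodes = [
        {'displayName': 'mxnetlinux-cpu_1', 'temporarilyOffline': True,
         'offlineCause': {'_class': 'hudson.node_monitors.DiskSpaceMonitor'},
         'offlineCauseReason': 'Disk space is too low',
         'autoscalingLabel': 'mxnetlinux-cpu'},
        {'displayName': 'master', 'temporarilyOffline': False,
         'offlineCause': None, 'offlineCauseReason': ''},
    ]
    # 'mxnetwindows-cpu_7' is starting up but unknown to Jenkins, so it is
    # reported as orphaned
    faulty, orphaned = _determine_faulty_nodes(
        nodes=sample_nodes,
        unconnected_instances={'mxnetwindows-cpu': ['mxnetwindows-cpu_7']},
        instance_uptime={'mxnetlinux-cpu_1': 120})
    print(faulty)    # {'mxnetlinux-cpu': [<the offline linux node dict>]}
    print(orphaned)  # ['mxnetwindows-cpu_7']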