services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py
def _determine_faulty_nodes(nodes: List[Dict[str, Any]], unconnected_instances: Dict[str, List[str]],
instance_uptime: Dict[str, int]) -> Tuple[Dict[str, List[Any]], List[str]]:
"""
Determine all nodes that are in a faulty state and should thus be turned off
:param unconnected_instances: Names of all nodes that are currently starting up
:param nodes: Currently connected nodes (List of dicts)
:param instance_uptime: Duration about how long each node has been running
:return: (Dict[label, List(node)] containing nodes that are faulty, instances not found in jenkins)
"""
label2faulty_nodes: DefaultDict[str, List[Any]] = defaultdict(list)
orphaned_instances = []
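# label2faulty_nodes maps a Jenkins label to the faulty nodes detected for it;
# orphaned_instances collects instance names that are running but not known to Jenkins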
# Determine instances that failed to start up. This sometimes happens with Windows slaves
for label, instances in unconnected_instances.items():
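# The maximum allowed startup time is configured per label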
maximum_startup_limit = _maximum_startup_time()[label]
for instance in instances:
node = _find_node_by_name(nodes, instance)
if not node:
# Autoscaling instance not known to jenkins
logging.error('Could not find node_data for %s, marking it as an orphaned instance for termination', instance)
orphaned_instances.append(instance)
continue
uptime = instance_uptime[instance]
if uptime > maximum_startup_limit:
logging.warning('Instance %s failed to start up within the allowed %d seconds (uptime: %d seconds)',
instance, maximum_startup_limit, uptime)
label2faulty_nodes[label].append(node)
for node in nodes:
if node['displayName'] in ['master', 'Built-In Node']:
# Don't do anything for master
continue
label = _managed_node_label(node)
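# A falsy label indicates the node is not managed by this autoscaling setup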
if not label:
logging.info('Slave %s is not managed by auto scaling. Ignoring it when determining faulty instances',
node['displayName'])
# Turn off slaves that have been marked as offline by Jenkins due to things like too low disk space
elif node['temporarilyOffline'] and node['offlineCause'] and \
node['offlineCause']['_class'].startswith(NODE_MONITOR_OFFLINE_CAUSE):
logging.warning('Instance %s has been marked as offline by Jenkins monitoring due to "%s"',
node['displayName'], node['offlineCauseReason'])
label2faulty_nodes[label].append(node)
# Turn off slaves that have been marked to downscale but have not been downscaled
elif node['offlineCauseReason'] == DOWNSCALE_REASON or node['offlineCauseReason'] \
.startswith(DOWNSCALE_MANUAL_REASON):
logging.warning('Instance %s has been marked to downscale but has not scaled down: "%s"',
node['displayName'], node['offlineCauseReason'])
label2faulty_nodes[label].append(node)
# Delete node slots that have been created but don't have a matching instance
elif node['displayName'] not in instance_uptime:
logging.warning('Slot for %s has been created but instance never started', node['displayName'])
label2faulty_nodes[label].append(node)
# Remove empty lists, caused by the usage of defaultdict()
return ({key: val for key, val in label2faulty_nodes.items() if val}, orphaned_instances)
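# Illustrative usage (a sketch, not part of this module): one way a caller might consume the
# return value. The helper names _terminate_instances and _set_node_offline are hypothetical.
#
#     faulty_nodes, orphaned_instances = _determine_faulty_nodes(nodes, unconnected_instances, instance_uptime)
#     _terminate_instances(orphaned_instances)
#     for label, faulty in faulty_nodes.items():
#         for node in faulty:
#             _set_node_offline(node)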