in cluster-autoscaler/core/static_autoscaler.go [222:589]
func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError {
a.cleanUpIfRequired()
a.processorCallbacks.reset()
a.clusterStateRegistry.PeriodicCleanup()
a.DebuggingSnapshotter.StartDataCollection()
defer a.DebuggingSnapshotter.Flush()
unschedulablePodLister := a.UnschedulablePodLister()
scheduledPodLister := a.ScheduledPodLister()
pdbLister := a.PodDisruptionBudgetLister()
scaleDown := a.scaleDown
autoscalingContext := a.AutoscalingContext
klog.V(4).Info("Starting main loop")
stateUpdateStart := time.Now()
// Get nodes and pods currently living on cluster
allNodes, readyNodes, typedErr := a.obtainNodeLists(a.CloudProvider)
if typedErr != nil {
klog.Errorf("Failed to get node list: %v", typedErr)
return typedErr
}
originalScheduledPods, err := scheduledPodLister.List()
if err != nil {
klog.Errorf("Failed to list scheduled pods: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
if abortLoop, err := a.processors.ActionableClusterProcessor.ShouldAbort(
a.AutoscalingContext, allNodes, readyNodes, currentTime); abortLoop {
return err
}
// Update cluster resource usage metrics
coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
metrics.UpdateClusterCPUCurrentCores(coresTotal)
metrics.UpdateClusterMemoryCurrentBytes(memoryTotal)
daemonsets, err := a.ListerRegistry.DaemonSetLister().List(labels.Everything())
if err != nil {
klog.Errorf("Failed to get daemonset list: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
// Call CloudProvider.Refresh before any other calls to cloud provider.
refreshStart := time.Now()
err = a.AutoscalingContext.CloudProvider.Refresh()
metrics.UpdateDurationFromStart(metrics.CloudProviderRefresh, refreshStart)
if err != nil {
klog.Errorf("Failed to refresh cloud provider config: %v", err)
return errors.ToAutoscalerError(errors.CloudProviderError, err)
}
// Update node groups min/max after cloud provider refresh
for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
}
nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
// Initialize cluster state to ClusterSnapshot
if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {
return typedErr.AddPrefix("Initialize ClusterSnapshot")
}
nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingContext, readyNodes, daemonsets, a.ignoredTaints, currentTime)
if autoscalerError != nil {
klog.Errorf("Failed to get node infos for groups: %v", autoscalerError)
return autoscalerError.AddPrefix("failed to build node infos for node groups: ")
}
a.DebuggingSnapshotter.SetTemplateNodes(nodeInfosForGroups)
nodeInfosForGroups, err = a.processors.NodeInfoProcessor.Process(autoscalingContext, nodeInfosForGroups)
if err != nil {
klog.Errorf("Failed to process nodeInfos: %v", err)
return errors.ToAutoscalerError(errors.InternalError, err)
}
if typedErr := a.updateClusterState(allNodes, nodeInfosForGroups, currentTime); typedErr != nil {
klog.Errorf("Failed to update cluster state: %v", typedErr)
return typedErr
}
metrics.UpdateDurationFromStart(metrics.UpdateState, stateUpdateStart)
scaleUpStatus := &status.ScaleUpStatus{Result: status.ScaleUpNotTried}
scaleUpStatusProcessorAlreadyCalled := false
scaleDownStatus := &status.ScaleDownStatus{Result: status.ScaleDownNotTried}
scaleDownStatusProcessorAlreadyCalled := false
defer func() {
// Update status information when the loop is done (regardless of reason)
if autoscalingContext.WriteStatusConfigMap {
status := a.clusterStateRegistry.GetStatus(currentTime)
utils.WriteStatusConfigMap(autoscalingContext.ClientSet, autoscalingContext.ConfigNamespace,
status.GetReadableString(), a.AutoscalingContext.LogRecorder, a.AutoscalingContext.StatusConfigMapName)
}
// This deferred processor execution allows the processors to handle a situation when a scale-(up|down)
// wasn't even attempted because e.g. the iteration exited earlier.
if !scaleUpStatusProcessorAlreadyCalled && a.processors != nil && a.processors.ScaleUpStatusProcessor != nil {
a.processors.ScaleUpStatusProcessor.Process(a.AutoscalingContext, scaleUpStatus)
}
if !scaleDownStatusProcessorAlreadyCalled && a.processors != nil && a.processors.ScaleDownStatusProcessor != nil {
scaleDownStatus.SetUnremovableNodesInfo(scaleDown.unremovableNodeReasons, scaleDown.nodeUtilizationMap, scaleDown.context.CloudProvider)
a.processors.ScaleDownStatusProcessor.Process(a.AutoscalingContext, scaleDownStatus)
}
err := a.processors.AutoscalingStatusProcessor.Process(a.AutoscalingContext, a.clusterStateRegistry, currentTime)
if err != nil {
klog.Errorf("AutoscalingStatusProcessor error: %v.", err)
}
}()
// Check if there are any nodes that failed to register in Kubernetes
// master.
unregisteredNodes := a.clusterStateRegistry.GetUnregisteredNodes()
if len(unregisteredNodes) > 0 {
klog.V(1).Infof("%d unregistered nodes present", len(unregisteredNodes))
removedAny, err := removeOldUnregisteredNodes(unregisteredNodes, autoscalingContext,
a.clusterStateRegistry, currentTime, autoscalingContext.LogRecorder)
// There was a problem with removing unregistered nodes. Retry in the next loop.
if err != nil {
klog.Warningf("Failed to remove unregistered nodes: %v", err)
}
if removedAny {
klog.V(0).Infof("Some unregistered nodes were removed, skipping iteration")
return nil
}
}
if !a.clusterStateRegistry.IsClusterHealthy() {
klog.Warning("Cluster is not ready for autoscaling")
scaleDown.CleanUpUnneededNodes()
autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "ClusterUnhealthy", "Cluster is unhealthy")
return nil
}
if a.deleteCreatedNodesWithErrors() {
klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
return nil
}
// Check if there has been a constant difference between the number of nodes in k8s and
// the number of nodes on the cloud provider side.
// TODO: andrewskim - add protection for ready AWS nodes.
fixedSomething, err := fixNodeGroupSize(autoscalingContext, a.clusterStateRegistry, currentTime)
if err != nil {
klog.Errorf("Failed to fix node group sizes: %v", err)
return errors.ToAutoscalerError(errors.CloudProviderError, err)
}
if fixedSomething {
klog.V(0).Infof("Some node group target size was fixed, skipping the iteration")
return nil
}
metrics.UpdateLastTime(metrics.Autoscaling, time.Now())
unschedulablePods, err := unschedulablePodLister.List()
if err != nil {
klog.Errorf("Failed to list unscheduled pods: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
metrics.UpdateUnschedulablePodsCount(len(unschedulablePods))
unschedulablePods = tpu.ClearTPURequests(unschedulablePods)
// todo: move split and append below to separate PodListProcessor
// Some unschedulable pods can be waiting for lower priority pods preemption so they have nominated node to run.
// Such pods don't require scale up but should be considered during scale down.
unschedulablePods, unschedulableWaitingForLowerPriorityPreemption := core_utils.FilterOutExpendableAndSplit(unschedulablePods, allNodes, a.ExpendablePodsPriorityCutoff)
// modify the snapshot simulating scheduling of pods waiting for preemption.
// this is not strictly correct as we are not simulating preemption itself but it matches
// CA logic from before migration to scheduler framework. So let's keep it for now
for _, p := range unschedulableWaitingForLowerPriorityPreemption {
if err := a.ClusterSnapshot.AddPod(p, p.Status.NominatedNodeName); err != nil {
klog.Errorf("Failed to update snapshot with pod %s waiting for preemption", err)
return errors.ToAutoscalerError(errors.InternalError, err)
}
}
// add upcoming nodes to ClusterSnapshot
upcomingNodes := getUpcomingNodeInfos(a.clusterStateRegistry, nodeInfosForGroups)
for _, upcomingNode := range upcomingNodes {
var pods []*apiv1.Pod
for _, podInfo := range upcomingNode.Pods {
pods = append(pods, podInfo.Pod)
}
err = a.ClusterSnapshot.AddNodeWithPods(upcomingNode.Node(), pods)
if err != nil {
klog.Errorf("Failed to add upcoming node %s to cluster snapshot: %v", upcomingNode.Node().Name, err)
return errors.ToAutoscalerError(errors.InternalError, err)
}
}
l, err := a.ClusterSnapshot.NodeInfos().List()
if err != nil {
klog.Errorf("Unable to fetch ClusterNode List for Debugging Snapshot, %v", err)
} else {
a.AutoscalingContext.DebuggingSnapshotter.SetClusterNodes(l)
}
unschedulablePodsToHelp, _ := a.processors.PodListProcessor.Process(a.AutoscalingContext, unschedulablePods)
// finally, filter out pods that are too "young" to safely be considered for a scale-up (delay is configurable)
unschedulablePodsToHelp = a.filterOutYoungPods(unschedulablePodsToHelp, currentTime)
if len(unschedulablePodsToHelp) == 0 {
scaleUpStatus.Result = status.ScaleUpNotNeeded
klog.V(1).Info("No unschedulable pods")
} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
scaleUpStatus.Result = status.ScaleUpNoOptionsAvailable
klog.V(1).Info("Max total nodes in cluster reached")
} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
// The assumption here is that these pods have been created very recently and probably there
// is more pods to come. In theory we could check the newest pod time but then if pod were created
// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
// We also want to skip a real scale down (just like if the pods were handled).
a.processorCallbacks.DisableScaleDownForLoop()
scaleUpStatus.Result = status.ScaleUpInCooldown
klog.V(1).Info("Unschedulable pods are very new, waiting one iteration for more")
} else {
scaleUpStart := time.Now()
metrics.UpdateLastTime(metrics.ScaleUp, scaleUpStart)
scaleUpStatus, typedErr = ScaleUp(autoscalingContext, a.processors, a.clusterStateRegistry, unschedulablePodsToHelp, readyNodes, daemonsets, nodeInfosForGroups, a.ignoredTaints)
metrics.UpdateDurationFromStart(metrics.ScaleUp, scaleUpStart)
if a.processors != nil && a.processors.ScaleUpStatusProcessor != nil {
a.processors.ScaleUpStatusProcessor.Process(autoscalingContext, scaleUpStatus)
scaleUpStatusProcessorAlreadyCalled = true
}
if typedErr != nil {
klog.Errorf("Failed to scale up: %v", typedErr)
return typedErr
}
if scaleUpStatus.Result == status.ScaleUpSuccessful {
a.lastScaleUpTime = currentTime
// No scale down in this iteration.
scaleDownStatus.Result = status.ScaleDownInCooldown
return nil
}
}
if a.ScaleDownEnabled {
pdbs, err := pdbLister.List()
if err != nil {
scaleDownStatus.Result = status.ScaleDownError
klog.Errorf("Failed to list pod disruption budgets: %v", err)
return errors.ToAutoscalerError(errors.ApiCallError, err)
}
unneededStart := time.Now()
klog.V(4).Infof("Calculating unneeded nodes")
scaleDown.CleanUp(currentTime)
var scaleDownCandidates []*apiv1.Node
var podDestinations []*apiv1.Node
// podDestinations and scaleDownCandidates are initialized based on allNodes variable, which contains only
// registered nodes in cluster.
// It does not include any upcoming nodes which can be part of clusterSnapshot. As an alternative to using
// allNodes here, we could use nodes from clusterSnapshot and explicitly filter out upcoming nodes here but it
// is of little (if any) benefit.
if a.processors == nil || a.processors.ScaleDownNodeProcessor == nil {
scaleDownCandidates = allNodes
podDestinations = allNodes
} else {
var err errors.AutoscalerError
scaleDownCandidates, err = a.processors.ScaleDownNodeProcessor.GetScaleDownCandidates(
autoscalingContext, allNodes)
if err != nil {
klog.Error(err)
return err
}
podDestinations, err = a.processors.ScaleDownNodeProcessor.GetPodDestinationCandidates(autoscalingContext, allNodes)
if err != nil {
klog.Error(err)
return err
}
}
// We use scheduledPods (not originalScheduledPods) here, so artificial scheduled pods introduced by processors
// (e.g unscheduled pods with nominated node name) can block scaledown of given node.
if typedErr := scaleDown.UpdateUnneededNodes(podDestinations, scaleDownCandidates, currentTime, pdbs); typedErr != nil {
scaleDownStatus.Result = status.ScaleDownError
klog.Errorf("Failed to scale down: %v", typedErr)
return typedErr
}
metrics.UpdateDurationFromStart(metrics.FindUnneeded, unneededStart)
if klog.V(4).Enabled() {
for key, val := range scaleDown.unneededNodes {
klog.Infof("%s is unneeded since %s duration %s", key, val.String(), currentTime.Sub(val).String())
}
}
scaleDownInCooldown := a.processorCallbacks.disableScaleDownForLoop ||
a.lastScaleUpTime.Add(a.ScaleDownDelayAfterAdd).After(currentTime) ||
a.lastScaleDownFailTime.Add(a.ScaleDownDelayAfterFailure).After(currentTime) ||
a.lastScaleDownDeleteTime.Add(a.ScaleDownDelayAfterDelete).After(currentTime)
// In dry run only utilization is updated
calculateUnneededOnly := scaleDownInCooldown || scaleDown.nodeDeletionTracker.IsNonEmptyNodeDeleteInProgress()
klog.V(4).Infof("Scale down status: unneededOnly=%v lastScaleUpTime=%s "+
"lastScaleDownDeleteTime=%v lastScaleDownFailTime=%s scaleDownForbidden=%v "+
"isDeleteInProgress=%v scaleDownInCooldown=%v",
calculateUnneededOnly, a.lastScaleUpTime,
a.lastScaleDownDeleteTime, a.lastScaleDownFailTime, a.processorCallbacks.disableScaleDownForLoop,
scaleDown.nodeDeletionTracker.IsNonEmptyNodeDeleteInProgress(), scaleDownInCooldown)
metrics.UpdateScaleDownInCooldown(scaleDownInCooldown)
if scaleDownInCooldown {
scaleDownStatus.Result = status.ScaleDownInCooldown
} else if scaleDown.nodeDeletionTracker.IsNonEmptyNodeDeleteInProgress() {
scaleDownStatus.Result = status.ScaleDownInProgress
} else {
klog.V(4).Infof("Starting scale down")
// We want to delete unneeded Node Groups only if there was no recent scale up,
// and there is no current delete in progress and there was no recent errors.
removedNodeGroups, err := a.processors.NodeGroupManager.RemoveUnneededNodeGroups(autoscalingContext)
if err != nil {
klog.Errorf("Error while removing unneeded node groups: %v", err)
}
scaleDownStart := time.Now()
metrics.UpdateLastTime(metrics.ScaleDown, scaleDownStart)
scaleDownStatus, typedErr := scaleDown.TryToScaleDown(currentTime, pdbs)
metrics.UpdateDurationFromStart(metrics.ScaleDown, scaleDownStart)
metrics.UpdateUnremovableNodesCount(scaleDown.getUnremovableNodesCount())
scaleDownStatus.RemovedNodeGroups = removedNodeGroups
if scaleDownStatus.Result == status.ScaleDownNodeDeleteStarted {
a.lastScaleDownDeleteTime = currentTime
a.clusterStateRegistry.Recalculate()
}
if (scaleDownStatus.Result == status.ScaleDownNoNodeDeleted ||
scaleDownStatus.Result == status.ScaleDownNoUnneeded) &&
a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount != 0 {
scaleDown.SoftTaintUnneededNodes(allNodes)
}
if a.processors != nil && a.processors.ScaleDownStatusProcessor != nil {
scaleDownStatus.SetUnremovableNodesInfo(scaleDown.unremovableNodeReasons, scaleDown.nodeUtilizationMap, scaleDown.context.CloudProvider)
a.processors.ScaleDownStatusProcessor.Process(autoscalingContext, scaleDownStatus)
scaleDownStatusProcessorAlreadyCalled = true
}
if typedErr != nil {
klog.Errorf("Failed to scale down: %v", typedErr)
a.lastScaleDownFailTime = currentTime
return typedErr
}
}
}
return nil
}