internal/controller/node_reconciler.go
package controllers
import (
"context"
"time"
pdbautoscaler "github.com/azure/eviction-autoscaler/api/v1"
"github.com/azure/eviction-autoscaler/internal/podutil"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)
// NodeReconciler reconciles Node objects, reacting to cordoned (unschedulable) nodes.
type NodeReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
}
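// NodeNameIndex is the field-index key (registered in SetupWithManager) used to list Pods by spec.nodeName.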
const NodeNameIndex = "spec.nodeName"
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=pods,verbs=watch;get;list
// Reconcile is the main loop of the controller. It looks for unschedulable (cordoned) nodes and, for every pod
// on the node, marks the pod as a disruption target and records the anticipated eviction on the matching EvictionAutoScaler.
func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
logger := log.FromContext(ctx)
// Fetch the Node instance
node := &corev1.Node{}
err := r.Get(ctx, req.NamespacedName, node)
if err != nil {
// should we use a finalizer to scale back down on deletion?
if errors.IsNotFound(err) {
return ctrl.Result{}, nil // Node not found, could be deleted, nothing to do
}
return ctrl.Result{}, err // Error fetching Node
}
if !node.Spec.Unschedulable {
return ctrl.Result{}, nil
}
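// List all Pods scheduled on this (now cordoned) node via the spec.nodeName field index.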
var podlist corev1.PodList
if err := r.List(ctx, &podlist, client.MatchingFields{NodeNameIndex: node.Name}); err != nil {
return ctrl.Result{}, err
}
podchanged := false
for _, pod := range podlist.Items {
// TODO group pods by namespace to share list/get of EvictionAutoScalers/pdbs
// Also could do this to avoid the list/lookup but need to measure if either helps
//if !possibleTarget(pod.GetOwnerReferences()) {
// continue
//}
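// Find the EvictionAutoScaler in this pod's namespace whose backing PDB selects the pod.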
evictionAutoScalerList := &pdbautoscaler.EvictionAutoScalerList{}
err = r.Client.List(ctx, evictionAutoScalerList, &client.ListOptions{Namespace: pod.Namespace})
if err != nil {
logger.Error(err, "Error: Unable to list EvictionAutoScalers")
return ctrl.Result{}, err
}
var applicableEvictionAutoScaler *pdbautoscaler.EvictionAutoScaler
for _, evictionAutoScaler := range evictionAutoScalerList.Items {
// Fetch the PDB using a 1:1 name mapping
pdb := &policyv1.PodDisruptionBudget{}
err = r.Get(ctx, types.NamespacedName{Name: evictionAutoScaler.Name, Namespace: evictionAutoScaler.Namespace}, pdb)
if err != nil {
if errors.IsNotFound(err) {
logger.Error(err, "no matching pdb", "namespace", evictionAutoScaler.Namespace, "name", evictionAutoScaler.Name)
continue
}
return ctrl.Result{}, err
}
// Check if the PDB selector matches the pod's labels
selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
if err != nil {
logger.Error(err, "Error: Invalid PDB selector", "pdbname", evictionAutoScaler.Name)
continue
}
if selector.Matches(labels.Set(pod.Labels)) {
applicableEvictionAutoScaler = evictionAutoScaler.DeepCopy()
break // should we keep going to ensure multiple EvictionAutoScalers don't match?
}
}
if applicableEvictionAutoScaler == nil {
continue
}
logger.Info("Found EvictionAutoScaler for pod", "name", applicableEvictionAutoScaler.Name, "namespace", pod.Namespace, "podname", pod.Name, "node", node.Name)
pod := pod.DeepCopy()
updatedpod := podutil.UpdatePodCondition(&pod.Status, &corev1.PodCondition{
Type: corev1.DisruptionTarget,
Status: corev1.ConditionTrue,
Reason: "EvictionAttempt",
Message: "eviction attempt anticipated by node cordon",
})
if updatedpod {
if err := r.Client.Status().Update(ctx, pod); err != nil {
logger.Error(err, "Error: Unable to update Pod status")
return ctrl.Result{}, err
}
}
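// Record the anticipated eviction on the EvictionAutoScaler so its controller is notified of the impending disruption.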
applicableEvictionAutoScaler.Spec.LastEviction = pdbautoscaler.Eviction{
PodName: pod.Name,
EvictionTime: metav1.Now(),
}
if err := r.Update(ctx, applicableEvictionAutoScaler); err != nil {
logger.Error(err, "unable to update EvictionAutoScaler", "name", applicableEvictionAutoScaler.Name)
return ctrl.Result{}, err
}
podchanged = true
}
// If we updated any pods, requeue so we keep updating them until they are gone or the node is
// uncordoned (no requeue is needed when no pods matched a PDB).
// TODO: pull the smallest cooldown from all EvictionAutoScalers if they allow defining it.
var cooldownNeeded time.Duration
if podchanged {
cooldownNeeded = cooldown
}
return ctrl.Result{RequeueAfter: cooldownNeeded}, nil
}
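// SetupWithManager registers the spec.nodeName field index on Pods and wires this controller to watch Nodes,
// filtering update events down to cordon/uncordon (spec.unschedulable) changes.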
func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
if err := mgr.GetFieldIndexer().IndexField(context.TODO(), &corev1.Pod{}, NodeNameIndex, func(rawObj client.Object) []string {
// Extract the spec.nodeName field
pod := rawObj.(*corev1.Pod)
if pod.Spec.NodeName == "" {
return nil // Don't index Pods without a NodeName
}
return []string{pod.Spec.NodeName}
}); err != nil {
return err
}
return ctrl.NewControllerManagedBy(mgr).
For(&corev1.Node{}).
WithEventFilter(predicate.Funcs{
// ignore status updates as we only care about cordon/uncordon changes.
UpdateFunc: func(ue event.UpdateEvent) bool {
oldNode := ue.ObjectOld.(*corev1.Node)
newNode := ue.ObjectNew.(*corev1.Node)
return oldNode.Spec.Unschedulable != newNode.Spec.Unschedulable
},
}).
Complete(r)
}
/*
func possibleTarget(owners []metav1.OwnerReference) bool {
// this is kind of funny since a deployment pod will be owned by a ReplicaSet, not the Deployment itself
for _, owner := range owners {
if owner.Kind == "ReplicaSet" || owner.Kind == "StatefulSet" {
return true
}
}
return false
}*/