pkg/controllers/member/internalmembercluster/v1beta1/controller_v1beta1.go (171 lines of code) (raw):
/*
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.
*/
package v1beta1
import (
"context"
"fmt"
"time"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1"
fleetnetv1alpha1 "go.goms.io/fleet-networking/api/v1alpha1"
"go.goms.io/fleet-networking/pkg/common/apiretry"
)
const (
// conditionReasonJoined is the Reason set on the AgentJoined condition while the cluster state is Join.
conditionReasonJoined = "AgentJoined"
// conditionReasonLeft is the Reason set on the AgentJoined condition once the cluster state is no longer Join.
conditionReasonLeft = "AgentLeft"
// jitterPercent is the total width of the jitter window, expressed as a percentage of the
// heartbeat interval. Reconcile centers the window on the interval, so the requeue delay
// deviates from the nominal interval by roughly +-5%.
jitterPercent = 10
)
// Reconciler reconciles an InternalMemberCluster object.
type Reconciler struct {
// MemberClient is used to list, delete and get the MultiClusterService and ServiceExport
// objects that are cleaned up when the cluster leaves.
MemberClient client.Client
// HubClient is used to read the InternalMemberCluster object and to update its status.
HubClient client.Client
// AgentType selects which agent status entry this reconciler reports on, and which
// cleanup routine runs when the cluster state is Leave.
AgentType clusterv1beta1.AgentType
}
//+kubebuilder:rbac:groups=cluster.kubernetes-fleet.io,resources=internalmemberclusters,verbs=get;list;watch
//+kubebuilder:rbac:groups=cluster.kubernetes-fleet.io,resources=internalmemberclusters/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=networking.fleet.azure.com,resources=multiclusterservices,verbs=get;list;delete
//+kubebuilder:rbac:groups=networking.fleet.azure.com,resources=serviceexports,verbs=get;list;delete
// Reconcile handles join/leave for the member cluster controllers and updates its heartbeats.
// For the MCS controller, it deletes the MultiClusterService objects created in the member cluster.
// For the ServiceExportImport controllers, it deletes the ServiceExport objects created in the member cluster.
//
// When the cluster state is Join, the returned result asks to be requeued after the heartbeat
// period plus a small random jitter. When the state is Leave, cleanup runs first and the agent
// status is updated afterwards without scheduling a requeue. A missing InternalMemberCluster
// object is not treated as an error.
func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	imcKRef := klog.KRef(req.Namespace, req.Name)
	startTime := time.Now()
	klog.V(2).InfoS("Reconciliation starts", "internalMemberCluster", imcKRef)
	defer func() {
		latency := time.Since(startTime).Milliseconds()
		klog.V(2).InfoS("Reconciliation ends", "internalMemberCluster", imcKRef, "latency", latency)
	}()

	var imc clusterv1beta1.InternalMemberCluster
	if err := r.HubClient.Get(ctx, req.NamespacedName, &imc); err != nil {
		if apierrors.IsNotFound(err) {
			klog.V(4).InfoS("internal member cluster object is not found", "internalMemberCluster", imcKRef)
			return ctrl.Result{}, nil
		}
		klog.ErrorS(err, "Failed to get internal member cluster object", "internalMemberCluster", imcKRef)
		return ctrl.Result{}, err
	}

	switch imc.Spec.State {
	case clusterv1beta1.ClusterStateLeave:
		// The member cluster is leaving the fleet.
		klog.V(2).InfoS("member cluster has left the fleet; performing cleanup", "internalMemberCluster", imcKRef)

		// Clean up fleet networking related resources.
		if r.AgentType == clusterv1beta1.MultiClusterServiceAgent {
			if err := r.cleanupMCSRelatedResources(ctx); err != nil {
				return ctrl.Result{}, err
			}
		}
		if r.AgentType == clusterv1beta1.ServiceExportImportAgent {
			if err := r.cleanupServiceExportRelatedResources(ctx); err != nil {
				return ctrl.Result{}, err
			}
		}

		// Update the agent status.
		return ctrl.Result{}, r.updateAgentStatus(ctx, &imc)
	case clusterv1beta1.ClusterStateJoin:
		// The member cluster still has an active membership in the fleet; update the agent status.
		if err := r.updateAgentStatus(ctx, &imc); err != nil {
			return ctrl.Result{}, err
		}

		// Add jitter to the heartbeat report interval, so as to mitigate the thundering herd problem.
		// The arithmetic is done in int64 so the intermediate products cannot overflow a narrower
		// field type.
		hbIntervalMs := int64(imc.Spec.HeartbeatPeriodSeconds) * 1000
		jitterRangeMs := hbIntervalMs * jitterPercent / 100
		requeueAfterMs := hbIntervalMs
		// rand.Int63nRange panics on an empty range, which would happen with a zero (or negative)
		// heartbeat period; skip the jitter in that case rather than crash the agent.
		if jitterRangeMs > 0 {
			requeueAfterMs += rand.Int63nRange(0, jitterRangeMs) - jitterRangeMs/2
		}
		return ctrl.Result{RequeueAfter: time.Duration(requeueAfterMs) * time.Millisecond}, nil
	default:
		// klog.ErrorS takes (err, msg, key/value pairs...); the message must come before the pairs.
		klog.ErrorS(fmt.Errorf("cluster is of an invalid state"), "Failed to reconcile internal member cluster",
			"internalMemberCluster", imcKRef, "clusterState", imc.Spec.State)
	}
	return ctrl.Result{}, nil
}
// updateAgentStatus publishes this agent's status on the given internal member cluster object.
//
// The AgentJoined condition is set to True while the cluster state is Join, and to False
// otherwise. The last received heartbeat timestamp is refreshed only in the Join case; no more
// heartbeats are reported once the member cluster has left. A conflict on the status update is
// logged and swallowed (nil is returned); any other update error is logged and returned.
func (r *Reconciler) updateAgentStatus(ctx context.Context, imc *clusterv1beta1.InternalMemberCluster) error {
	imcRef := klog.KObj(imc)
	klog.V(2).InfoS("Updating internal member cluster status", "internalMemberCluster", imcRef, "agentType", r.AgentType)

	status := imc.GetAgentStatus(r.AgentType)
	joined := imc.Spec.State == clusterv1beta1.ClusterStateJoin

	// Start from the "left" condition and flip it when the membership is still active.
	cond := metav1.Condition{
		Type:               string(clusterv1beta1.AgentJoined),
		Status:             metav1.ConditionFalse,
		Reason:             conditionReasonLeft,
		ObservedGeneration: imc.GetGeneration(),
	}
	if joined {
		cond.Status = metav1.ConditionTrue
		cond.Reason = conditionReasonJoined
	}
	meta.SetStatusCondition(&status.Conditions, cond)

	if joined {
		// Update the last received heartbeat value.
		status.LastReceivedHeartbeat = metav1.NewTime(time.Now())
	}

	err := r.HubClient.Status().Update(ctx, imc)
	switch {
	case err == nil:
		return nil
	case apierrors.IsConflict(err):
		klog.V(2).InfoS("Failed to update internal member cluster status due to conflicts", "internalMemberCluster", imcRef)
		return nil
	default:
		klog.ErrorS(err, "Failed to update internal member cluster status", "internalMemberCluster", imcRef)
		return err
	}
}
// cleanupMCSRelatedResources deletes the MultiClusterService objects listed through the member client.
// Ideally the controllers themselves would be stopped; for now this is a best-effort sweep over
// the objects that exist at list time, and objects created afterwards are not handled.
//
// It works in two passes: the first requests deletion of every listed object that is not already
// being deleted, the second waits (via apiretry.WaitUntilObjectDeleted) on every listed object.
// The first error encountered is logged and returned.
func (r *Reconciler) cleanupMCSRelatedResources(ctx context.Context) error {
	mcsList := &fleetnetv1alpha1.MultiClusterServiceList{}
	if err := r.MemberClient.List(ctx, mcsList); err != nil {
		klog.ErrorS(err, "Failed to list MCS")
		return err
	}

	// Pass 1: request deletion, skipping objects that already carry a deletion timestamp.
	for idx := range mcsList.Items {
		target := &mcsList.Items[idx]
		if target.ObjectMeta.DeletionTimestamp != nil {
			continue
		}
		err := apiretry.Do(func() error {
			return r.MemberClient.Delete(ctx, target)
		})
		if err == nil || apierrors.IsNotFound(err) {
			// Deleted, or already gone; either way there is nothing more to do here.
			continue
		}
		klog.ErrorS(err, "Failed to delete MCS", "multiClusterService", klog.KObj(target))
		return err
	}

	// Pass 2: wait on every listed object, including those that were already terminating.
	for idx := range mcsList.Items {
		listed := &mcsList.Items[idx]
		key := types.NamespacedName{Namespace: listed.GetNamespace(), Name: listed.GetName()}
		var fetched fleetnetv1alpha1.MultiClusterService
		waitErr := apiretry.WaitUntilObjectDeleted(ctx, func() error {
			return r.MemberClient.Get(ctx, key, &fetched)
		})
		if waitErr != nil {
			klog.ErrorS(waitErr, "The MCS has not been deleted in time", "multiClusterService", key)
			return waitErr
		}
	}

	klog.V(2).InfoS("Cleanup of MCS related resources has been completed", "objectCounter", len(mcsList.Items))
	return nil
}
// cleanupServiceExportRelatedResources deletes the ServiceExport objects listed through the member client.
// Ideally the controllers themselves would be stopped; for now this is a best-effort sweep over
// the objects that exist at list time, and objects created afterwards are not handled.
//
// It works in two passes: the first requests deletion of every listed object that is not already
// being deleted, the second waits (via apiretry.WaitUntilObjectDeleted) on every listed object.
// The first error encountered is logged and returned.
func (r *Reconciler) cleanupServiceExportRelatedResources(ctx context.Context) error {
	exportList := &fleetnetv1alpha1.ServiceExportList{}
	if err := r.MemberClient.List(ctx, exportList); err != nil {
		klog.ErrorS(err, "Failed to list service export")
		return err
	}

	// Pass 1: request deletion, skipping objects that already carry a deletion timestamp.
	for idx := range exportList.Items {
		target := &exportList.Items[idx]
		if target.ObjectMeta.DeletionTimestamp != nil {
			continue
		}
		err := apiretry.Do(func() error {
			return r.MemberClient.Delete(ctx, target)
		})
		if err == nil || apierrors.IsNotFound(err) {
			// Deleted, or already gone; either way there is nothing more to do here.
			continue
		}
		klog.ErrorS(err, "Failed to delete service export", "serviceExport", klog.KObj(target))
		return err
	}

	// Pass 2: wait on every listed object, including those that were already terminating.
	for idx := range exportList.Items {
		listed := &exportList.Items[idx]
		key := types.NamespacedName{Namespace: listed.GetNamespace(), Name: listed.GetName()}
		var fetched fleetnetv1alpha1.ServiceExport
		waitErr := apiretry.WaitUntilObjectDeleted(ctx, func() error {
			return r.MemberClient.Get(ctx, key, &fetched)
		})
		if waitErr != nil {
			klog.ErrorS(waitErr, "The service export has not been deleted in time", "serviceExport", key)
			return waitErr
		}
	}

	klog.V(2).InfoS("Cleanup of service export related resources has been completed", "objectCounter", len(exportList.Items))
	return nil
}
// SetupWithManager registers this reconciler with the given manager as the controller for
// InternalMemberCluster objects and returns any error reported while completing the builder.
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
	bldr := ctrl.NewControllerManagedBy(mgr)
	bldr = bldr.For(&clusterv1beta1.InternalMemberCluster{})
	return bldr.Complete(r)
}