pkg/controllers/hub/trafficmanagerprofile/controller.go (324 lines of code) (raw):
/*
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.
*/
// Package trafficmanagerprofile features the TrafficManagerProfile controller to reconcile TrafficManagerProfile CRs.
package trafficmanagerprofile
import (
"context"
"errors"
"fmt"
"strconv"
"time"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/trafficmanager/armtrafficmanager"
"github.com/prometheus/client_golang/prometheus"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"go.goms.io/fleet/pkg/utils/controller"
fleetnetv1beta1 "go.goms.io/fleet-networking/api/v1beta1"
"go.goms.io/fleet-networking/pkg/common/azureerrors"
"go.goms.io/fleet-networking/pkg/common/defaulter"
"go.goms.io/fleet-networking/pkg/common/metrics"
"go.goms.io/fleet-networking/pkg/common/objectmeta"
)
func init() {
// Register the custom metrics
prometheus.MustRegister(trafficManagerProfileStatusLastTimestampSeconds)
}
const (
// DNSRelativeNameFormat consists of "Profile-Namespace" and "Profile-Name".
DNSRelativeNameFormat = "%s-%s"
// AzureResourceProfileNameFormat is the name format of the Azure Traffic Manager Profile created by the fleet controller.
AzureResourceProfileNameFormat = "fleet-%s"
// DefaultDNSTTL is in seconds. This informs the local DNS resolvers and DNS clients how long to cache DNS responses
// provided by this Traffic Manager profile.
// Defaults to 60 which is the same as the portal's default config.
DefaultDNSTTL = int64(60)
)
var (
// create the func as a variable so that the integration test can use a customized function.
generateAzureTrafficManagerProfileNameFunc = func(profile *fleetnetv1beta1.TrafficManagerProfile) string {
return GenerateAzureTrafficManagerProfileName(profile)
}
// trafficManagerProfileStatusLastTimestampSeconds is a prometheus metric that holds the last update timestamp of
// traffic manager profile status in seconds.
trafficManagerProfileStatusLastTimestampSeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metrics.MetricsNamespace,
Subsystem: metrics.MetricsSubsystem,
Name: "traffic_manager_profile_status_last_timestamp_seconds",
Help: "Last update timestamp of traffic manager profile status in seconds",
}, []string{"namespace", "name", "generation", "condition", "status", "reason"})
)
// GenerateAzureTrafficManagerProfileName generates the Azure Traffic Manager profile name based on the profile.
func GenerateAzureTrafficManagerProfileName(profile *fleetnetv1beta1.TrafficManagerProfile) string {
return fmt.Sprintf(AzureResourceProfileNameFormat, profile.UID)
}
// Reconciler reconciles a TrafficManagerProfile object.
type Reconciler struct {
client.Client
ProfilesClient *armtrafficmanager.ProfilesClient
}
//+kubebuilder:rbac:groups=networking.fleet.azure.com,resources=trafficmanagerprofiles,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=networking.fleet.azure.com,resources=trafficmanagerprofiles/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=networking.fleet.azure.com,resources=trafficmanagerprofiles/finalizers,verbs=get;update
//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch
// Reconcile triggers a single reconcile round.
func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
name := req.NamespacedName
profileKRef := klog.KRef(name.Namespace, name.Name)
startTime := time.Now()
klog.V(2).InfoS("Reconciliation starts", "trafficManagerProfile", profileKRef)
defer func() {
latency := time.Since(startTime).Milliseconds()
klog.V(2).InfoS("Reconciliation ends", "trafficManagerProfile", profileKRef, "latency", latency)
}()
profile := &fleetnetv1beta1.TrafficManagerProfile{}
if err := r.Client.Get(ctx, name, profile); err != nil {
if apierrors.IsNotFound(err) {
klog.V(2).InfoS("Ignoring NotFound trafficManagerProfile", "trafficManagerProfile", profileKRef)
return ctrl.Result{}, nil
}
klog.ErrorS(err, "Failed to get trafficManagerProfile", "trafficManagerProfile", profileKRef)
return ctrl.Result{}, controller.NewAPIServerError(true, err)
}
if !profile.ObjectMeta.DeletionTimestamp.IsZero() {
// TODO: handle the deletion when backends are still attached to the profile
return r.handleDelete(ctx, profile)
}
// register metrics finalizer
if !controllerutil.ContainsFinalizer(profile, objectmeta.MetricsFinalizer) {
controllerutil.AddFinalizer(profile, objectmeta.MetricsFinalizer)
if err := r.Update(ctx, profile); err != nil {
klog.ErrorS(err, "Failed to add trafficManagerProfile metrics finalizer", "trafficManagerProfile", profileKRef)
return ctrl.Result{}, err
}
}
defer emitTrafficManagerProfileStatusMetric(profile)
// TODO: replace the following with defaulter wehbook
defaulter.SetDefaultsTrafficManagerProfile(profile)
return r.handleUpdate(ctx, profile)
}
func (r *Reconciler) handleDelete(ctx context.Context, profile *fleetnetv1beta1.TrafficManagerProfile) (ctrl.Result, error) {
profileKObj := klog.KObj(profile)
needUpdate := false
// The profile is being deleted
if controllerutil.ContainsFinalizer(profile, objectmeta.MetricsFinalizer) {
klog.V(2).InfoS("TrafficManagerProfile is being deleted and cleaning up its metrics", "trafficManagerProfile", profileKObj)
// The controller registers profile finalizer only before creating atm profile to avoid the deletion stuck for the 403 error.
// We use a separate finalizer to clean up the metrics for the profile.
trafficManagerProfileStatusLastTimestampSeconds.DeletePartialMatch(prometheus.Labels{"namespace": profile.GetNamespace(), "name": profile.GetName()})
controllerutil.RemoveFinalizer(profile, objectmeta.MetricsFinalizer)
needUpdate = true
}
if controllerutil.ContainsFinalizer(profile, objectmeta.TrafficManagerProfileFinalizer) {
atmProfileName := generateAzureTrafficManagerProfileNameFunc(profile)
klog.V(2).InfoS("Deleting Azure Traffic Manager profile", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
if _, err := r.ProfilesClient.Delete(ctx, profile.Spec.ResourceGroup, atmProfileName, nil); err != nil {
if !azureerrors.IsNotFound(err) {
klog.ErrorS(err, "Failed to delete Azure Traffic Manager profile", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
return ctrl.Result{}, err
}
}
klog.V(2).InfoS("Deleted Azure Traffic Manager profile", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
controllerutil.RemoveFinalizer(profile, objectmeta.TrafficManagerProfileFinalizer)
needUpdate = true
}
if !needUpdate {
klog.V(2).InfoS("No need to remove finalizer", "trafficManagerProfile", profileKObj)
return ctrl.Result{}, nil
}
if err := r.Client.Update(ctx, profile); err != nil {
klog.ErrorS(err, "Failed to remove trafficManagerProfile finalizers", "trafficManagerProfile", profileKObj)
return ctrl.Result{}, controller.NewUpdateIgnoreConflictError(err)
}
klog.V(2).InfoS("Removed trafficManagerProfile finalizers", "trafficManagerProfile", profileKObj)
return ctrl.Result{}, nil
}
func (r *Reconciler) handleUpdate(ctx context.Context, profile *fleetnetv1beta1.TrafficManagerProfile) (ctrl.Result, error) {
profileKObj := klog.KObj(profile)
atmProfileName := generateAzureTrafficManagerProfileNameFunc(profile)
desiredATMProfile := generateAzureTrafficManagerProfile(profile)
var responseError *azcore.ResponseError
getRes, getErr := r.ProfilesClient.Get(ctx, profile.Spec.ResourceGroup, atmProfileName, nil)
if getErr != nil {
if !azureerrors.IsNotFound(getErr) {
klog.ErrorS(getErr, "Failed to get the profile", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
// If a user specifies an invalid resource group or the agent does not have the permission to access the resource,
// Return invalid profile
if azureerrors.IsForbidden(getErr) {
return r.updateProfileStatus(ctx, profile, nil, getErr)
}
return ctrl.Result{}, getErr
}
klog.V(2).InfoS("Azure Traffic Manager profile does not exist", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
} else {
if equalAzureTrafficManagerProfile(getRes.Profile, desiredATMProfile) {
// skip creating or updating the profile
klog.V(2).InfoS("No profile update needed", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
return r.updateProfileStatus(ctx, profile, &getRes.Profile, nil)
}
// build the desired profile based on the current profile and reset any managed fields' value
desiredATMProfile = buildAzureTrafficManagerProfileRequest(getRes.Profile, desiredATMProfile)
}
// register finalizer only before creating atm profile
// So that when a user specifies an invalid resource group, the controller will fail to create the profile because of the 403 error.
// Otherwise, the deletion will be stuck because of the 403 error and the finalizer cannot be removed.
if !controllerutil.ContainsFinalizer(profile, objectmeta.TrafficManagerProfileFinalizer) {
controllerutil.AddFinalizer(profile, objectmeta.TrafficManagerProfileFinalizer)
if err := r.Update(ctx, profile); err != nil {
klog.ErrorS(err, "Failed to add finalizer to trafficManagerProfile", "trafficManagerProfile", profileKObj)
return ctrl.Result{}, controller.NewUpdateIgnoreConflictError(err)
}
}
res, updateErr := r.ProfilesClient.CreateOrUpdate(ctx, profile.Spec.ResourceGroup, atmProfileName, desiredATMProfile, nil)
if updateErr != nil {
if !errors.As(updateErr, &responseError) {
klog.ErrorS(updateErr, "Failed to send the createOrUpdate request", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
return ctrl.Result{}, updateErr
}
klog.ErrorS(updateErr, "Failed to create or update a profile", "trafficManagerProfile", profileKObj,
"atmProfileName", atmProfileName,
"errorCode", responseError.ErrorCode, "statusCode", responseError.StatusCode)
}
klog.V(2).InfoS("Created or updated Azure Traffic Manager Profile", "trafficManagerProfile", profileKObj, "atmProfileName", atmProfileName)
return r.updateProfileStatus(ctx, profile, &res.Profile, updateErr)
}
// equalAzureTrafficManagerProfile compares only few fields of the current and desired Azure Traffic Manager profiles
// by ignoring others.
// The desired profile is built by the controllers and all the required fields should not be nil.
func equalAzureTrafficManagerProfile(current, desired armtrafficmanager.Profile) bool {
// location and dnsConfig (excluding TTL) is immutable
if current.Properties == nil || current.Properties.MonitorConfig == nil || current.Properties.ProfileStatus == nil || current.Properties.TrafficRoutingMethod == nil || current.Properties.DNSConfig == nil {
return false
}
if current.Properties.MonitorConfig.IntervalInSeconds == nil || current.Properties.MonitorConfig.Path == nil ||
current.Properties.MonitorConfig.Port == nil || current.Properties.MonitorConfig.Protocol == nil ||
current.Properties.MonitorConfig.TimeoutInSeconds == nil || current.Properties.MonitorConfig.ToleratedNumberOfFailures == nil {
return false
}
if *current.Properties.MonitorConfig.IntervalInSeconds != *desired.Properties.MonitorConfig.IntervalInSeconds ||
*current.Properties.MonitorConfig.Path != *desired.Properties.MonitorConfig.Path ||
*current.Properties.MonitorConfig.Port != *desired.Properties.MonitorConfig.Port ||
*current.Properties.MonitorConfig.Protocol != *desired.Properties.MonitorConfig.Protocol ||
*current.Properties.MonitorConfig.TimeoutInSeconds != *desired.Properties.MonitorConfig.TimeoutInSeconds ||
*current.Properties.MonitorConfig.ToleratedNumberOfFailures != *desired.Properties.MonitorConfig.ToleratedNumberOfFailures {
return false
}
if *current.Properties.ProfileStatus != *desired.Properties.ProfileStatus || *current.Properties.TrafficRoutingMethod != *desired.Properties.TrafficRoutingMethod {
return false
}
if current.Properties.DNSConfig.TTL == nil || *current.Properties.DNSConfig.TTL != *desired.Properties.DNSConfig.TTL {
return false
}
if current.Tags == nil {
return false
}
for key, value := range desired.Tags {
currentValue := current.Tags[key]
if (value == nil && currentValue != nil) || (value != nil && currentValue == nil) || (currentValue == nil || *currentValue != *value) {
return false
}
}
return true
}
func (r *Reconciler) updateProfileStatus(ctx context.Context, profile *fleetnetv1beta1.TrafficManagerProfile, atmProfile *armtrafficmanager.Profile, armErr error) (ctrl.Result, error) {
profileKObj := klog.KObj(profile)
if armErr == nil {
// atmProfile.Properties.DNSConfig.Fqdn should not be nil
if atmProfile.Properties != nil && atmProfile.Properties.DNSConfig != nil {
profile.Status.DNSName = atmProfile.Properties.DNSConfig.Fqdn
} else {
err := fmt.Errorf("got nil DNSConfig for Azure Traffic Manager profile")
klog.ErrorS(controller.NewUnexpectedBehaviorError(err), "Unexpected value returned by the Azure Traffic Manager", "trafficManagerProfile", profileKObj, "resourceGroup", profile.Spec.ResourceGroup, "atmProfileName", atmProfile.Name)
profile.Status.DNSName = nil // reset the DNS name
}
if atmProfile.ID != nil {
profile.Status.ResourceID = *atmProfile.ID
} else {
profile.Status.ResourceID = "" // reset the resource ID
err := controller.NewUnexpectedBehaviorError(fmt.Errorf("got nil ID for Azure Traffic Manager profile"))
klog.ErrorS(err, "Unexpected value returned by the Azure Traffic Manager", "trafficManagerProfile", profileKObj, "resourceGroup", profile.Spec.ResourceGroup, "atmProfileName", atmProfile.Name)
}
} else {
profile.Status.DNSName = nil // reset the DNS name
profile.Status.ResourceID = "" // reset the resource ID
}
cond := metav1.Condition{
Type: string(fleetnetv1beta1.TrafficManagerProfileConditionProgrammed),
Status: metav1.ConditionTrue,
ObservedGeneration: profile.Generation,
Reason: string(fleetnetv1beta1.TrafficManagerProfileReasonProgrammed),
Message: "Successfully configured the Azure Traffic Manager profile",
}
if azureerrors.IsConflict(armErr) {
cond = metav1.Condition{
Type: string(fleetnetv1beta1.TrafficManagerProfileConditionProgrammed),
Status: metav1.ConditionFalse,
ObservedGeneration: profile.Generation,
Reason: string(fleetnetv1beta1.TrafficManagerProfileReasonDNSNameNotAvailable),
Message: "Domain name is not available. Please choose a different profile name or namespace",
}
} else if azureerrors.IsClientError(armErr) && !azureerrors.IsThrottled(armErr) {
cond = metav1.Condition{
Type: string(fleetnetv1beta1.TrafficManagerProfileConditionProgrammed),
Status: metav1.ConditionFalse,
ObservedGeneration: profile.Generation,
Reason: string(fleetnetv1beta1.TrafficManagerProfileReasonInvalid),
Message: fmt.Sprintf("Invalid profile: %v", armErr),
}
} else if armErr != nil {
cond = metav1.Condition{
Type: string(fleetnetv1beta1.TrafficManagerProfileConditionProgrammed),
Status: metav1.ConditionUnknown,
ObservedGeneration: profile.Generation,
Reason: string(fleetnetv1beta1.TrafficManagerProfileReasonPending),
Message: fmt.Sprintf("Failed to configure profile and retrying: %v", armErr),
}
}
meta.SetStatusCondition(&profile.Status.Conditions, cond)
if err := r.Client.Status().Update(ctx, profile); err != nil {
klog.ErrorS(err, "Failed to update trafficManagerProfile status", "trafficManagerProfile", profileKObj)
return ctrl.Result{}, controller.NewUpdateIgnoreConflictError(err)
}
klog.V(2).InfoS("Updated the trafficProfile status", "trafficManagerProfile", profileKObj, "status", profile.Status)
return ctrl.Result{}, armErr // return the error to retry the reconciliation
}
func generateAzureTrafficManagerProfile(profile *fleetnetv1beta1.TrafficManagerProfile) armtrafficmanager.Profile {
mc := profile.Spec.MonitorConfig
namespacedName := types.NamespacedName{Name: profile.Name, Namespace: profile.Namespace}
return armtrafficmanager.Profile{
Location: ptr.To("global"),
Properties: &armtrafficmanager.ProfileProperties{
DNSConfig: &armtrafficmanager.DNSConfig{
RelativeName: ptr.To(fmt.Sprintf(DNSRelativeNameFormat, profile.Namespace, profile.Name)),
TTL: ptr.To(DefaultDNSTTL), // no default value on the server side, using 60s same as portal's default config
},
MonitorConfig: &armtrafficmanager.MonitorConfig{
IntervalInSeconds: mc.IntervalInSeconds,
Path: mc.Path,
Port: mc.Port,
Protocol: ptr.To(armtrafficmanager.MonitorProtocol(*mc.Protocol)),
TimeoutInSeconds: mc.TimeoutInSeconds,
ToleratedNumberOfFailures: mc.ToleratedNumberOfFailures,
},
ProfileStatus: ptr.To(armtrafficmanager.ProfileStatusEnabled),
// By default, the routing method is set to Weighted.
TrafficRoutingMethod: ptr.To(armtrafficmanager.TrafficRoutingMethodWeighted),
},
Tags: map[string]*string{
objectmeta.AzureTrafficManagerProfileTagKey: ptr.To(namespacedName.String()),
},
}
}
// buildAzureTrafficManagerProfileRequest assumes desired is always valid.
func buildAzureTrafficManagerProfileRequest(current, desired armtrafficmanager.Profile) armtrafficmanager.Profile {
current.Location = desired.Location // reset the location fields
if current.Properties == nil {
current.Properties = desired.Properties
} else {
current.Properties.DNSConfig = desired.Properties.DNSConfig // reset the dns config
if current.Properties.MonitorConfig == nil {
current.Properties.MonitorConfig = desired.Properties.MonitorConfig
} else {
// reset the monitor config fields
current.Properties.MonitorConfig.IntervalInSeconds = desired.Properties.MonitorConfig.IntervalInSeconds
current.Properties.MonitorConfig.Path = desired.Properties.MonitorConfig.Path
current.Properties.MonitorConfig.Port = desired.Properties.MonitorConfig.Port
current.Properties.MonitorConfig.Protocol = desired.Properties.MonitorConfig.Protocol
current.Properties.MonitorConfig.TimeoutInSeconds = desired.Properties.MonitorConfig.TimeoutInSeconds
current.Properties.MonitorConfig.ToleratedNumberOfFailures = desired.Properties.MonitorConfig.ToleratedNumberOfFailures
}
current.Properties.ProfileStatus = desired.Properties.ProfileStatus
current.Properties.TrafficRoutingMethod = desired.Properties.TrafficRoutingMethod
}
if current.Tags == nil {
current.Tags = desired.Tags
} else {
for key, value := range desired.Tags {
current.Tags[key] = value
}
}
return current
}
// emitTrafficManagerProfileStatusMetric emits the traffic manager profile status metric based on status conditions.
func emitTrafficManagerProfileStatusMetric(profile *fleetnetv1beta1.TrafficManagerProfile) {
generation := profile.Generation
genStr := strconv.FormatInt(generation, 10)
cond := meta.FindStatusCondition(profile.Status.Conditions, string(fleetnetv1beta1.TrafficManagerProfileConditionProgrammed))
if cond != nil && cond.ObservedGeneration == generation {
trafficManagerProfileStatusLastTimestampSeconds.WithLabelValues(profile.GetNamespace(), profile.GetName(), genStr,
string(fleetnetv1beta1.TrafficManagerProfileConditionProgrammed), string(cond.Status), cond.Reason).SetToCurrentTime()
return
}
// We should rarely reach here, it can only happen when updating status fails.
klog.V(2).InfoS("There's no programmed status condition on trafficManagerProfile, status updating failed possibly", "trafficManagerProfile", klog.KObj(profile))
}
// SetupWithManager sets up the controller with the Manager.
func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&fleetnetv1beta1.TrafficManagerProfile{}).
Complete(r)
}