pkg/providers/imagefamily/bootstrap/aksbootstrap.go (330 lines of code) (raw):

/* Portions Copyright (c) Microsoft Corporation. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package bootstrap import ( "bytes" "encoding/base64" "fmt" "strings" "github.com/blang/semver/v4" "github.com/samber/lo" v1 "k8s.io/api/core/v1" "github.com/Azure/karpenter-provider-azure/pkg/utils" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) type AKS struct { Options Arch string TenantID string SubscriptionID string KubeletIdentityClientID string Location string ResourceGroup string ClusterID string APIServerName string KubeletClientTLSBootstrapToken string NetworkPlugin string NetworkPolicy string KubernetesVersion string } var _ Bootstrapper = (*AKS)(nil) // assert AKS implements Bootstrapper func (a AKS) Script() (string, error) { bootstrapScript, err := a.aksBootstrapScript() if err != nil { return "", fmt.Errorf("error getting AKS bootstrap script: %w", err) } return base64.StdEncoding.EncodeToString([]byte(bootstrapScript)), nil } // Config item types classified by code: // // - : known unnecessary or unused - (empty) value set in code, until dropped from template // n : not (yet?) supported, set to empty or something reasonable in code // s : static/constant (or very slow changing), value set in code; // also the choice for something that does not have to be exposed for customization yet // // a : known argument/parameter, passed in (usually from environment) // x : unique per cluster, extracted or specified. (Candidates for exposure/accessibility via API) // X : unique per nodepool, extracted or specified. (Candidates for exposure/accessibility via API) // c : user input, Options (provider-specific), e.g., could be from environment variables // p : user input, part of standard Nodepool CR spec. Example: custom labels, kubelet config // t : user input, AKSNodeClass (potentially per node) // k : computed (at runtime) by Karpenter (e.g. based on VM SKU, extra labels, etc.) // (xk - computed from per cluster data, such as cluster id) // // ? : needs more investigation // // multiple codes: combined from several sources // Config sources for types: // // Hardcoded (this file) : unused (-), static (s) and unsupported (n), as well as selected defaults (s) // Computed at runtime : computed (k) // Options (provider-specific) : cluster-level user input (c) - ALL DEFAULTED FOR NOW // : as well as unique per cluster (x) - until we have a better place for these // (TBD) : unique per nodepool. extracted or specified (X) // AKSNodeClass : user input that could be per-node (t) - ALL DEFAULTED FOR NOW // Nodepool spec : selected nodepool-level user input (p) // NodeBootstrapVariables carries all variables needed to bootstrap a node // It is used as input rendering the bootstrap script Go template (retrieved from getCustomDataTemplate) type NodeBootstrapVariables struct { IsAKSCustomCloud bool // n (false) InitAKSCustomCloudFilepath string // n (static) AKSCustomCloudRepoDepotEndpoint string // n derived from custom cloud env? AdminUsername string // t typically azureuser but can be user input MobyVersion string // - unnecessary TenantID string // p environment derived, unnecessary? KubernetesVersion string // ? cluster/node pool specific, derived from user input HyperkubeURL string // - should be unnecessary KubeBinaryURL string // - necessary only for non-cached versions / static-ish CredentialProviderDownloadURL string // - necessary only for non-cached versions / static-ish CustomKubeBinaryURL string // - unnecessary KubeproxyURL string // - should be unnecessary or bug APIServerPublicKey string // - unique per cluster, actually not sure best way to extract? [should not be needed on agent nodes] SubscriptionID string // a can be derived from environment/imds ResourceGroup string // a can be derived from environment/imds Location string // a can be derived from environment/imds VMType string // xd derived from cluster but unnecessary (?) only used by CCM [will default to "vmss" for now] Subnet string // xd derived from cluster but unnecessary (?) only used by CCM [will default to "aks-subnet for now] NetworkSecurityGroup string // xk derived from cluster but unnecessary (?) only used by CCM [= "aks-agentpool-<clusterid>-nsg" for now] VirtualNetwork string // xk derived from cluster but unnecessary (?) only used by CCM [= "aks-vnet-<clusterid>" for now] VirtualNetworkResourceGroup string // xd derived from cluster but unnecessary (?) only used by CCM [default to empty, looks like unused] RouteTable string // xk derived from cluster but unnecessary (?) only used by CCM [= "aks-agentpool-<clusterid>-routetable" for now] PrimaryAvailabilitySet string // - derived from cluster but unnecessary (?) only used by CCM PrimaryScaleSet string // - derived from cluster but unnecessary (?) only used by CCM ServicePrincipalClientID string // ad user input NetworkPlugin string // x user input (? actually derived from cluster, right?) NetworkPolicy string // x user input / unique per cluster. user-specified. VNETCNILinuxPluginsURL string // - unnecessary [actually, currently required] CNIPluginsURL string // - unnecessary [actually, currently required] CloudProviderBackoff bool // s BEGIN CLOUD CONFIG for azure stuff, static/derived from user inputs CloudProviderBackoffMode string // s [static until has to be exposed; could propagate Karpenter RL config, but won't] CloudProviderBackoffRetries string // s CloudProviderBackoffExponent string // s CloudProviderBackoffDuration string // s CloudProviderBackoffJitter string // s CloudProviderRatelimit bool // s CloudProviderRatelimitQPS string // s CloudProviderRatelimitQPSWrite string // s CloudProviderRatelimitBucket string // s CloudProviderRatelimitBucketWrite string // s LoadBalancerDisableOutboundSNAT bool // xd [= false for now] UseManagedIdentityExtension bool // s [always true, as long as we only support managed identity] UseInstanceMetadata bool // s [always true?] LoadBalancerSKU string // xd [= "Standard" for now] ExcludeMasterFromStandardLB bool // s [always true?] MaximumLoadbalancerRuleCount int // xd END CLOUD CONFIG [will default to 250 for now] ContainerRuntime string // s always containerd CLITool string // s static/unnecessary ContainerdDownloadURLBase string // - unnecessary NetworkMode string // c user input UserAssignedIdentityID string // a user input APIServerName string // x unique per cluster IsVHD bool // s static-ish GPUNode bool // k derived from VM size SGXNode bool // - unused MIGNode bool // t user input ConfigGPUDriverIfNeeded bool // s depends on hardware, unnecessary for oss, but aks provisions gpu drivers EnableGPUDevicePluginIfNeeded bool // - deprecated/preview only, don't do this for OSS TeleportdPluginDownloadURL string // - user input, don't do this for OSS ContainerdVersion string // - unused ContainerdPackageURL string // - only for testing RuncVersion string // - unused RuncPackageURL string // - testing only EnableHostsConfigAgent bool // n derived from private cluster user input...I think? DisableSSH bool // t user input NeedsContainerd bool // s static true TeleportEnabled bool // t user input ShouldConfigureHTTPProxy bool // c user input ShouldConfigureHTTPProxyCA bool // c user input [secret] HTTPProxyTrustedCA string // c user input [secret] ShouldConfigureCustomCATrust bool // c user input CustomCATrustConfigCerts []string // c user input [secret] IsKrustlet bool // t user input GPUNeedsFabricManager bool // v determined by GPU hardware type NeedsDockerLogin bool // t user input [still needed?] IPv6DualStackEnabled bool // t user input OutboundCommand string // s mostly static/can be EnableUnattendedUpgrades bool // c user input [presumably cluster level, correct?] EnsureNoDupePromiscuousBridge bool // k derived {{ and NeedsContainerd IsKubenet (not HasCalicoNetworkPolicy) }} [could be computed by template ...] ShouldConfigSwapFile bool // t user input ShouldConfigTransparentHugePage bool // t user input TargetCloud string // n derive from environment/user input TargetEnvironment string // n derive from environment/user input CustomEnvJSON string // n derive from environment/user input IsCustomCloud bool // n derive from environment/user input CSEHelpersFilepath string // s static CSEDistroHelpersFilepath string // s static CSEInstallFilepath string // s static CSEDistroInstallFilepath string // s static CSEConfigFilepath string // s static AzurePrivateRegistryServer string // c user input HasCustomSearchDomain bool // c user input CustomSearchDomainFilepath string // s static HTTPProxyURLs string // c user input [presumably cluster-level] HTTPSProxyURLs string // c user input [presumably cluster-level] NoProxyURLs string // c user input [presumably cluster-level] TLSBootstrappingEnabled bool // s static true SecureTLSBootstrappingEnabled bool // s static false DHCPv6ServiceFilepath string // k derived from user input [how?] DHCPv6ConfigFilepath string // k derived from user input [how?] THPEnabled string // c user input [presumably cluster-level][should be bool?] THPDefrag string // c user input [presumably cluster-level][should be bool?] ServicePrincipalFileContent string // s only required for RP cluster [static: msi?] KubeletClientContent string // - unnecessary [if using TLS bootstrapping] KubeletClientCertContent string // - unnecessary KubeletConfigFileEnabled bool // s can be static [should kubelet config be actually used/preferred instead of flags?] KubeletConfigFileContent string // s mix of user/static/RP-generated. SwapFileSizeMB int // t user input GPUImageSHA string // s static sha rarely updated GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator. GPUDriverType string // k GPUInstanceProfile string // t user-specified CustomSearchDomainName string // c user-specified [presumably cluster-level] CustomSearchRealmUser string // c user-specified [presumably cluster-level] CustomSearchRealmPassword string // c user-specified [presumably cluster-level] MessageOfTheDay string // t user-specified [presumably node-level] HasKubeletDiskType bool // t user-specified [presumably node-level] NeedsCgroupV2 bool // k can be automatically determined SysctlContent string // t user-specified TLSBootstrapToken string // X nodepool or node specific. can be created automatically KubeletFlags string // psX unique per nodepool. partially user-specified, static, and RP-generated KubeletNodeLabels string // pk node-pool specific. user-specified. AzureEnvironmentFilepath string // s can be made static [usually "/etc/kubernetes/azure.json", but my examples use ""?] KubeCACrt string // x unique per cluster ContainerdConfigContent string // k determined by GPU VM size, WASM support, Kata support IsKata bool // n user-specified } func (a AKS) aksBootstrapScript() (string, error) { // use these as the base / defaults nbv := getStaticNodeBootstrapVars() // apply overrides from passed in options a.applyOptions(nbv) containerdConfigTemplate, err := containerdConfigFromNodeBootstrapVars(nbv) if err != nil { return "", fmt.Errorf("error getting containerd config from node bootstrap variables: %w", err) } nbv.ContainerdConfigContent = base64.StdEncoding.EncodeToString([]byte(containerdConfigTemplate)) // generate script from template using the variables customData, err := getCustomDataFromNodeBootstrapVars(nbv) if err != nil { return "", fmt.Errorf("error getting custom data from node bootstrap variables: %w", err) } return customData, nil } // Download URL for KUBE_BINARY_URL publishes each k8s version in the URL. func kubeBinaryURL(kubernetesVersion, cpuArch string) string { return fmt.Sprintf("%s/kubernetes/v%s/binaries/kubernetes-node-linux-%s.tar.gz", globalAKSMirror, kubernetesVersion, cpuArch) } // CredentialProviderURL returns the URL for OOT credential provider, // or an empty string if OOT provider is not to be used func CredentialProviderURL(kubernetesVersion, arch string) string { minorVersion := semver.MustParse(kubernetesVersion).Minor if minorVersion < 30 { // use from 1.30; 1.29 supports it too, but we have not fully tested it with Karpenter return "" } // credential provider has its own release outside of k8s version, and there'll be one credential provider binary for each k8s release, // as credential provider release goes with cloud-provider-azure, not every credential provider release will be picked up unless // there are CVE or bug fixes. var credentialProviderVersion string switch minorVersion { case 29: credentialProviderVersion = "1.29.13" case 30: credentialProviderVersion = "1.30.10" case 31: credentialProviderVersion = "1.31.4" case 32: fallthrough // to default, which is same as latest default: credentialProviderVersion = "1.32.3" } return fmt.Sprintf("%s/cloud-provider-azure/v%s/binaries/azure-acr-credential-provider-linux-%s-v%s.tar.gz", globalAKSMirror, credentialProviderVersion, arch, credentialProviderVersion) } func (a AKS) applyOptions(nbv *NodeBootstrapVariables) { nbv.KubeCACrt = *a.CABundle nbv.APIServerName = a.APIServerName nbv.TLSBootstrapToken = a.KubeletClientTLSBootstrapToken nbv.TenantID = a.TenantID nbv.SubscriptionID = a.SubscriptionID nbv.Location = a.Location nbv.ResourceGroup = a.ResourceGroup nbv.UserAssignedIdentityID = a.KubeletIdentityClientID nbv.NetworkPlugin = a.NetworkPlugin nbv.NetworkPolicy = a.NetworkPolicy nbv.KubernetesVersion = a.KubernetesVersion nbv.KubeBinaryURL = kubeBinaryURL(a.KubernetesVersion, a.Arch) nbv.VNETCNILinuxPluginsURL = fmt.Sprintf("%s/azure-cni/v1.4.32/binaries/azure-vnet-cni-linux-%s-v1.4.32.tgz", globalAKSMirror, a.Arch) nbv.CNIPluginsURL = fmt.Sprintf("%s/cni-plugins/v1.1.1/binaries/cni-plugins-linux-%s-v1.1.1.tgz", globalAKSMirror, a.Arch) // calculated values nbv.NetworkSecurityGroup = fmt.Sprintf("aks-agentpool-%s-nsg", a.ClusterID) nbv.RouteTable = fmt.Sprintf("aks-agentpool-%s-routetable", a.ClusterID) if a.GPUNode { nbv.GPUNode = true nbv.ConfigGPUDriverIfNeeded = true nbv.GPUDriverVersion = a.GPUDriverVersion nbv.GPUDriverType = a.GPUDriverType nbv.GPUImageSHA = a.GPUImageSHA } // merge and stringify labels kubeletLabels := lo.Assign(getBaseKubeletNodeLabels(), a.Labels) getAgentbakerGeneratedLabels(a.ResourceGroup, kubeletLabels) subnetParts, _ := utils.GetVnetSubnetIDComponents(a.SubnetID) nbv.Subnet = subnetParts.SubnetName nbv.VirtualNetworkResourceGroup = subnetParts.ResourceGroupName nbv.VirtualNetwork = subnetParts.VNetName nbv.KubeletNodeLabels = strings.Join(lo.MapToSlice(kubeletLabels, func(k, v string) string { return fmt.Sprintf("%s=%s", k, v) }), ",") // Assign Per K8s version kubelet flags minorVersion := semver.MustParse(a.KubernetesVersion).Minor kubeletFlagsBase := getBaseKubeletFlags() if minorVersion < 31 { kubeletFlagsBase["--keep-terminated-pod-volumes"] = "false" } credentialProviderURL := CredentialProviderURL(a.KubernetesVersion, a.Arch) if credentialProviderURL != "" { // use OOT credential provider nbv.CredentialProviderDownloadURL = credentialProviderURL kubeletFlagsBase["--image-credential-provider-config"] = "/var/lib/kubelet/credential-provider-config.yaml" kubeletFlagsBase["--image-credential-provider-bin-dir"] = "/var/lib/kubelet/credential-provider" } else { // Versions Less than 1.30 // we can make this logic smarter later when we have more than one // for now just adding here. kubeletFlagsBase["--feature-gates"] = "DisableKubeletCloudCredentialProviders=false" kubeletFlagsBase["--azure-container-registry-config"] = "/etc/kubernetes/azure.json" } // merge and stringify taints kubeletFlags := lo.Assign(kubeletFlagsBase) if len(a.Taints) > 0 { taintStrs := lo.Map(a.Taints, func(taint v1.Taint, _ int) string { return taint.ToString() }) kubeletFlags = lo.Assign(kubeletFlags, map[string]string{"--register-with-taints": strings.Join(taintStrs, ",")}) } nodeclaimKubeletConfig := KubeletConfigToMap(a.KubeletConfig) kubeletFlags = lo.Assign(kubeletFlags, nodeclaimKubeletConfig) // striginify kubelet flags (including taints) nbv.KubeletFlags = strings.Join(lo.MapToSlice(kubeletFlags, func(k, v string) string { return fmt.Sprintf("%s=%s", k, v) }), " ") } func containerdConfigFromNodeBootstrapVars(nbv *NodeBootstrapVariables) (string, error) { var buffer bytes.Buffer if err := getContainerdConfigTemplate().Execute(&buffer, *nbv); err != nil { return "", fmt.Errorf("error executing containerd config template: %w", err) } return buffer.String(), nil } func getCustomDataFromNodeBootstrapVars(nbv *NodeBootstrapVariables) (string, error) { var buffer bytes.Buffer if err := getCustomDataTemplate().Execute(&buffer, *nbv); err != nil { return "", fmt.Errorf("error executing custom data template: %w", err) } return buffer.String(), nil } func getAgentbakerGeneratedLabels(nodeResourceGroup string, nodeLabels map[string]string) { nodeLabels["kubernetes.azure.com/role"] = "agent" nodeLabels["kubernetes.azure.com/cluster"] = normalizeResourceGroupNameForLabel(nodeResourceGroup) } func normalizeResourceGroupNameForLabel(resourceGroupName string) string { truncated := resourceGroupName truncated = strings.ReplaceAll(truncated, "(", "-") truncated = strings.ReplaceAll(truncated, ")", "-") const maxLen = 63 if len(truncated) > maxLen { truncated = truncated[0:maxLen] } if strings.HasSuffix(truncated, "-") || strings.HasSuffix(truncated, "_") || strings.HasSuffix(truncated, ".") { if len(truncated) > 62 { return truncated[0:len(truncated)-1] + "z" } return truncated + "z" } return truncated } func KubeletConfigToMap(kubeletConfig *KubeletConfiguration) map[string]string { args := make(map[string]string) if kubeletConfig == nil { return args } args["--max-pods"] = fmt.Sprintf("%d", kubeletConfig.MaxPods) JoinParameterArgsToMap(args, "--system-reserved", kubeletConfig.SystemReserved, "=") JoinParameterArgsToMap(args, "--kube-reserved", kubeletConfig.KubeReserved, "=") JoinParameterArgsToMap(args, "--eviction-hard", kubeletConfig.EvictionHard, "<") JoinParameterArgsToMap(args, "--eviction-soft", kubeletConfig.EvictionSoft, "<") JoinParameterArgsToMap(args, "--eviction-soft-grace-period", lo.MapValues(kubeletConfig.EvictionSoftGracePeriod, func(v metav1.Duration, _ string) string { return v.Duration.String() }), "=") if kubeletConfig.EvictionMaxPodGracePeriod != nil { args["--eviction-max-pod-grace-period"] = fmt.Sprintf("%d", lo.FromPtr(kubeletConfig.EvictionMaxPodGracePeriod)) } if kubeletConfig.ImageGCHighThresholdPercent != nil { args["--image-gc-high-threshold"] = fmt.Sprintf("%d", lo.FromPtr(kubeletConfig.ImageGCHighThresholdPercent)) } if kubeletConfig.ImageGCLowThresholdPercent != nil { args["--image-gc-low-threshold"] = fmt.Sprintf("%d", lo.FromPtr(kubeletConfig.ImageGCLowThresholdPercent)) } if kubeletConfig.CPUCFSQuota != nil { args["--cpu-cfs-quota"] = fmt.Sprintf("%t", lo.FromPtr(kubeletConfig.CPUCFSQuota)) } return args } // joinParameterArgsToMap joins a map of keys and values by their separator. The separator will sit between the // arguments in a comma-separated list i.e. arg1<sep>val1,arg2<sep>val2 func JoinParameterArgsToMap[K comparable, V any](result map[string]string, name string, m map[K]V, separator string) { var args []string for k, v := range m { args = append(args, fmt.Sprintf("%v%s%v", k, separator, v)) } if len(args) > 0 { result[name] = strings.Join(args, ",") } }