pkg/util/cluster/cluster.go (1,012 lines of code) (raw):

package cluster // Copyright (c) Microsoft Corporation. // Licensed under the Apache License 2.0. import ( "bytes" "context" "crypto/tls" "encoding/json" "errors" "fmt" "math/rand" "net" "net/http" "os" "strings" "time" armsdk "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/azidentity" sdkkeyvault "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/keyvault/armkeyvault" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi" sdknetwork "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v2" mgmtauthorization "github.com/Azure/azure-sdk-for-go/services/preview/authorization/mgmt/2018-09-01-preview/authorization" mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features" "github.com/Azure/go-autorest/autorest" "github.com/Azure/go-autorest/autorest/azure" "github.com/Azure/go-autorest/autorest/to" "github.com/jongio/azidext/go/azidext" "github.com/sirupsen/logrus" "github.com/spf13/viper" "k8s.io/apimachinery/pkg/util/wait" "github.com/Azure/ARO-RP/pkg/api" v20240812preview "github.com/Azure/ARO-RP/pkg/api/v20240812preview" mgmtredhatopenshift20240812preview "github.com/Azure/ARO-RP/pkg/client/services/redhatopenshift/mgmt/2024-08-12-preview/redhatopenshift" "github.com/Azure/ARO-RP/pkg/deploy/assets" "github.com/Azure/ARO-RP/pkg/deploy/generator" "github.com/Azure/ARO-RP/pkg/env" "github.com/Azure/ARO-RP/pkg/util/arm" "github.com/Azure/ARO-RP/pkg/util/azureclient" "github.com/Azure/ARO-RP/pkg/util/azureclient/azuresdk/armkeyvault" "github.com/Azure/ARO-RP/pkg/util/azureclient/azuresdk/armnetwork" "github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/authorization" "github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/compute" "github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/features" redhatopenshift20240812preview "github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/redhatopenshift/2024-08-12-preview/redhatopenshift" "github.com/Azure/ARO-RP/pkg/util/azureerrors" utilgraph "github.com/Azure/ARO-RP/pkg/util/graph" "github.com/Azure/ARO-RP/pkg/util/pointerutils" "github.com/Azure/ARO-RP/pkg/util/rbac" "github.com/Azure/ARO-RP/pkg/util/uuid" "github.com/Azure/ARO-RP/pkg/util/version" ) type ClusterConfig struct { ClusterName string `mapstructure:"CLUSTER"` SubscriptionID string `mapstructure:"AZURE_SUBSCRIPTION_ID"` TenantID string `mapstructure:"AZURE_TENANT_ID"` Location string `mapstructure:"LOCATION"` AzureEnvironment string `mapstructure:"AZURE_ENVIRONMENT"` UseWorkloadIdentity bool `mapstructure:"USE_WI"` WorkloadIdentityRoles string `mapstructure:"PLATFORM_WORKLOAD_IDENTITY_ROLE_SETS"` IsCI bool `mapstructure:"CI"` RpMode string `mapstructure:"RP_MODE"` VnetResourceGroup string `mapstructure:"CLUSTER_RESOURCEGROUP"` RPResourceGroup string `mapstructure:"RESOURCEGROUP"` OSClusterVersion string `mapstructure:"OS_CLUSTER_VERSION"` FPServicePrincipalID string `mapstructure:"AZURE_FP_SERVICE_PRINCIPAL_ID"` IsPrivate bool `mapstructure:"PRIVATE_CLUSTER"` NoInternet bool `mapstructure:"NO_INTERNET"` MockMSIObjectID string `mapstructure:"MOCK_MSI_OBJECT_ID"` MasterVMSize string `mapstructure:"MASTER_VM_SIZE"` WorkerVMSize string `mapstructure:"WORKER_VM_SIZE"` } func (cc *ClusterConfig) IsLocalDevelopmentMode() bool { return strings.EqualFold(cc.RpMode, "development") } type Cluster struct { log *logrus.Entry Config *ClusterConfig ciParentVnet string workloadIdentities map[string]api.PlatformWorkloadIdentity spGraphClient *utilgraph.GraphServiceClient deployments features.DeploymentsClient groups features.ResourceGroupsClient openshiftclusters InternalClient securitygroups armnetwork.SecurityGroupsClient subnets armnetwork.SubnetsClient routetables armnetwork.RouteTablesClient roleassignments authorization.RoleAssignmentsClient roledefinitions authorization.RoleDefinitionsClient peerings armnetwork.VirtualNetworkPeeringsClient ciParentVnetPeerings armnetwork.VirtualNetworkPeeringsClient vaultsClient armkeyvault.VaultsClient msiClient armmsi.UserAssignedIdentitiesClient diskEncryptionSetsClient compute.DiskEncryptionSetsClient } const GenerateSubnetMaxTries = 100 const localDefaultURL string = "https://localhost:8443" const DefaultMasterVmSize = api.VMSizeStandardD8sV5 const DefaultWorkerVmSize = api.VMSizeStandardD4sV5 func insecureLocalClient() *http.Client { return &http.Client{ Transport: &http.Transport{ TLSClientConfig: &tls.Config{ InsecureSkipVerify: true, }, }, } } func NewClusterConfigFromEnv() (*ClusterConfig, error) { var conf ClusterConfig viper.AutomaticEnv() viper.SetOptions(viper.ExperimentalBindStruct()) err := viper.Unmarshal(&conf) if err != nil { return nil, fmt.Errorf("error parsing env vars: %w", err) } if conf.ClusterName == "" { return nil, fmt.Errorf("cluster Name must be set") } if conf.UseWorkloadIdentity && conf.WorkloadIdentityRoles == "" { return nil, fmt.Errorf("workload Identity Role Set must be set") } if conf.RPResourceGroup == "" { return nil, fmt.Errorf("resource group must be set") } if conf.FPServicePrincipalID == "" { return nil, fmt.Errorf("fP Service Principal ID must be set") } if conf.IsCI { conf.VnetResourceGroup = conf.ClusterName } else { conf.VnetResourceGroup = conf.RPResourceGroup } if !conf.IsCI && conf.VnetResourceGroup == "" { return nil, fmt.Errorf("resource Group must be set") } if conf.OSClusterVersion == "" { conf.OSClusterVersion = version.DefaultInstallStream.Version.String() } if conf.AzureEnvironment == "" { conf.AzureEnvironment = "AZUREPUBLICCLOUD" } if conf.MasterVMSize == "" { conf.MasterVMSize = DefaultMasterVmSize.String() } if conf.WorkerVMSize == "" { conf.WorkerVMSize = DefaultWorkerVmSize.String() } return &conf, nil } func New(log *logrus.Entry, conf *ClusterConfig) (*Cluster, error) { azEnvironment, err := azureclient.EnvironmentFromName(conf.AzureEnvironment) if err != nil { return nil, fmt.Errorf("can't parse Azure environment: %w", err) } options := azEnvironment.EnvironmentCredentialOptions() spTokenCredential, err := azidentity.NewEnvironmentCredential(options) if err != nil { return nil, err } spGraphClient, err := azEnvironment.NewGraphServiceClient(spTokenCredential) if err != nil { return nil, err } scopes := []string{azEnvironment.ResourceManagerScope} authorizer := azidext.NewTokenCredentialAdapter(spTokenCredential, scopes) armOption := armsdk.ClientOptions{ ClientOptions: policy.ClientOptions{ Cloud: options.Cloud, }, } clientOptions := azEnvironment.ArmClientOptions() vaultClient, err := armkeyvault.NewVaultsClient(conf.SubscriptionID, spTokenCredential, &armOption) if err != nil { return nil, err } securityGroupsClient, err := armnetwork.NewSecurityGroupsClient(conf.SubscriptionID, spTokenCredential, clientOptions) diskEncryptionSetsClient := compute.NewDiskEncryptionSetsClient(conf.SubscriptionID, authorizer) if err != nil { return nil, err } subnetsClient, err := armnetwork.NewSubnetsClient(conf.SubscriptionID, spTokenCredential, clientOptions) if err != nil { return nil, err } routeTablesClient, err := armnetwork.NewRouteTablesClient(conf.SubscriptionID, spTokenCredential, clientOptions) if err != nil { return nil, err } virtualNetworkPeeringsClient, err := armnetwork.NewVirtualNetworkPeeringsClient(conf.SubscriptionID, spTokenCredential, clientOptions) if err != nil { return nil, err } msiClient, err := armmsi.NewUserAssignedIdentitiesClient(conf.SubscriptionID, spTokenCredential, clientOptions) if err != nil { return nil, err } clusterClient := &internalClient[mgmtredhatopenshift20240812preview.OpenShiftCluster, v20240812preview.OpenShiftCluster]{ externalClient: redhatopenshift20240812preview.NewOpenShiftClustersClient(&azEnvironment, conf.SubscriptionID, authorizer), converter: api.APIs[v20240812preview.APIVersion].OpenShiftClusterConverter, } c := &Cluster{ log: log, Config: conf, // env: environment, workloadIdentities: make(map[string]api.PlatformWorkloadIdentity), spGraphClient: spGraphClient, deployments: features.NewDeploymentsClient(&azEnvironment, conf.SubscriptionID, authorizer), groups: features.NewResourceGroupsClient(&azEnvironment, conf.SubscriptionID, authorizer), openshiftclusters: clusterClient, securitygroups: securityGroupsClient, subnets: subnetsClient, routetables: routeTablesClient, roleassignments: authorization.NewRoleAssignmentsClient(&azEnvironment, conf.SubscriptionID, authorizer), roledefinitions: authorization.NewRoleDefinitionsClient(&azEnvironment, conf.SubscriptionID, authorizer), peerings: virtualNetworkPeeringsClient, vaultsClient: vaultClient, msiClient: *msiClient, diskEncryptionSetsClient: diskEncryptionSetsClient, } if c.Config.IsCI && c.Config.IsLocalDevelopmentMode() { // Only peer if CI=true and RP_MODE=development c.ciParentVnet = fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vpn-vnet", c.Config.SubscriptionID, c.Config.RPResourceGroup) r, err := azure.ParseResourceID(c.ciParentVnet) if err != nil { return nil, err } ciVirtualNetworkPeeringsClient, err := armnetwork.NewVirtualNetworkPeeringsClient(r.SubscriptionID, spTokenCredential, clientOptions) if err != nil { return nil, err } c.ciParentVnetPeerings = ciVirtualNetworkPeeringsClient } return c, nil } type appDetails struct { applicationId string applicationSecret string SPId string } func (c *Cluster) createApp(ctx context.Context, clusterName string) (applicationDetails appDetails, err error) { c.log.Infof("Creating AAD application") appID, appSecret, err := c.createApplication(ctx, "aro-"+clusterName) if err != nil { return appDetails{}, err } c.log.Infof("Creating service principal") spID, err := c.createServicePrincipal(ctx, appID) if err != nil { return appDetails{}, err } return appDetails{appID, appSecret, spID}, nil } func (c *Cluster) SetupServicePrincipalRoleAssignments(ctx context.Context, diskEncryptionSetID string, clusterServicePrincipalID string) error { c.log.Info("creating role assignments") for _, scope := range []struct{ resource, role string }{ {"/subscriptions/" + c.Config.SubscriptionID + "/resourceGroups/" + c.Config.VnetResourceGroup + "/providers/Microsoft.Network/virtualNetworks/dev-vnet", rbac.RoleNetworkContributor}, {"/subscriptions/" + c.Config.SubscriptionID + "/resourceGroups/" + c.Config.VnetResourceGroup + "/providers/Microsoft.Network/routeTables/" + c.Config.ClusterName + "-rt", rbac.RoleNetworkContributor}, {diskEncryptionSetID, rbac.RoleReader}, } { for _, principalID := range []string{clusterServicePrincipalID, c.Config.FPServicePrincipalID} { for i := 0; i < 5; i++ { _, err := c.roleassignments.Create( ctx, scope.resource, uuid.DefaultGenerator.Generate(), mgmtauthorization.RoleAssignmentCreateParameters{ RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{ RoleDefinitionID: to.StringPtr("/subscriptions/" + c.Config.SubscriptionID + "/providers/Microsoft.Authorization/roleDefinitions/" + scope.role), PrincipalID: &principalID, PrincipalType: mgmtauthorization.ServicePrincipal, }, }, ) // Ignore if the role assignment already exists if detailedError, ok := err.(autorest.DetailedError); ok { if detailedError.StatusCode == http.StatusConflict { err = nil } } if err != nil && i < 4 { // Sometimes we see HashConflictOnDifferentRoleAssignmentIds. // Retry a few times. c.log.Print(err) continue } if err != nil { return err } break } } } return nil } func (c *Cluster) GetPlatformWIRoles() ([]api.PlatformWorkloadIdentityRole, error) { var wiRoleSets []api.PlatformWorkloadIdentityRoleSetProperties if err := json.Unmarshal([]byte(c.Config.WorkloadIdentityRoles), &wiRoleSets); err != nil { return nil, fmt.Errorf("failed to parse JSON: %w", err) } for _, rs := range wiRoleSets { if strings.HasPrefix(c.Config.OSClusterVersion, rs.OpenShiftVersion) { return rs.PlatformWorkloadIdentityRoles, nil } } return nil, fmt.Errorf("workload identity role sets for version %s not found", c.Config.OSClusterVersion) } func (c *Cluster) SetupWorkloadIdentity(ctx context.Context, vnetResourceGroup string) error { platformWorkloadIdentityRoles, err := c.GetPlatformWIRoles() if err != nil { return fmt.Errorf("failed parsing platformWI Roles: %w", err) } platformWorkloadIdentityRoles = append(platformWorkloadIdentityRoles, api.PlatformWorkloadIdentityRole{ OperatorName: "aro-Cluster", RoleDefinitionID: "/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e", }) c.log.Info("Assigning role to mock msi client") c.roleassignments.Create( ctx, fmt.Sprintf("/subscriptions/%s/resourceGroups/%s", c.Config.SubscriptionID, vnetResourceGroup), uuid.DefaultGenerator.Generate(), mgmtauthorization.RoleAssignmentCreateParameters{ RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{ RoleDefinitionID: to.StringPtr("/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e"), PrincipalID: &c.Config.MockMSIObjectID, PrincipalType: mgmtauthorization.ServicePrincipal, }, }, ) for _, wi := range platformWorkloadIdentityRoles { c.log.Infof("creating WI: %s", wi.OperatorName) resp, err := c.msiClient.CreateOrUpdate(ctx, vnetResourceGroup, wi.OperatorName, armmsi.Identity{ Location: to.StringPtr(c.Config.Location), }, nil) if err != nil { return err } _, err = c.roleassignments.Create( ctx, fmt.Sprintf("/subscriptions/%s/resourceGroups/%s", c.Config.SubscriptionID, vnetResourceGroup), uuid.DefaultGenerator.Generate(), mgmtauthorization.RoleAssignmentCreateParameters{ RoleAssignmentProperties: &mgmtauthorization.RoleAssignmentProperties{ RoleDefinitionID: &wi.RoleDefinitionID, PrincipalID: resp.Properties.PrincipalID, PrincipalType: mgmtauthorization.ServicePrincipal, }, }, ) if err != nil { return err } if wi.OperatorName != "aro-Cluster" { c.workloadIdentities[wi.OperatorName] = api.PlatformWorkloadIdentity{ ResourceID: *resp.ID, } } } return nil } func (c *Cluster) Create(ctx context.Context) error { c.log.Info("Creating cluster") clusterGet, err := c.openshiftclusters.Get(ctx, c.Config.VnetResourceGroup, c.Config.ClusterName) c.log.Info("Got cluster ref") if err == nil { if clusterGet.Properties.ProvisioningState == api.ProvisioningStateFailed { return fmt.Errorf("cluster exists and is in failed provisioning state, please delete and retry: %s, %s", clusterGet.ID, c.Config.VnetResourceGroup) } c.log.Print("cluster already exists, skipping create") return nil } appDetails := appDetails{} if !c.Config.UseWorkloadIdentity { c.log.Info("Creating app") appDetails, err = c.createApp(ctx, c.Config.ClusterName) if err != nil { return err } } visibility := api.VisibilityPublic if c.Config.IsPrivate || c.Config.NoInternet { visibility = api.VisibilityPrivate } if c.Config.IsCI { c.log.Infof("creating resource group") _, err = c.groups.CreateOrUpdate(ctx, c.Config.VnetResourceGroup, mgmtfeatures.ResourceGroup{ Location: to.StringPtr(c.Config.Location), }) if err != nil { return err } } asset, err := assets.EmbeddedFiles.ReadFile(generator.FileClusterPredeploy) if err != nil { return err } var template map[string]interface{} err = json.Unmarshal(asset, &template) if err != nil { return err } addressPrefix, masterSubnet, workerSubnet, err := c.generateSubnets() if err != nil { return err } diskEncryptionSetName := fmt.Sprintf( "%s%s", c.Config.VnetResourceGroup, generator.SharedDiskEncryptionSetNameSuffix, ) var kvName string if !c.Config.IsCI { if len(c.Config.VnetResourceGroup) > 10 { // keyvault names need to have a maximum length of 24, // so we need to cut off some chars if the resource group name is too long kvName = c.Config.VnetResourceGroup[:10] + generator.SharedDiskEncryptionKeyVaultNameSuffix } else { kvName = c.Config.VnetResourceGroup + generator.SharedDiskEncryptionKeyVaultNameSuffix } } else { // if DES already exists in RG, then reuse KV hosting the key of this DES, // otherwise, name is limited to 24 characters, but must be globally unique, // so we generate a name randomly until it is available diskEncryptionSet, err := c.diskEncryptionSetsClient.Get(ctx, c.Config.VnetResourceGroup, diskEncryptionSetName) if err == nil { if diskEncryptionSet.EncryptionSetProperties == nil || diskEncryptionSet.EncryptionSetProperties.ActiveKey == nil || diskEncryptionSet.EncryptionSetProperties.ActiveKey.SourceVault == nil || diskEncryptionSet.EncryptionSetProperties.ActiveKey.SourceVault.ID == nil { return fmt.Errorf("no valid Key Vault found in Disk Encryption Set: %v. Delete the Disk Encryption Set and retry", diskEncryptionSet) } ID := *diskEncryptionSet.EncryptionSetProperties.ActiveKey.SourceVault.ID var found bool _, kvName, found = strings.Cut(ID, "/providers/Microsoft.KeyVault/vaults/") if !found { return fmt.Errorf("could not find Key Vault name in ID: %v", ID) } } else { if autorestErr, ok := err.(autorest.DetailedError); !ok || autorestErr.Response == nil || autorestErr.Response.StatusCode != http.StatusNotFound { return fmt.Errorf("failed to get Disk Encryption Set: %v", err) } for { kvName = "kv-" + uuid.DefaultGenerator.Generate()[:21] result, err := c.vaultsClient.CheckNameAvailability( ctx, sdkkeyvault.VaultCheckNameAvailabilityParameters{Name: &kvName, Type: to.StringPtr("Microsoft.KeyVault/vaults")}, nil, ) if err != nil { return err } if result.NameAvailable == nil { return fmt.Errorf("have unexpected nil NameAvailable for key vault: %v", kvName) } if *result.NameAvailable { break } c.log.Infof("key vault %v is not available and we will try an other one", kvName) } } } parameters := map[string]*arm.ParametersParameter{ "clusterName": {Value: c.Config.ClusterName}, "ci": {Value: c.Config.IsCI}, "vnetAddressPrefix": {Value: addressPrefix}, "masterAddressPrefix": {Value: masterSubnet}, "workerAddressPrefix": {Value: workerSubnet}, "kvName": {Value: kvName}, } // TODO: ick if os.Getenv("NO_INTERNET") != "" { parameters["routes"] = &arm.ParametersParameter{ Value: []sdknetwork.Route{ { Properties: &sdknetwork.RoutePropertiesFormat{ AddressPrefix: pointerutils.ToPtr("0.0.0.0/0"), NextHopType: pointerutils.ToPtr(sdknetwork.RouteNextHopTypeNone), }, Name: pointerutils.ToPtr("blackhole"), }, }, } } armctx, cancel := context.WithTimeout(ctx, 10*time.Minute) defer cancel() c.log.Info("predeploying ARM template") err = c.deployments.CreateOrUpdateAndWait(armctx, c.Config.VnetResourceGroup, c.Config.ClusterName, mgmtfeatures.Deployment{ Properties: &mgmtfeatures.DeploymentProperties{ Template: template, Parameters: parameters, Mode: mgmtfeatures.Incremental, }, }) if err != nil { return err } diskEncryptionSetID := fmt.Sprintf( "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/diskEncryptionSets/%s", c.Config.SubscriptionID, c.Config.VnetResourceGroup, diskEncryptionSetName, ) if c.Config.UseWorkloadIdentity { c.log.Info("creating WIs") if err := c.SetupWorkloadIdentity(ctx, c.Config.VnetResourceGroup); err != nil { return fmt.Errorf("error setting up Workload Identity Roles: %w", err) } } else { c.log.Info("creating Classic role assignments") c.SetupServicePrincipalRoleAssignments(ctx, diskEncryptionSetID, appDetails.SPId) } fipsMode := true // Don't install with FIPS in a local dev, non-CI environment if !c.Config.IsCI && c.Config.IsLocalDevelopmentMode() { fipsMode = false } c.log.Info("creating cluster") err = c.createCluster(ctx, c.Config.VnetResourceGroup, c.Config.ClusterName, appDetails.applicationId, appDetails.applicationSecret, diskEncryptionSetID, visibility, c.Config.OSClusterVersion, fipsMode) if err != nil { return err } if c.Config.IsCI { c.log.Info("fixing up NSGs") err = c.fixupNSGs(ctx, c.Config.VnetResourceGroup, c.Config.ClusterName) if err != nil { return err } if env.IsLocalDevelopmentMode() { c.log.Info("peering subnets to CI infra") err = c.peerSubnetsToCI(ctx, c.Config.VnetResourceGroup) if err != nil { return err } } } c.log.Info("done") return nil } // ipRangesContainCIDR checks, weather any of the ipRanges overlap with the cidr string. In case cidr isn't valid, false is returned. func ipRangesContainCIDR(ipRanges []*net.IPNet, cidr string) (bool, error) { _, cidrNet, err := net.ParseCIDR(cidr) if err != nil { return false, err } for _, snet := range ipRanges { if snet.Contains(cidrNet.IP) || cidrNet.Contains(snet.IP) { return true, nil } } return false, nil } // GetIPRangesFromSubnet converts a given azure subnet to a list if IPNets. // Because an az subnet can cover multiple ipranges, we need to return a slice // instead of just a single ip range. This function never errors. If something // goes wrong, it instead returns an empty list. func GetIPRangesFromSubnet(subnet *sdknetwork.Subnet) []*net.IPNet { ipRanges := []*net.IPNet{} if subnet.Properties.AddressPrefix != nil { _, ipRange, err := net.ParseCIDR(*subnet.Properties.AddressPrefix) if err == nil { ipRanges = append(ipRanges, ipRange) } } if subnet.Properties.AddressPrefixes == nil { return ipRanges } for _, snetPrefix := range subnet.Properties.AddressPrefixes { _, ipRange, err := net.ParseCIDR(*snetPrefix) if err == nil { ipRanges = append(ipRanges, ipRange) } } return ipRanges } func (c *Cluster) generateSubnets() (vnetPrefix string, masterSubnet string, workerSubnet string, err error) { // pick a random 23 in range [10.3.0.0, 10.127.255.0], making sure it doesn't // conflict with other subnets present in out dev-vnet // 10.0.0.0/16 is used by dev-vnet to host CI // 10.1.0.0/24 is used by rp-vnet to host Proxy VM // 10.2.0.0/24 is used by dev-vpn-vnet to host VirtualNetworkGateway allSubnets, err := c.subnets.List(context.Background(), c.Config.VnetResourceGroup, "dev-vnet", nil) if err != nil { c.log.Warnf("Error getting existing subnets. Continuing regardless: %v", err) } ipRanges := []*net.IPNet{} for _, snet := range allSubnets { ipRanges = append(ipRanges, GetIPRangesFromSubnet(snet)...) } for i := 1; i < GenerateSubnetMaxTries; i++ { var x, y int // Local Dev clusters are limited to /16 dev-vnet if !c.Config.IsCI { x, y = 0, 2*rand.Intn(128) } else { x, y = rand.Intn((124))+3, 2*rand.Intn(128) } c.log.Infof("Generate Subnet try: %d\n", i) vnetPrefix = fmt.Sprintf("10.%d.%d.0/23", x, y) masterSubnet = fmt.Sprintf("10.%d.%d.0/24", x, y) workerSubnet = fmt.Sprintf("10.%d.%d.0/24", x, y+1) masterSubnetOverlaps, err := ipRangesContainCIDR(ipRanges, masterSubnet) if err != nil || masterSubnetOverlaps { continue } workerSubnetOverlaps, err := ipRangesContainCIDR(ipRanges, workerSubnet) if err != nil || workerSubnetOverlaps { continue } c.log.Infof("Generated subnets: vnet: %s, master: %s, worker: %s\n", vnetPrefix, masterSubnet, workerSubnet) return vnetPrefix, masterSubnet, workerSubnet, nil } return vnetPrefix, masterSubnet, workerSubnet, fmt.Errorf("was not able to generate master and worker subnets after %v tries", GenerateSubnetMaxTries) } func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName string) error { c.log.Infof("Deleting cluster %s in resource group %s", clusterName, vnetResourceGroup) var errs []error if c.Config.IsCI { oc, err := c.openshiftclusters.Get(ctx, vnetResourceGroup, clusterName) clusterResourceGroup := fmt.Sprintf("aro-%s", clusterName) if err != nil { c.log.Errorf("CI E2E cluster %s not found in resource group %s", clusterName, vnetResourceGroup) errs = append(errs, err) } if oc.Properties.ServicePrincipalProfile != nil { errs = append(errs, c.deleteApplication(ctx, oc.Properties.ServicePrincipalProfile.ClientID), ) } errs = append(errs, c.deleteCluster(ctx, vnetResourceGroup, clusterName), c.deleteWimiRoleAssignments(ctx, vnetResourceGroup), c.deleteWI(ctx, vnetResourceGroup), c.ensureResourceGroupDeleted(ctx, clusterResourceGroup), c.deleteResourceGroup(ctx, vnetResourceGroup), ) if env.IsLocalDevelopmentMode() { //PR E2E errs = append(errs, c.deleteVnetPeerings(ctx, vnetResourceGroup), ) } } else { errs = append(errs, c.deleteRoleAssignments(ctx, vnetResourceGroup, clusterName), c.deleteCluster(ctx, vnetResourceGroup, clusterName), c.deleteWimiRoleAssignments(ctx, vnetResourceGroup), c.deleteWI(ctx, vnetResourceGroup), c.deleteDeployment(ctx, vnetResourceGroup, clusterName), // Deleting the deployment does not clean up the associated resources c.deleteVnetResources(ctx, vnetResourceGroup, "dev-vnet", clusterName), ) } c.log.Info("done") return errors.Join(errs...) } func (c *Cluster) deleteWI(ctx context.Context, resourceGroup string) error { if !c.Config.UseWorkloadIdentity { c.log.Info("Skipping deletion of workload identity roles") return nil } c.log.Info("deleting WIs") platformWorkloadIdentityRoles, err := c.GetPlatformWIRoles() if err != nil { return fmt.Errorf("failure parsing Platform WI Roles, unable to remove them: %w", err) } platformWorkloadIdentityRoles = append(platformWorkloadIdentityRoles, api.PlatformWorkloadIdentityRole{ OperatorName: "aro-Cluster", RoleDefinitionID: "/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e", }) for _, wi := range platformWorkloadIdentityRoles { c.log.Infof("deleting WI: %s", wi.OperatorName) _, err := c.msiClient.Delete(ctx, resourceGroup, wi.OperatorName, nil) if err != nil { return err } } return nil } // createCluster created new clusters, based on where it is running. // development - using preview api // production - using stable GA api func (c *Cluster) createCluster(ctx context.Context, vnetResourceGroup, clusterName, clientID, clientSecret, diskEncryptionSetID string, visibility api.Visibility, osClusterVersion string, fipsEnabled bool) error { fipsMode := api.FipsValidatedModulesDisabled if fipsEnabled { fipsMode = api.FipsValidatedModulesEnabled } // using internal representation for "singe source" of options oc := api.OpenShiftCluster{ Properties: api.OpenShiftClusterProperties{ ClusterProfile: api.ClusterProfile{ Domain: strings.ToLower(clusterName), ResourceGroupID: fmt.Sprintf("/subscriptions/%s/resourceGroups/%s", c.Config.SubscriptionID, "aro-"+clusterName), FipsValidatedModules: fipsMode, Version: osClusterVersion, PullSecret: api.SecureString(os.Getenv("USER_PULL_SECRET")), }, NetworkProfile: api.NetworkProfile{ PodCIDR: "10.128.0.0/14", ServiceCIDR: "172.30.0.0/16", SoftwareDefinedNetwork: api.SoftwareDefinedNetworkOpenShiftSDN, }, MasterProfile: api.MasterProfile{ VMSize: api.VMSize(c.Config.MasterVMSize), SubnetID: fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vnet/subnets/%s-master", c.Config.SubscriptionID, vnetResourceGroup, clusterName), EncryptionAtHost: api.EncryptionAtHostEnabled, DiskEncryptionSetID: diskEncryptionSetID, }, WorkerProfiles: []api.WorkerProfile{ { Name: "worker", VMSize: api.VMSize(c.Config.WorkerVMSize), DiskSizeGB: 128, SubnetID: fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vnet/subnets/%s-worker", c.Config.SubscriptionID, vnetResourceGroup, clusterName), Count: 3, EncryptionAtHost: api.EncryptionAtHostEnabled, DiskEncryptionSetID: diskEncryptionSetID, }, }, APIServerProfile: api.APIServerProfile{ Visibility: visibility, }, IngressProfiles: []api.IngressProfile{ { Name: "default", Visibility: visibility, }, }, }, Location: c.Config.Location, } if c.Config.UseWorkloadIdentity { oc.Properties.PlatformWorkloadIdentityProfile = &api.PlatformWorkloadIdentityProfile{ PlatformWorkloadIdentities: c.workloadIdentities, } oc.Identity = &api.ManagedServiceIdentity{ Type: api.ManagedServiceIdentityUserAssigned, TenantID: c.Config.TenantID, UserAssignedIdentities: map[string]api.UserAssignedIdentity{ fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.Config.SubscriptionID, vnetResourceGroup, "aro-Cluster"): {}, }, } } else { if clientID != "" && clientSecret != "" { oc.Properties.ServicePrincipalProfile = &api.ServicePrincipalProfile{ ClientID: clientID, ClientSecret: api.SecureString(clientSecret), } } } if c.Config.IsLocalDevelopmentMode() { err := c.registerSubscription() if err != nil { return err } err = c.ensureDefaultVersionInCosmosdb(ctx) if err != nil { return err } // If we're in local dev mode and the user has not overridden the default VM size, use a smaller size for cost-saving purposes if c.Config.WorkerVMSize == DefaultWorkerVmSize.String() { oc.Properties.WorkerProfiles[0].VMSize = api.VMSizeStandardD2sV3 } } return c.openshiftclusters.CreateOrUpdateAndWait(ctx, vnetResourceGroup, clusterName, &oc) } func (c *Cluster) registerSubscription() error { b, err := json.Marshal(&api.Subscription{ State: api.SubscriptionStateRegistered, Properties: &api.SubscriptionProperties{ TenantID: c.Config.TenantID, RegisteredFeatures: []api.RegisteredFeatureProfile{ { Name: "Microsoft.RedHatOpenShift/RedHatEngineering", State: "Registered", }, }, }, }) if err != nil { return err } req, err := http.NewRequest(http.MethodPut, localDefaultURL+"/subscriptions/"+c.Config.SubscriptionID+"?api-version=2.0", bytes.NewReader(b)) if err != nil { return err } req.Header.Set("Content-Type", "application/json") resp, err := insecureLocalClient().Do(req) if err != nil { return err } return resp.Body.Close() } // getVersionsInCosmosDB connects to the local RP endpoint and queries the // available OpenShiftVersions func getVersionsInCosmosDB(ctx context.Context) ([]*api.OpenShiftVersion, error) { type getVersionResponse struct { Value []*api.OpenShiftVersion `json:"value"` } getRequest, err := http.NewRequestWithContext(ctx, http.MethodGet, localDefaultURL+"/admin/versions", &bytes.Buffer{}) if err != nil { return nil, fmt.Errorf("error creating get versions request: %w", err) } getRequest.Header.Set("Content-Type", "application/json") getResponse, err := insecureLocalClient().Do(getRequest) if err != nil { return nil, fmt.Errorf("error couldn't retrieve versions in cosmos db: %w", err) } parsedResponse := getVersionResponse{} decoder := json.NewDecoder(getResponse.Body) err = decoder.Decode(&parsedResponse) return parsedResponse.Value, err } // ensureDefaultVersionInCosmosdb puts a default openshiftversion into the // cosmos DB IF it doesn't already contain an entry for the default version. It // is hardcoded to use the local-RP endpoint // // It returns without an error when a default version is already present or a // default version was successfully put into the db. func (c *Cluster) ensureDefaultVersionInCosmosdb(ctx context.Context) error { versionsInDB, err := getVersionsInCosmosDB(ctx) if err != nil { return fmt.Errorf("couldn't query versions in cosmosdb: %w", err) } for _, versionFromDB := range versionsInDB { if versionFromDB.Properties.Version == version.DefaultInstallStream.Version.String() { c.log.Debugf("Version %s already in DB. Not overwriting existing one.", version.DefaultInstallStream.Version.String()) return nil } } defaultVersion := version.DefaultInstallStream b, err := json.Marshal(&api.OpenShiftVersion{ Properties: api.OpenShiftVersionProperties{ Version: defaultVersion.Version.String(), OpenShiftPullspec: defaultVersion.PullSpec, // HACK: we hardcode this to the latest installer image in arointsvc // if it is not overridden with ARO_HIVE_DEFAULT_INSTALLER_PULLSPEC or LiveConfig InstallerPullspec: fmt.Sprintf("arointsvc.azurecr.io/aro-installer:release-%s", version.DefaultInstallStream.Version.MinorVersion()), Enabled: true, }, }) if err != nil { return err } req, err := http.NewRequest(http.MethodPut, localDefaultURL+"/admin/versions", bytes.NewReader(b)) if err != nil { return err } req.Header.Set("Content-Type", "application/json") resp, err := insecureLocalClient().Do(req) if err != nil { return err } return resp.Body.Close() } func (c *Cluster) fixupNSGs(ctx context.Context, vnetResourceGroup, clusterName string) error { timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Minute) defer cancel() // very occasionally c.securitygroups.List returns an empty list in // production. No idea why. Let's try retrying it... var nsgs []*sdknetwork.SecurityGroup err := wait.PollImmediateUntil(10*time.Second, func() (bool, error) { var err error nsgs, err = c.securitygroups.List(ctx, "aro-"+clusterName, nil) return len(nsgs) > 0, err }, timeoutCtx.Done()) if err != nil { return err } for _, subnetName := range []string{clusterName + "-master", clusterName + "-worker"} { resp, err := c.subnets.Get(ctx, vnetResourceGroup, "dev-vnet", subnetName, nil) if err != nil { return err } subnet := resp.Subnet subnet.Properties.NetworkSecurityGroup = &sdknetwork.SecurityGroup{ ID: nsgs[0].ID, } err = c.subnets.CreateOrUpdateAndWait(ctx, vnetResourceGroup, "dev-vnet", subnetName, subnet, nil) if err != nil { return err } } return nil } func (c *Cluster) deleteRoleAssignments(ctx context.Context, vnetResourceGroup, clusterName string) error { if c.Config.UseWorkloadIdentity { c.log.Print("Skipping deletion of service principal role assignments") } c.log.Print("deleting role assignments") oc, err := c.openshiftclusters.Get(ctx, vnetResourceGroup, clusterName) if err != nil { return fmt.Errorf("error getting cluster document: %w", err) } spObjID, err := utilgraph.GetServicePrincipalIDByAppID(ctx, c.spGraphClient, oc.Properties.ServicePrincipalProfile.ClientID) if err != nil { return fmt.Errorf("error getting service principal for cluster: %w", err) } if spObjID == nil { return nil } roleAssignments, err := c.roleassignments.ListForResourceGroup(ctx, vnetResourceGroup, fmt.Sprintf("principalId eq '%s'", *spObjID)) if err != nil { return fmt.Errorf("error listing role assignments for service principal: %w", err) } for _, roleAssignment := range roleAssignments { if strings.HasPrefix( strings.ToLower(*roleAssignment.Scope), strings.ToLower("/subscriptions/"+c.Config.SubscriptionID+"/resourceGroups/"+vnetResourceGroup), ) { // Don't delete inherited role assignments, only those resource group level or below c.log.Infof("deleting role assignment %s", *roleAssignment.Name) _, err = c.roleassignments.Delete(ctx, *roleAssignment.Scope, *roleAssignment.Name) if err != nil { return fmt.Errorf("error deleting role assignment %s: %w", *roleAssignment.Name, err) } } } return nil } func (c *Cluster) deleteWimiRoleAssignments(ctx context.Context, vnetResourceGroup string) error { if !c.Config.UseWorkloadIdentity { c.log.Print("Skipping deletion of wimi roleassignments") return nil } c.log.Print("deleting wimi role assignments") var wiRoleSets []api.PlatformWorkloadIdentityRoleSetProperties if err := json.Unmarshal([]byte(c.Config.WorkloadIdentityRoles), &wiRoleSets); err != nil { return fmt.Errorf("failed to parse JSON: %w", err) } platformWorkloadIdentityRoles := append(wiRoleSets[0].PlatformWorkloadIdentityRoles, api.PlatformWorkloadIdentityRole{ OperatorName: "aro-Cluster", RoleDefinitionID: "/providers/Microsoft.Authorization/roleDefinitions/ef318e2a-8334-4a05-9e4a-295a196c6a6e", }) for _, wi := range platformWorkloadIdentityRoles { resp, err := c.msiClient.Get(ctx, vnetResourceGroup, wi.OperatorName, nil) if err != nil { return err } roleAssignments, err := c.roleassignments.ListForResourceGroup(ctx, vnetResourceGroup, fmt.Sprintf("principalId eq '%s'", *resp.Properties.PrincipalID)) if err != nil { return fmt.Errorf("error listing role assignments for service principal: %w", err) } for _, roleAssignment := range roleAssignments { if strings.HasPrefix( strings.ToLower(*roleAssignment.Scope), strings.ToLower("/subscriptions/"+c.Config.SubscriptionID+"/resourceGroups/"+vnetResourceGroup), ) { // Don't delete inherited role assignments, only those resource group level or below c.log.Infof("deleting role assignment %s", *roleAssignment.Name) _, err = c.roleassignments.Delete(ctx, *roleAssignment.Scope, *roleAssignment.Name) if err != nil { return fmt.Errorf("error deleting role assignment %s: %w", *roleAssignment.Name, err) } } } } return nil } func (c *Cluster) deleteCluster(ctx context.Context, resourceGroup, clusterName string) error { c.log.Printf("deleting cluster %s", clusterName) if err := c.openshiftclusters.DeleteAndWait(ctx, resourceGroup, clusterName); err != nil { return fmt.Errorf("error deleting cluster %s: %w", clusterName, err) } return nil } func (c *Cluster) ensureResourceGroupDeleted(ctx context.Context, resourceGroupName string) error { c.log.Printf("deleting resource group %s", resourceGroupName) timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Minute) defer cancel() return wait.PollImmediateUntil(5*time.Second, func() (bool, error) { _, err := c.groups.Get(ctx, resourceGroupName) if azureerrors.ResourceGroupNotFound(err) { c.log.Infof("finished deleting resource group %s", resourceGroupName) return true, nil } return false, fmt.Errorf("failed to delete resource group %s with %s", resourceGroupName, err) }, timeoutCtx.Done()) } func (c *Cluster) deleteResourceGroup(ctx context.Context, resourceGroup string) error { c.log.Printf("deleting resource group %s", resourceGroup) if _, err := c.groups.Get(ctx, resourceGroup); err != nil { c.log.Printf("error getting resource group %s, skipping deletion: %v", resourceGroup, err) return nil } if err := c.groups.DeleteAndWait(ctx, resourceGroup); err != nil { return fmt.Errorf("error deleting resource group: %w", err) } return nil } func (c *Cluster) deleteVnetPeerings(ctx context.Context, resourceGroup string) error { r, err := azure.ParseResourceID(c.ciParentVnet) if err == nil { err = c.ciParentVnetPeerings.DeleteAndWait(ctx, r.ResourceGroup, r.ResourceName, resourceGroup+"-peer", nil) } if err != nil { return fmt.Errorf("error deleting vnet peerings: %w", err) } return nil } func (c *Cluster) deleteDeployment(ctx context.Context, resourceGroup, clusterName string) error { c.log.Info("deleting deployment") if err := c.deployments.DeleteAndWait(ctx, resourceGroup, clusterName); err != nil { return fmt.Errorf("error deleting deployment: %w", err) } return nil } func (c *Cluster) deleteVnetResources(ctx context.Context, resourceGroup, vnetName, clusterName string) error { var errs []error c.log.Info("deleting master/worker subnets") if err := c.subnets.DeleteAndWait(ctx, resourceGroup, vnetName, clusterName+"-master", nil); err != nil { c.log.Errorf("error when deleting master subnet: %v", err) errs = append(errs, err) } if err := c.subnets.DeleteAndWait(ctx, resourceGroup, vnetName, clusterName+"-worker", nil); err != nil { c.log.Errorf("error when deleting worker subnet: %v", err) errs = append(errs, err) } c.log.Info("deleting route table") if err := c.routetables.DeleteAndWait(ctx, resourceGroup, clusterName+"-rt", nil); err != nil { c.log.Errorf("error when deleting route table: %v", err) errs = append(errs, err) } return errors.Join(errs...) } func (c *Cluster) peerSubnetsToCI(ctx context.Context, vnetResourceGroup string) error { cluster := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vnet", c.Config.SubscriptionID, vnetResourceGroup) r, err := azure.ParseResourceID(c.ciParentVnet) if err != nil { return err } clusterProp := &sdknetwork.VirtualNetworkPeeringPropertiesFormat{ RemoteVirtualNetwork: &sdknetwork.SubResource{ ID: &c.ciParentVnet, }, AllowVirtualNetworkAccess: to.BoolPtr(true), AllowForwardedTraffic: to.BoolPtr(true), UseRemoteGateways: to.BoolPtr(true), } rpProp := &sdknetwork.VirtualNetworkPeeringPropertiesFormat{ RemoteVirtualNetwork: &sdknetwork.SubResource{ ID: &cluster, }, AllowVirtualNetworkAccess: to.BoolPtr(true), AllowForwardedTraffic: to.BoolPtr(true), AllowGatewayTransit: to.BoolPtr(true), } err = c.peerings.CreateOrUpdateAndWait(ctx, vnetResourceGroup, "dev-vnet", r.ResourceGroup+"-peer", sdknetwork.VirtualNetworkPeering{Properties: clusterProp}, nil) if err != nil { return err } err = c.ciParentVnetPeerings.CreateOrUpdateAndWait(ctx, r.ResourceGroup, r.ResourceName, vnetResourceGroup+"-peer", sdknetwork.VirtualNetworkPeering{Properties: rpProp}, nil) if err != nil { return err } return err }