pkg/providers/nutanix/validator.go (804 lines of code) (raw):

package nutanix import ( "context" "fmt" "net" "net/http" "regexp" "strconv" "strings" "github.com/nutanix-cloud-native/prism-go-client/environment/credentials" v3 "github.com/nutanix-cloud-native/prism-go-client/v3" "k8s.io/apimachinery/pkg/api/resource" anywherev1 "github.com/aws/eks-anywhere/pkg/api/v1alpha1" "github.com/aws/eks-anywhere/pkg/cluster" "github.com/aws/eks-anywhere/pkg/constants" "github.com/aws/eks-anywhere/pkg/crypto" "github.com/aws/eks-anywhere/pkg/logger" "github.com/aws/eks-anywhere/pkg/networkutils" ) const ( minNutanixCPUSockets = 1 minNutanixCPUPerSocket = 1 minNutanixMemoryMiB = 2048 minNutanixDiskGiB = 20 ) // IPValidator is an interface that defines methods to validate the control plane IP. type IPValidator interface { ValidateControlPlaneIPUniqueness(cluster *anywherev1.Cluster) error } // Validator is a client to validate nutanix resources. type Validator struct { httpClient *http.Client certValidator crypto.TlsValidator clientCache *ClientCache } // NewValidator returns a new validator client. func NewValidator(clientCache *ClientCache, certValidator crypto.TlsValidator, httpClient *http.Client) *Validator { return &Validator{ clientCache: clientCache, certValidator: certValidator, httpClient: httpClient, } } func (v *Validator) validateControlPlaneIP(ip string) error { // check if controlPlaneEndpointIp is valid parsedIP := net.ParseIP(ip) if parsedIP == nil { return fmt.Errorf("cluster controlPlaneConfiguration.Endpoint.Host is invalid: %s", ip) } return nil } // ValidateClusterSpec validates the cluster spec. func (v *Validator) ValidateClusterSpec(ctx context.Context, spec *cluster.Spec, creds credentials.BasicAuthCredential) error { logger.Info("ValidateClusterSpec for Nutanix datacenter", "NutanixDatacenter", spec.NutanixDatacenter.Name) client, err := v.clientCache.GetNutanixClient(spec.NutanixDatacenter, creds) if err != nil { return err } if err := v.ValidateDatacenterConfig(ctx, client, spec); err != nil { return err } if err := v.validateControlPlaneIP(spec.Cluster.Spec.ControlPlaneConfiguration.Endpoint.Host); err != nil { return err } for _, conf := range spec.NutanixMachineConfigs { if err := v.ValidateMachineConfig(ctx, client, spec.Cluster, conf); err != nil { return fmt.Errorf("failed to validate machine config: %v", err) } } if err := v.validateFreeGPU(ctx, client, spec); err != nil { return err } return v.checkImageNameMatchesKubernetesVersion(ctx, spec, client) } func (v *Validator) checkImageNameMatchesKubernetesVersion(ctx context.Context, spec *cluster.Spec, client Client) error { controlPlaneMachineConfig := spec.NutanixMachineConfigs[spec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name] if controlPlaneMachineConfig == nil { return fmt.Errorf("cannot find NutanixMachineConfig %v for control plane", spec.Cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name) } // validate template field name contains cluster kubernetes version for the control plane machine. if err := v.validateTemplateMatchesKubernetesVersion(ctx, controlPlaneMachineConfig.Spec.Image, client, string(spec.Cluster.Spec.KubernetesVersion)); err != nil { return fmt.Errorf("machine config %s validation failed: %v", controlPlaneMachineConfig.Name, err) } if spec.Cluster.Spec.ExternalEtcdConfiguration != nil { etcdMachineConfig := spec.NutanixMachineConfigs[spec.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name] if etcdMachineConfig == nil { return fmt.Errorf("cannot find NutanixMachineConfig %v for etcd machines", spec.Cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name) } // validate template field name contains cluster kubernetes version for the external etcd machine. if err := v.validateTemplateMatchesKubernetesVersion(ctx, etcdMachineConfig.Spec.Image, client, string(spec.Cluster.Spec.KubernetesVersion)); err != nil { return fmt.Errorf("machine config %s validation failed: %v", etcdMachineConfig.Name, err) } } for _, workerNodeGroupConfiguration := range spec.Cluster.Spec.WorkerNodeGroupConfigurations { kubernetesVersion := string(spec.Cluster.Spec.KubernetesVersion) if workerNodeGroupConfiguration.KubernetesVersion != nil { kubernetesVersion = string(*workerNodeGroupConfiguration.KubernetesVersion) } // validate template field name contains cluster kubernetes version for the control plane machine. imageIdentifier := spec.NutanixMachineConfigs[workerNodeGroupConfiguration.MachineGroupRef.Name].Spec.Image if err := v.validateTemplateMatchesKubernetesVersion(ctx, imageIdentifier, client, kubernetesVersion); err != nil { return fmt.Errorf("machine config %s validation failed: %v", controlPlaneMachineConfig.Name, err) } } return nil } // ValidateDatacenterConfig validates the datacenter config. func (v *Validator) ValidateDatacenterConfig(ctx context.Context, client Client, spec *cluster.Spec) error { config := spec.NutanixDatacenter if config.Spec.Insecure { logger.Info("Warning: Skipping TLS validation for insecure connection to Nutanix Prism Central; this is not recommended for production use") } if err := v.validateEndpointAndPort(config.Spec); err != nil { return err } if err := v.validateCredentials(ctx, client); err != nil { return err } if err := v.validateTrustBundleConfig(config.Spec); err != nil { return err } if err := v.validateCredentialRef(config); err != nil { return err } if err := v.validateFailureDomains(ctx, client, spec); err != nil { return err } if config.Spec.CcmExcludeNodeIPs != nil { if err := v.validateCcmExcludeNodeIPs(config.Spec.CcmExcludeNodeIPs); err != nil { return err } } return nil } func (v *Validator) validateIPRangeForCcmExcludeNodeIPs(ipRange string) error { ipRangeStr := strings.TrimSpace(ipRange) rangeParts := strings.Split(ipRangeStr, "-") if len(rangeParts) != 2 { return fmt.Errorf("invalid IP range %s", ipRangeStr) } startIP := net.ParseIP(strings.TrimSpace(rangeParts[0])) if startIP == nil { return fmt.Errorf("invalid start IP address %s", rangeParts[0]) } endIP := net.ParseIP(strings.TrimSpace(rangeParts[1])) if endIP == nil { return fmt.Errorf("invalid end IP address %s", rangeParts[1]) } cmp, err := compareIP(startIP, endIP) if err != nil { return err } if cmp > 0 { return fmt.Errorf("start IP address %s is greater than end IP address %s", startIP.String(), endIP.String()) } return nil } func (v *Validator) validateCcmExcludeNodeIPs(ccmExcludeNodeIPs []string) error { for _, ipOrIPRange := range ccmExcludeNodeIPs { if strings.Contains(ipOrIPRange, "/") { cidrStr := strings.TrimSpace(ipOrIPRange) _, _, err := net.ParseCIDR(cidrStr) if err != nil { return fmt.Errorf("invalid CIDR %s: %v", cidrStr, err) } } else if strings.Contains(ipOrIPRange, "-") { err := v.validateIPRangeForCcmExcludeNodeIPs(ipOrIPRange) if err != nil { return err } } else { ipStr := strings.TrimSpace(ipOrIPRange) ip := net.ParseIP(ipStr) if ip == nil { return fmt.Errorf("invalid IP address %s", ipStr) } } } return nil } func (v *Validator) validateFailureDomains(ctx context.Context, client Client, spec *cluster.Spec) error { config := spec.NutanixDatacenter regexName, err := regexp.Compile("^[a-z0-9]([-a-z0-9]*[a-z0-9])?$") if err != nil { return err } failureDomainCount := len(config.Spec.FailureDomains) for _, fd := range config.Spec.FailureDomains { if res := regexName.MatchString(fd.Name); !res { errorStr := `failure domain name should contains only small letters, digits, and hyphens. it should start with small letter or digit` return fmt.Errorf(errorStr) } if err := v.validateClusterConfig(ctx, client, fd.Cluster); err != nil { return err } for _, subnet := range fd.Subnets { if err := v.validateSubnetConfig(ctx, client, fd.Cluster, subnet); err != nil { return err } } workerMachineGroups := getWorkerMachineGroups(spec) for _, workerMachineGroupName := range fd.WorkerMachineGroups { if err := v.validateWorkerMachineGroup(workerMachineGroups, workerMachineGroupName, failureDomainCount); err != nil { return err } } } return nil } func (v *Validator) validateWorkerMachineGroup(workerMachineGroups map[string]anywherev1.WorkerNodeGroupConfiguration, workerMachineGroupName string, fdCount int) error { if _, ok := workerMachineGroups[workerMachineGroupName]; !ok { return fmt.Errorf("worker machine group %s not found in the cluster worker node group definitions", workerMachineGroupName) } if workerMachineGroups[workerMachineGroupName].Count != nil && *workerMachineGroups[workerMachineGroupName].Count > fdCount { return fmt.Errorf("count %d of machines in workerNodeGroupConfiguration %s shouldn't be greater than the failure domain count %d where those machines should be spreaded accross", *workerMachineGroups[workerMachineGroupName].Count, workerMachineGroupName, fdCount) } return nil } func (v *Validator) validateCredentialRef(config *anywherev1.NutanixDatacenterConfig) error { if config.Spec.CredentialRef == nil { return fmt.Errorf("credentialRef must be provided") } if config.Spec.CredentialRef.Kind != constants.SecretKind { return fmt.Errorf("credentialRef kind must be %s", constants.SecretKind) } if config.Spec.CredentialRef.Name == "" { return fmt.Errorf("credentialRef name must be provided") } return nil } func (v *Validator) validateEndpointAndPort(dcConf anywherev1.NutanixDatacenterConfigSpec) error { if !networkutils.IsPortValid(strconv.Itoa(dcConf.Port)) { return fmt.Errorf("nutanix prism central port %d out of range", dcConf.Port) } if dcConf.Endpoint == "" { return fmt.Errorf("nutanix prism central endpoint must be provided") } server := fmt.Sprintf("%s:%d", dcConf.Endpoint, dcConf.Port) if !strings.HasPrefix(server, "https://") { server = fmt.Sprintf("https://%s", server) } if _, err := v.httpClient.Get(server); err != nil { return fmt.Errorf("failed to reach server %s: %v", server, err) } return nil } func (v *Validator) validateCredentials(ctx context.Context, client Client) error { _, err := client.GetCurrentLoggedInUser(ctx) if err != nil { return err } return nil } func (v *Validator) validateTrustBundleConfig(dcConf anywherev1.NutanixDatacenterConfigSpec) error { if dcConf.AdditionalTrustBundle == "" { return nil } return v.certValidator.ValidateCert(dcConf.Endpoint, fmt.Sprintf("%d", dcConf.Port), dcConf.AdditionalTrustBundle) } func (v *Validator) validateMachineSpecs(machineSpec anywherev1.NutanixMachineConfigSpec) error { if machineSpec.VCPUSockets < minNutanixCPUSockets { return fmt.Errorf("vCPU sockets %d must be greater than or equal to %d", machineSpec.VCPUSockets, minNutanixCPUSockets) } if machineSpec.VCPUsPerSocket < minNutanixCPUPerSocket { return fmt.Errorf("vCPUs per socket %d must be greater than or equal to %d", machineSpec.VCPUsPerSocket, minNutanixCPUPerSocket) } minNutanixMemory, err := resource.ParseQuantity(fmt.Sprintf("%dMi", minNutanixMemoryMiB)) if err != nil { return err } if machineSpec.MemorySize.Cmp(minNutanixMemory) < 0 { return fmt.Errorf("MemorySize must be greater than or equal to %dMi", minNutanixMemoryMiB) } minNutanixDisk, err := resource.ParseQuantity(fmt.Sprintf("%dGi", minNutanixDiskGiB)) if err != nil { return err } if machineSpec.SystemDiskSize.Cmp(minNutanixDisk) < 0 { return fmt.Errorf("SystemDiskSize must be greater than or equal to %dGi", minNutanixDiskGiB) } return nil } // ValidateMachineConfig validates the Prism Element cluster, subnet, and image for the machine. func (v *Validator) ValidateMachineConfig(ctx context.Context, client Client, cluster *anywherev1.Cluster, config *anywherev1.NutanixMachineConfig) error { if err := v.validateMachineSpecs(config.Spec); err != nil { return err } if err := v.validateClusterConfig(ctx, client, config.Spec.Cluster); err != nil { return err } if err := v.validateSubnetConfig(ctx, client, config.Spec.Cluster, config.Spec.Subnet); err != nil { return err } if err := v.validateImageConfig(ctx, client, config.Spec.Image); err != nil { return err } if config.Spec.Project != nil { if err := v.validateProjectConfig(ctx, client, *config.Spec.Project); err != nil { return err } } if config.Spec.AdditionalCategories != nil { if err := v.validateAdditionalCategories(ctx, client, config.Spec.AdditionalCategories); err != nil { return err } } if err := v.validateGPUInMachineConfig(cluster, config); err != nil { return err } return nil } func (v *Validator) validateGPUInMachineConfig(cluster *anywherev1.Cluster, config *anywherev1.NutanixMachineConfig) error { if config.Spec.GPUs != nil { if err := checkMachineConfigIsForWorker(config, cluster); err != nil { return err } for _, gpu := range config.Spec.GPUs { if err := v.validateGPUConfig(gpu); err != nil { return err } } } return nil } func (v *Validator) validateClusterConfig(ctx context.Context, client Client, identifier anywherev1.NutanixResourceIdentifier) error { switch identifier.Type { case anywherev1.NutanixIdentifierName: if identifier.Name == nil || *identifier.Name == "" { return fmt.Errorf("missing cluster name") } else { clusterName := *identifier.Name if _, err := findClusterUUIDByName(ctx, client, clusterName); err != nil { return fmt.Errorf("failed to find cluster with name %q: %v", clusterName, err) } } case anywherev1.NutanixIdentifierUUID: if identifier.UUID == nil || *identifier.UUID == "" { return fmt.Errorf("missing cluster uuid") } else { clusterUUID := *identifier.UUID if _, err := client.GetCluster(ctx, clusterUUID); err != nil { return fmt.Errorf("failed to find cluster with uuid %v: %v", clusterUUID, err) } } default: return fmt.Errorf("invalid cluster identifier type: %s; valid types are: %q and %q", identifier.Type, anywherev1.NutanixIdentifierName, anywherev1.NutanixIdentifierUUID) } return nil } func (v *Validator) validateImageConfig(ctx context.Context, client Client, identifier anywherev1.NutanixResourceIdentifier) error { switch identifier.Type { case anywherev1.NutanixIdentifierName: if identifier.Name == nil || *identifier.Name == "" { return fmt.Errorf("missing image name") } else { imageName := *identifier.Name if _, err := findImageUUIDByName(ctx, client, imageName); err != nil { return fmt.Errorf("failed to find image with name %q: %v", imageName, err) } } case anywherev1.NutanixIdentifierUUID: if identifier.UUID == nil || *identifier.UUID == "" { return fmt.Errorf("missing image uuid") } else { imageUUID := *identifier.UUID if _, err := client.GetImage(ctx, imageUUID); err != nil { return fmt.Errorf("failed to find image with uuid %s: %v", imageUUID, err) } } default: return fmt.Errorf("invalid image identifier type: %s; valid types are: %q and %q", identifier.Type, anywherev1.NutanixIdentifierName, anywherev1.NutanixIdentifierUUID) } return nil } func (v *Validator) validateTemplateMatchesKubernetesVersion(ctx context.Context, identifier anywherev1.NutanixResourceIdentifier, client Client, kubernetesVersionName string) error { var templateName string if identifier.Type == anywherev1.NutanixIdentifierUUID { imageUUID := *identifier.UUID imageDetails, err := client.GetImage(ctx, imageUUID) if err != nil { return fmt.Errorf("failed to find image with uuid %s: %v", imageUUID, err) } if imageDetails.Spec == nil || imageDetails.Spec.Name == nil { return fmt.Errorf("failed to find image details with uuid %s", imageUUID) } templateName = *imageDetails.Spec.Name } else { templateName = *identifier.Name } // Replace 1.23, 1-23, 1_23 to 123 in the template name string. templateReplacer := strings.NewReplacer("-", "", ".", "", "_", "") template := templateReplacer.Replace(templateName) // Replace 1-23 to 123 in the kubernetesversion string. replacer := strings.NewReplacer(".", "") kubernetesVersion := replacer.Replace(string(kubernetesVersionName)) // This will return an error if the template name does not contain specified kubernetes version. // For ex if the kubernetes version is 1.23, // the template name should include 1.23 or 1-23, 1_23 or 123 i.e. kubernetes-1-23-eks in the string. if !strings.Contains(template, kubernetesVersion) { return fmt.Errorf("missing kube version from the machine config template name: template=%s, version=%s", templateName, string(kubernetesVersionName)) } return nil } func (v *Validator) validateSubnetConfig(ctx context.Context, client Client, cluster, subnet anywherev1.NutanixResourceIdentifier) error { clusterUUID, err := getClusterUUID(ctx, client, cluster) if err != nil { return err } switch subnet.Type { case anywherev1.NutanixIdentifierName: if subnet.Name == nil || *subnet.Name == "" { return fmt.Errorf("missing subnet name") } else { subnetName := *subnet.Name if _, err = findSubnetUUIDByName(ctx, client, clusterUUID, subnetName); err != nil { return fmt.Errorf("failed to find subnet with name %s: %v", subnetName, err) } } case anywherev1.NutanixIdentifierUUID: if subnet.UUID == nil || *subnet.UUID == "" { return fmt.Errorf("missing subnet uuid") } else { subnetUUID := *subnet.UUID if _, err = client.GetSubnet(ctx, subnetUUID); err != nil { return fmt.Errorf("failed to find subnet with uuid %s: %v", subnetUUID, err) } } default: return fmt.Errorf("invalid subnet identifier type: %s; valid types are: %q and %q", subnet.Type, anywherev1.NutanixIdentifierName, anywherev1.NutanixIdentifierUUID) } return nil } func (v *Validator) validateProjectConfig(ctx context.Context, client Client, identifier anywherev1.NutanixResourceIdentifier) error { switch identifier.Type { case anywherev1.NutanixIdentifierName: if identifier.Name == nil || *identifier.Name == "" { return fmt.Errorf("missing project name") } projectName := *identifier.Name if _, err := findProjectUUIDByName(ctx, client, projectName); err != nil { return fmt.Errorf("failed to find project with name %q: %v", projectName, err) } case anywherev1.NutanixIdentifierUUID: if identifier.UUID == nil || *identifier.UUID == "" { return fmt.Errorf("missing project uuid") } projectUUID := *identifier.UUID if _, err := client.GetProject(ctx, projectUUID); err != nil { return fmt.Errorf("failed to find project with uuid %s: %v", projectUUID, err) } default: return fmt.Errorf("invalid project identifier type: %s; valid types are: %q and %q", identifier.Type, anywherev1.NutanixIdentifierName, anywherev1.NutanixIdentifierUUID) } return nil } func (v *Validator) validateAdditionalCategories(ctx context.Context, client Client, categories []anywherev1.NutanixCategoryIdentifier) error { for _, category := range categories { if category.Key == "" { return fmt.Errorf("missing category key") } if category.Value == "" { return fmt.Errorf("missing category value") } if _, err := client.GetCategoryKey(ctx, category.Key); err != nil { return fmt.Errorf("failed to find category with key %q: %v", category.Key, err) } if _, err := client.GetCategoryValue(ctx, category.Key, category.Value); err != nil { return fmt.Errorf("failed to find category value %q for category %q: %v", category.Value, category.Key, err) } } return nil } func (v *Validator) validateGPUConfig(gpu anywherev1.NutanixGPUIdentifier) error { if gpu.Type == "" { return fmt.Errorf("missing GPU type") } if gpu.Type != anywherev1.NutanixGPUIdentifierDeviceID && gpu.Type != anywherev1.NutanixGPUIdentifierName { return fmt.Errorf("invalid GPU identifier type: %s; valid types are: %q and %q", gpu.Type, anywherev1.NutanixGPUIdentifierDeviceID, anywherev1.NutanixGPUIdentifierName) } if gpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { if gpu.DeviceID == nil { return fmt.Errorf("missing GPU device ID") } } else { if gpu.Name == "" { return fmt.Errorf("missing GPU name") } } return nil } func getRequestedGPUsForAllMachines(machineCount int, requestedGpus []anywherev1.NutanixGPUIdentifier) []anywherev1.NutanixGPUIdentifier { allMachinesRequestedGPUs := make([]anywherev1.NutanixGPUIdentifier, 0) for i := 0; i < machineCount; i++ { allMachinesRequestedGPUs = append(allMachinesRequestedGPUs, requestedGpus...) } return allMachinesRequestedGPUs } func (v *Validator) tryAssignGPUsToMachineConfig(machineCount int, requestedGpus []anywherev1.NutanixGPUIdentifier, clusterGpuList []v3.GPU, cluster anywherev1.NutanixResourceIdentifier) ([]v3.GPU, error) { allMachinesRequestedGPUs := getRequestedGPUsForAllMachines(machineCount, requestedGpus) for _, requestedGpu := range allMachinesRequestedGPUs { found := -1 for index, gpu := range clusterGpuList { if isRequestedGPUAssignable(gpu, requestedGpu) { found = index break } } if found == -1 { return nil, errorGPUNotFound(requestedGpu, cluster) } clusterGpuList = append(clusterGpuList[:found], clusterGpuList[found+1:]...) } return clusterGpuList, nil } func (v *Validator) isGPURequested(configs map[string]*anywherev1.NutanixMachineConfig) bool { for _, machineConfig := range configs { if machineConfig.Spec.GPUs != nil { return true } } return false } func (v *Validator) initAvailableGPUsMap(hosts []*v3.HostResponse) map[string][]v3.GPU { availableGpu := make(map[string][]v3.GPU) for _, host := range hosts { if host.Status != nil && host.Status.Resources != nil && host.Status.ClusterReference != nil && host.Status.ClusterReference.UUID != "" { clusterUUID := host.Status.ClusterReference.UUID availableGpu[clusterUUID] = make([]v3.GPU, 0) } } return availableGpu } func (v *Validator) getAvailableGPUs(hosts []*v3.HostResponse) (map[string][]v3.GPU, error) { availableGpu := v.initAvailableGPUsMap(hosts) for _, host := range hosts { if host.Status != nil && host.Status.Resources != nil && host.Status.Resources.GPUList != nil && host.Status.ClusterReference != nil && host.Status.ClusterReference.UUID != "" { clusterUUID := host.Status.ClusterReference.UUID for _, gpu := range host.Status.Resources.GPUList { availableGpu[clusterUUID] = append(availableGpu[clusterUUID], *gpu) } } } return availableGpu, nil } func (v *Validator) tryAssignGPUsToAllMachineConfigs(ctx context.Context, v3Client Client, cluster *cluster.Spec, availableGpu map[string][]v3.GPU) error { configs := cluster.NutanixMachineConfigs machineCount := v.getMachineCountForAllMachineConfigs(cluster) for _, machineConfig := range configs { clusterUUID, err := getClusterUUID(ctx, v3Client, machineConfig.Spec.Cluster) if err != nil { return err } if machineConfig.Spec.GPUs != nil { if _, ok := machineCount[machineConfig.Name]; ok { availableGpu[clusterUUID], err = v.tryAssignGPUsToMachineConfig(machineCount[machineConfig.Name], machineConfig.Spec.GPUs, availableGpu[clusterUUID], machineConfig.Spec.Cluster) if err != nil { return err } } } } return nil } func (v *Validator) getMachineCountForAllMachineConfigs(clusterSpec *cluster.Spec) map[string]int { machineCountMap := make(map[string]int) cluster := clusterSpec.Cluster.Spec if cluster.ControlPlaneConfiguration.MachineGroupRef.Kind == constants.NutanixMachineConfigKind { machineCountMap[cluster.ControlPlaneConfiguration.MachineGroupRef.Name] = cluster.ControlPlaneConfiguration.Count } if cluster.ExternalEtcdConfiguration != nil && cluster.ExternalEtcdConfiguration.MachineGroupRef.Kind == constants.NutanixMachineConfigKind { machineCountMap[cluster.ExternalEtcdConfiguration.MachineGroupRef.Name] = cluster.ExternalEtcdConfiguration.Count } for _, workerNodeGroupConfiguration := range cluster.WorkerNodeGroupConfigurations { if workerNodeGroupConfiguration.MachineGroupRef.Kind == constants.NutanixMachineConfigKind && workerNodeGroupConfiguration.Count != nil { machineCountMap[workerNodeGroupConfiguration.MachineGroupRef.Name] = *workerNodeGroupConfiguration.Count } } return machineCountMap } func (v *Validator) getGPUModeMapping(hosts []*v3.HostResponse) (map[int64]string, map[string]string, error) { gpuDeviceIDToMode := make(map[int64]string) gpuNameToMode := make(map[string]string) for _, host := range hosts { if host.Status != nil && host.Status.Resources != nil && host.Status.Resources.GPUList != nil { for _, gpu := range host.Status.Resources.GPUList { if gpu.DeviceID != nil { gpuDeviceIDToMode[*gpu.DeviceID] = gpu.Mode } if gpu.Name != "" { gpuNameToMode[gpu.Name] = gpu.Mode } } } } return gpuDeviceIDToMode, gpuNameToMode, nil } func (v *Validator) validateGPUModeNotMixed(hosts []*v3.HostResponse, cluster *cluster.Spec) error { configs := cluster.NutanixMachineConfigs gpuDeviceIDToMode, gpuNameToMode, err := v.getGPUModeMapping(hosts) if err != nil { return err } gpuMode := "" getGpuModeFunc := createGetGpuModeFunc(gpuDeviceIDToMode, gpuNameToMode) for _, machineConfig := range configs { if machineConfig.Spec.GPUs != nil { for _, gpu := range machineConfig.Spec.GPUs { if gpuMode == "" { gpuMode = getGpuModeFunc(gpu) } else { if gpuMode != getGpuModeFunc(gpu) { return fmt.Errorf("all GPUs in a machine config must be of the same mode, vGPU or passthrough") } } } } } return nil } func createGetGpuModeFunc(gpuDeviceIDToMode map[int64]string, gpuNameToMode map[string]string) func(gpu anywherev1.NutanixGPUIdentifier) string { return func(gpu anywherev1.NutanixGPUIdentifier) string { if gpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { return gpuDeviceIDToMode[*gpu.DeviceID] } return gpuNameToMode[gpu.Name] } } func (v *Validator) validateFreeGPU(ctx context.Context, v3Client Client, cluster *cluster.Spec) error { if v.isGPURequested(cluster.NutanixMachineConfigs) { res, err := v3Client.ListAllHost(ctx) if err != nil || len(res.Entities) == 0 { return fmt.Errorf("no GPUs found: %v", err) } err = v.validateGPUModeNotMixed(res.Entities, cluster) if err != nil { return err } availableGpu, err := v.getAvailableGPUs(res.Entities) if err != nil { return err } if err = v.tryAssignGPUsToAllMachineConfigs(ctx, v3Client, cluster, availableGpu); err != nil { return err } } return nil } func (v *Validator) validateUpgradeRolloutStrategy(clusterSpec *cluster.Spec) error { if clusterSpec.Cluster.Spec.ControlPlaneConfiguration.UpgradeRolloutStrategy != nil { return fmt.Errorf("upgrade rollout strategy customization is not supported for nutanix provider") } for _, workerNodeGroupConfiguration := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations { if workerNodeGroupConfiguration.UpgradeRolloutStrategy != nil { return fmt.Errorf("upgrade rollout strategy customization is not supported for nutanix provider") } } return nil } func checkMachineConfigIsForWorker(config *anywherev1.NutanixMachineConfig, cluster *anywherev1.Cluster) error { if config.Name == cluster.Spec.ControlPlaneConfiguration.MachineGroupRef.Name { return fmt.Errorf("GPUs are not supported for control plane machine") } if cluster.Spec.ExternalEtcdConfiguration != nil && config.Name == cluster.Spec.ExternalEtcdConfiguration.MachineGroupRef.Name { return fmt.Errorf("GPUs are not supported for external etcd machine") } for _, workerNodeGroupConfiguration := range cluster.Spec.WorkerNodeGroupConfigurations { if config.Name == workerNodeGroupConfiguration.MachineGroupRef.Name { return nil } } return fmt.Errorf("machine config %s is not associated with any worker node group", config.Name) } // findSubnetUUIDByName retrieves the subnet uuid by the given subnet name. func findSubnetUUIDByName(ctx context.Context, v3Client Client, clusterUUID, subnetName string) (*string, error) { res, err := v3Client.ListAllSubnet(ctx, "", nil) if err != nil { return nil, fmt.Errorf("failed to list subnets: %v", err) } subnets := make([]*v3.SubnetIntentResponse, 0) for _, subnet := range res.Entities { if subnet.Spec.ClusterReference != nil && *subnet.Spec.ClusterReference.UUID != "" { if *subnet.Spec.ClusterReference.UUID == clusterUUID && subnet.Spec.Name != nil && *subnet.Spec.Name == subnetName { subnets = append(subnets, subnet) } } } if len(subnets) == 0 { return nil, fmt.Errorf("failed to find subnet by name %q: %v", subnetName, err) } if len(subnets) > 1 { return nil, fmt.Errorf("found more than one (%v) subnet with name %q and cluster uuid %v", len(subnets), subnetName, clusterUUID) } return subnets[0].Metadata.UUID, nil } // getWorkerMachineGroups retrieves the worker machine group names from the cluster spec. func getWorkerMachineGroups(spec *cluster.Spec) map[string]anywherev1.WorkerNodeGroupConfiguration { result := make(map[string]anywherev1.WorkerNodeGroupConfiguration) for _, workerNodeGroupConf := range spec.Cluster.Spec.WorkerNodeGroupConfigurations { result[workerNodeGroupConf.MachineGroupRef.Name] = workerNodeGroupConf } return result } // getClusterUUID retrieves the cluster uuid by the given cluster identifier. func getClusterUUID(ctx context.Context, v3Client Client, cluster anywherev1.NutanixResourceIdentifier) (string, error) { var clusterUUID string var err error if cluster.Type == anywherev1.NutanixIdentifierUUID { if cluster.UUID == nil || *cluster.UUID == "" { return "", fmt.Errorf("missing cluster uuid") } clusterUUID = *cluster.UUID } if cluster.Type == anywherev1.NutanixIdentifierName { clusterName := *cluster.Name var uuid *string if uuid, err = findClusterUUIDByName(ctx, v3Client, clusterName); err != nil { return "", fmt.Errorf("failed to find cluster with name %q: %v", clusterName, err) } clusterUUID = *uuid } return clusterUUID, nil } // findClusterUUIDByName retrieves the cluster uuid by the given cluster name. func findClusterUUIDByName(ctx context.Context, v3Client Client, clusterName string) (*string, error) { res, err := v3Client.ListAllCluster(ctx, "") if err != nil { return nil, fmt.Errorf("failed to list clusters: %v", err) } entities := make([]*v3.ClusterIntentResponse, 0) for _, entity := range res.Entities { if entity.Status != nil && entity.Status.Resources != nil && entity.Status.Resources.Config != nil { serviceList := entity.Status.Resources.Config.ServiceList isPrismCentral := false for _, svc := range serviceList { // Prism Central is also internally a cluster, but we filter that out here as we only care about prism element clusters if svc != nil && strings.ToUpper(*svc) == "PRISM_CENTRAL" { isPrismCentral = true } } if !isPrismCentral && *entity.Spec.Name == clusterName { entities = append(entities, entity) } } } if len(entities) == 0 { return nil, fmt.Errorf("failed to find cluster by name %q: %v", clusterName, err) } if len(entities) > 1 { return nil, fmt.Errorf("found more than one (%v) cluster with name %q", len(entities), clusterName) } return entities[0].Metadata.UUID, nil } // findImageUUIDByName retrieves the image uuid by the given image name. func findImageUUIDByName(ctx context.Context, v3Client Client, imageName string) (*string, error) { res, err := v3Client.ListAllImage(ctx, "") if err != nil { return nil, fmt.Errorf("failed to list images: %v", err) } images := make([]*v3.ImageIntentResponse, 0) for _, image := range res.Entities { if image.Spec.Name != nil && *image.Spec.Name == imageName { images = append(images, image) } } if len(images) == 0 { return nil, fmt.Errorf("failed to find image by name %q: %v", imageName, err) } if len(images) > 1 { return nil, fmt.Errorf("found more than one (%v) image with name %q", len(images), imageName) } return images[0].Metadata.UUID, nil } // findProjectUUIDByName retrieves the project uuid by the given image name. func findProjectUUIDByName(ctx context.Context, v3Client Client, projectName string) (*string, error) { res, err := v3Client.ListAllProject(ctx, "") if err != nil { return nil, fmt.Errorf("failed to list projects: %v", err) } projects := make([]*v3.Project, 0) for _, project := range res.Entities { if project.Spec.Name == projectName { projects = append(projects, project) } } if len(projects) == 0 { return nil, fmt.Errorf("failed to find project by name %q: %v", projectName, err) } if len(projects) > 1 { return nil, fmt.Errorf("found more than one (%v) project with name %q", len(res.Entities), projectName) } return projects[0].Metadata.UUID, nil } func isRequestedGPUAssignable(gpu v3.GPU, requestedGpu anywherev1.NutanixGPUIdentifier) bool { if requestedGpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { return (*gpu.DeviceID == *requestedGpu.DeviceID) && gpu.Assignable } return (gpu.Name == requestedGpu.Name) && gpu.Assignable } func errorGPUNotFound(gpu anywherev1.NutanixGPUIdentifier, cluster anywherev1.NutanixResourceIdentifier) error { clusterAddonString := "" if cluster.Type == anywherev1.NutanixIdentifierUUID { if cluster.UUID != nil { clusterAddonString = fmt.Sprintf("on cluster with UUID %s", *cluster.UUID) } } else { if cluster.Name != nil { clusterAddonString = fmt.Sprintf("on cluster with name %s", *cluster.Name) } } if gpu.Type == anywherev1.NutanixGPUIdentifierDeviceID { return fmt.Errorf("GPU with device ID %d not found %s", *gpu.DeviceID, clusterAddonString) } return fmt.Errorf("GPU with name %s not found %s", gpu.Name, clusterAddonString) }