calculator/calculator.go

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package calculator

import (
    "context"
    "fmt"
    "log"
    "math"
    "strconv"
    "strings"

    "github.com/GoogleCloudPlatform/autopilot-cost-calculator/cluster"
    "gopkg.in/ini.v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
)

// CLUSTER_FEE is the flat hourly GKE cluster management fee in USD.
const CLUSTER_FEE = 0.1

// PricingService resolves GKE Autopilot and GCE prices for cluster workloads.
type PricingService struct {
    AutopilotPricing AutopilotPriceList
    GCEPricing       GCEPriceList
    Config           *ini.File

    clientset        *kubernetes.Clientset
    metricsClientset *metricsv.Clientset
}

// NewService fetches the Autopilot and GCE price lists for the given region
// and returns a PricingService wired to the provided Kubernetes clients.
func NewService(sku map[string]string, region string, clientset *kubernetes.Clientset, metricsClientset *metricsv.Clientset, config *ini.File) (*PricingService, error) {
    apPricing, err := GetAutopilotPricing(sku["autopilot"], region)
    if err != nil {
        return nil, err
    }

    gcePricing, err := GetGCEPricing(sku["gce"], region)
    if err != nil {
        return nil, err
    }

    service := &PricingService{
        AutopilotPricing: apPricing,
        GCEPricing:       gcePricing,
        clientset:        clientset,
        metricsClientset: metricsClientset,
        Config:           config,
    }

    return service, nil
}
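
// A minimal construction sketch (the SKU values, client variables, and config
// below are hypothetical stand-ins; the map keys "autopilot" and "gce" match
// the lookups in NewService above):
//
//	skus := map[string]string{"autopilot": autopilotSKU, "gce": gceSKU}
//	svc, err := NewService(skus, "us-central1", clientset, metricsClientset, cfg)
//	if err != nil {
//		log.Fatal(err)
//	}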

// CalculatePricing returns the price of a workload given its resource
// footprint (cpu in mCPU, memory and storage in MB), its compute class, and
// whether it runs on Spot capacity.
func (service *PricingService) CalculatePricing(cpu int64, memory int64, storage int64, gpu int64, gpuModel string, class cluster.ComputeClass, instanceType string, spot bool) float64 {
    // If spot, calculations are done based on spot pricing
    if spot {
        switch class {
        case cluster.ComputeClassPerformance:
            perfPrice := service.AutopilotPricing.SpotPerformanceCpuPricePremium*float64(cpu)/1000 + service.AutopilotPricing.SpotPerformanceMemoryPricePremium*float64(memory)/1000 + service.AutopilotPricing.SpotPerformanceLocalSSDPricePremium*float64(storage)/1000

            if perfPrice == 0 {
                log.Printf("Requested Spot Performance (%s) pricing is not available in %s region.", instanceType, service.AutopilotPricing.Region)
            }

            gcePrice, _ := service.GetGCEMachinePrice(instanceType, spot)

            return perfPrice + gcePrice
        case cluster.ComputeClassAccelerator:
            // TODO lookup machine type and add to the price
            acceleratorPrice := service.AutopilotPricing.SpotAcceleratorCpuPricePremium*float64(cpu)/1000 + service.AutopilotPricing.SpotAcceleratorMemoryGPUPricePremium*float64(memory)/1000 + service.AutopilotPricing.AcceleratorLocalSSDPricePremium*float64(storage)/1000

            switch gpuModel {
            case "nvidia-tesla-t4":
                acceleratorPrice += service.AutopilotPricing.SpotAcceleratorT4GPUPricePremium * float64(gpu)
            case "nvidia-l4":
                acceleratorPrice += service.AutopilotPricing.SpotAcceleratorL4GPUPricePremium * float64(gpu)
            case "nvidia-tesla-a100":
                acceleratorPrice += service.AutopilotPricing.SpotAcceleratorA10040GGPUPricePremium * float64(gpu)
            case "nvidia-a100-80gb":
                acceleratorPrice += service.AutopilotPricing.SpotAcceleratorA10080GGPUPricePremium * float64(gpu)
            case "nvidia-h100-80gb":
                acceleratorPrice += service.AutopilotPricing.SpotAcceleratorH100GPUPricePremium * float64(gpu)
            default:
                acceleratorPrice = 0
                log.Printf("Requested Spot GPU (%s) pricing for Accelerator compute class (%s) is not available in %s region.", gpuModel, instanceType, service.AutopilotPricing.Region)
            }

            gcePrice, _ := service.GetGCEMachinePrice(instanceType, spot)

            return acceleratorPrice + gcePrice
        case cluster.ComputeClassGPUPod:
            acceleratorPrice := service.AutopilotPricing.SpotGPUPodvCPUPrice*float64(cpu)/1000 + service.AutopilotPricing.SpotGPUPodMemoryPrice*float64(memory)/1000 + service.AutopilotPricing.SpotGPUPodLocalSSDPrice*float64(storage)/1000

            switch gpuModel {
            case "nvidia-tesla-t4":
                acceleratorPrice += service.AutopilotPricing.SpotNVIDIAT4PodGPUPrice * float64(gpu)
            case "nvidia-l4":
                acceleratorPrice += service.AutopilotPricing.SpotNVIDIAL4PodGPUPrice * float64(gpu)
            case "nvidia-tesla-a100":
                acceleratorPrice += service.AutopilotPricing.SpotNVIDIAA10040GPodGPUPrice * float64(gpu)
            case "nvidia-a100-80gb":
                acceleratorPrice += service.AutopilotPricing.SpotNVIDIAA10080GPodGPUPrice * float64(gpu)
            default:
                acceleratorPrice = 0
                log.Printf("Requested Spot GPU (%s) pricing is not available in %s region.", gpuModel, service.AutopilotPricing.Region)
            }

            return acceleratorPrice
        case cluster.ComputeClassBalanced:
            return service.AutopilotPricing.SpotCpuPrice*float64(cpu)/1000 + service.AutopilotPricing.SpotMemoryPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000
        case cluster.ComputeClassScaleout:
            return service.AutopilotPricing.SpotCpuScaleoutPrice*float64(cpu)/1000 + service.AutopilotPricing.SpotMemoryScaleoutPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000
        case cluster.ComputeClassScaleoutArm:
            armPrice := service.AutopilotPricing.SpotArmCpuScaleoutPrice*float64(cpu)/1000 + service.AutopilotPricing.SpotArmMemoryScaleoutPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000

            if armPrice == 0 {
                log.Printf("Requested Spot ARM (%s) pricing is not available in %s region.", instanceType, service.AutopilotPricing.Region)
            }

            return armPrice
        default:
            return service.AutopilotPricing.SpotCpuPrice*float64(cpu)/1000 + service.AutopilotPricing.SpotMemoryPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000
        }
    }

    switch class {
    case cluster.ComputeClassPerformance:
        perfPrice := service.AutopilotPricing.PerformanceCpuPricePremium*float64(cpu)/1000 + service.AutopilotPricing.PerformanceMemoryPricePremium*float64(memory)/1000 + service.AutopilotPricing.PerformanceLocalSSDPricePremium*float64(storage)/1000

        if perfPrice == 0 {
            log.Printf("Requested Performance (%s) pricing is not available in %s region.", instanceType, service.AutopilotPricing.Region)
        }

        gcePrice, _ := service.GetGCEMachinePrice(instanceType, spot)

        return perfPrice + gcePrice
    case cluster.ComputeClassAccelerator:
        acceleratorPrice := service.AutopilotPricing.AcceleratorCpuPricePremium*float64(cpu)/1000 + service.AutopilotPricing.AcceleratorMemoryGPUPricePremium*float64(memory)/1000 + service.AutopilotPricing.AcceleratorLocalSSDPricePremium*float64(storage)/1000

        switch gpuModel {
        case "nvidia-tesla-t4":
            acceleratorPrice += service.AutopilotPricing.AcceleratorT4GPUPricePremium * float64(gpu)
        case "nvidia-l4":
            acceleratorPrice += service.AutopilotPricing.AcceleratorL4GPUPricePremium * float64(gpu)
        case "nvidia-tesla-a100":
            acceleratorPrice += service.AutopilotPricing.AcceleratorA10040GGPUPricePremium * float64(gpu)
        case "nvidia-a100-80gb":
            acceleratorPrice += service.AutopilotPricing.AcceleratorA10080GGPUPricePremium * float64(gpu)
        case "nvidia-h100-80gb":
            acceleratorPrice += service.AutopilotPricing.AcceleratorH100GPUPricePremium * float64(gpu)
        default:
            acceleratorPrice = 0
            log.Printf("Requested GPU (%s) pricing for Accelerator compute class (%s) is not available in %s region.", gpuModel, instanceType, service.AutopilotPricing.Region)
        }

        gcePrice, _ := service.GetGCEMachinePrice(instanceType, spot)

        return acceleratorPrice + gcePrice
    case cluster.ComputeClassGPUPod:
        acceleratorPrice := service.AutopilotPricing.GPUPodvCPUPrice*float64(cpu)/1000 + service.AutopilotPricing.GPUPodMemoryPrice*float64(memory)/1000 + service.AutopilotPricing.GPUPodLocalSSDPrice*float64(storage)/1000

        switch gpuModel {
        case "nvidia-tesla-t4":
            acceleratorPrice += service.AutopilotPricing.NVIDIAT4PodGPUPrice * float64(gpu)
        case "nvidia-l4":
            acceleratorPrice += service.AutopilotPricing.NVIDIAL4PodGPUPrice * float64(gpu)
        case "nvidia-tesla-a100":
            acceleratorPrice += service.AutopilotPricing.NVIDIAA10040GPodGPUPrice * float64(gpu)
        case "nvidia-a100-80gb":
            acceleratorPrice += service.AutopilotPricing.NVIDIAA10080GPodGPUPrice * float64(gpu)
        default:
            acceleratorPrice = 0
            log.Printf("Requested GPU (%s) pricing is not available in %s region.", gpuModel, service.AutopilotPricing.Region)
        }

        return acceleratorPrice
    case cluster.ComputeClassBalanced:
        return service.AutopilotPricing.CpuBalancedPrice*float64(cpu)/1000 + service.AutopilotPricing.MemoryBalancedPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000
    case cluster.ComputeClassScaleout:
        return service.AutopilotPricing.CpuScaleoutPrice*float64(cpu)/1000 + service.AutopilotPricing.MemoryScaleoutPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000
    case cluster.ComputeClassScaleoutArm:
        armPrice := service.AutopilotPricing.CpuArmScaleoutPrice*float64(cpu)/1000 + service.AutopilotPricing.MemoryArmScaleoutPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000

        if armPrice == 0 {
            log.Printf("Requested ARM (%s) pricing is not available in %s region.", instanceType, service.AutopilotPricing.Region)
        }

        return armPrice
    default:
        return service.AutopilotPricing.CpuPrice*float64(cpu)/1000 + service.AutopilotPricing.MemoryPrice*float64(memory)/1000 + service.AutopilotPricing.StoragePrice*float64(storage)/1000
    }
}
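
// Worked example of the unit handling above (illustrative numbers, not real
// SKU rates): cpu arrives in mCPU and memory/storage in MB, so each term is
// divided by 1000 to match the per-vCPU and per-GB list prices. For an
// on-demand general-purpose Pod requesting 500 mCPU, 2048 MB of memory, and
// 1024 MB of ephemeral storage:
//
//	// price = CpuPrice*0.5 + MemoryPrice*2.048 + StoragePrice*1.024
//	price := svc.CalculatePricing(500, 2048, 1024, 0, "", cluster.ComputeClassGeneralPurpose, "e2-standard-4", false)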

// GetGCEMachinePrice derives the on-demand or Spot price of a GCE machine
// type by parsing its name into family, class, and vCPU count.
func (service *PricingService) GetGCEMachinePrice(instanceType string, spot bool) (float64, error) {
    instanceInfo := strings.Split(instanceType, "-")
    if len(instanceInfo) < 3 {
        return 0, fmt.Errorf("unable to parse instance type %q", instanceType)
    }

    cpus, _ := strconv.Atoi(instanceInfo[2])
    ram := 0.0
    classType := instanceInfo[1]
    machineType := instanceInfo[0]

    // RAM is approximated from the class's GB-per-vCPU ratio.
    switch classType {
    case "standard":
        ram = float64(cpus) * 4
    case "highcpu":
        ram = float64(cpus) * 2
    case "highmem":
        ram = float64(cpus) * 8
    case "highgpu":
        ram = float64(cpus) * 7.0833
    case "ultragpu":
        ram = float64(cpus) * 14.1666
    }
    ram = math.Ceil(ram)

    fmt.Printf("Parsing %s - %d %f %s %s\n", instanceType, cpus, ram, machineType, classType)

    if spot {
        switch machineType {
        case "a2":
            return service.GCEPricing.SpotA2CpuPrice*float64(cpus) + service.GCEPricing.SpotA2MemoryPrice*ram, nil
        case "a3":
            return service.GCEPricing.SpotA3CpuPrice*float64(cpus) + service.GCEPricing.SpotA3MemoryPrice*ram, nil
        case "g2":
            return service.GCEPricing.SpotG2DCpuPrice*float64(cpus) + service.GCEPricing.SpotG2DMemoryPrice*ram, nil
        case "h3":
            fmt.Printf("H3 machine type is not available in Preemptible/Spot format. Defaulting to the regular price.\n")
            return service.GCEPricing.H3CpuPrice*float64(cpus) + service.GCEPricing.H3MemoryPrice*ram, nil
        case "c2":
            return service.GCEPricing.SpotC2CpuPrice*float64(cpus) + service.GCEPricing.SpotC2MemoryPrice*ram, nil
        case "c2d":
            return service.GCEPricing.SpotC2DCpuPrice*float64(cpus) + service.GCEPricing.SpotC2DMemoryPrice*ram, nil
        default:
            fmt.Printf("GCE machine type %s is not implemented for price querying. The only supported ones are A2, A3, G2, H3, C2 and C2D.\n", instanceType)
        }

        return 0, nil
    }

    switch machineType {
    case "a2":
        return service.GCEPricing.A2CpuPrice*float64(cpus) + service.GCEPricing.A2MemoryPrice*ram, nil
    case "a3":
        return service.GCEPricing.A3CpuPrice*float64(cpus) + service.GCEPricing.A3MemoryPrice*ram, nil
    case "g2":
        return service.GCEPricing.G2CpuPrice*float64(cpus) + service.GCEPricing.G2MemoryPrice*ram, nil
    case "h3":
        return service.GCEPricing.H3CpuPrice*float64(cpus) + service.GCEPricing.H3MemoryPrice*ram, nil
    case "c2":
        return service.GCEPricing.C2CpuPrice*float64(cpus) + service.GCEPricing.C2MemoryPrice*ram, nil
    case "c2d":
        return service.GCEPricing.C2DCpuPrice*float64(cpus) + service.GCEPricing.C2DMemoryPrice*ram, nil
    default:
        fmt.Printf("GCE machine type %s is not implemented for price querying. The only supported ones are A2, A3, G2, H3, C2 and C2D.\n", instanceType)
    }

    return 0, nil
}
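
// Parsing sketch for GetGCEMachinePrice (illustrative values): a name such as
// "c2-standard-8" splits into family "c2", class "standard", and 8 vCPUs; RAM
// is derived from the class ratio (4 GB/vCPU for "standard", so 32 GB), and
// the per-resource prices are summed.
//
//	price, _ := svc.GetGCEMachinePrice("c2-standard-8", false)
//	// price == C2CpuPrice*8 + C2MemoryPrice*32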

// PopulateWorkloads lists pod metrics outside the system namespaces, prices
// each pod at the larger of its measured usage and its resource requests
// (which is how Autopilot bills), and attaches the resulting workloads and
// their costs to the given nodes in place.
func (service *PricingService) PopulateWorkloads(nodes map[string]cluster.Node) ([]cluster.Workload, error) {
    var workloads []cluster.Workload

    podMetricsList, err := service.metricsClientset.MetricsV1beta1().PodMetricses("").List(context.TODO(), metav1.ListOptions{FieldSelector: "metadata.namespace!=kube-system,metadata.namespace!=gke-gmp-system,metadata.namespace!=gmp-system"})
    if err != nil {
        log.Fatal(err)
    }

    for _, v := range podMetricsList.Items {
        pod, err := cluster.DescribePod(service.clientset, v.Name, v.Namespace)
        if err != nil {
            return nil, err
        }

        var cpu int64 = 0
        var memory int64 = 0
        var storage int64 = 0
        var gpu int64 = 0
        podContainerCount := 0

        gpuModel := pod.Spec.NodeSelector["cloud.google.com/gke-accelerator"]

        // Sum used resources from the Pod
        for _, container := range v.Containers {
            cpuUsage := container.Usage.Cpu().MilliValue()
            memoryUsage := container.Usage.Memory().MilliValue() / 1000000000     // Division to get MB
            storageUsage := container.Usage.StorageEphemeral().MilliValue() / 1000000000 // Division to get MB
            gpuUsage := int64(0)

            for _, specContainer := range pod.Spec.Containers {
                if container.Name == specContainer.Name {
                    cpuRequest := specContainer.Resources.Requests[corev1.ResourceCPU]
                    memoryRequest := specContainer.Resources.Requests[corev1.ResourceMemory]
                    storageRequest := specContainer.Resources.Requests[corev1.ResourceStorage]
                    gpuRequests := specContainer.Resources.Requests["nvidia.com/gpu"]

                    // If usage is less than requests, we bill the request instead, since that is how Autopilot billing works.
                    if cpuUsage < cpuRequest.MilliValue() {
                        cpuUsage = cpuRequest.MilliValue()
                    }

                    if memoryUsage < memoryRequest.MilliValue()/1000000000 {
                        memoryUsage = memoryRequest.MilliValue() / 1000000000
                    }

                    if storageUsage < storageRequest.MilliValue()/1000000000 {
                        storageUsage = storageRequest.MilliValue() / 1000000000
                    }

                    gpuUsage = gpuRequests.Value()
                }
            }

            cpu += cpuUsage
            memory += memoryUsage
            storage += storageUsage
            gpu += gpuUsage
            podContainerCount++
        }

        // Check and modify the limits of summed workloads from the Pod
        cpu, memory, storage = ValidateAndRoundResources(cpu, memory, storage)

        computeClass := service.DecideComputeClass(
            v.Name,
            nodes[pod.Spec.NodeName].InstanceType,
            cpu,
            memory,
            gpu,
            gpuModel,
            strings.Contains(nodes[pod.Spec.NodeName].InstanceType, service.Config.Section("").Key("gce_arm64_prefix").String()),
        )

        cost := service.CalculatePricing(cpu, memory, storage, gpu, gpuModel, computeClass, nodes[pod.Spec.NodeName].InstanceType, nodes[pod.Spec.NodeName].Spot)

        workloadObject := cluster.Workload{
            Name:              v.Name,
            Containers:        podContainerCount,
            Node_name:         pod.Spec.NodeName,
            Cpu:               cpu,
            Memory:            memory,
            Storage:           storage,
            AcceleratorType:   gpuModel,
            AcceleratorAmount: gpu,
            Cost:              cost,
            ComputeClass:      computeClass,
        }

        workloads = append(workloads, workloadObject)

        if entry, ok := nodes[pod.Spec.NodeName]; ok {
            entry.Workloads = append(entry.Workloads, workloadObject)
            entry.Cost += cost
            nodes[pod.Spec.NodeName] = entry
        }
    }

    return workloads, nil
}
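
// Usage sketch (the `nodes` map is assumed to come from a node-inventory
// helper elsewhere in this repository): PopulateWorkloads both returns the
// priced workloads and mutates the node entries with their accumulated cost.
//
//	workloads, err := svc.PopulateWorkloads(nodes)
//	if err != nil {
//		log.Fatal(err)
//	}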

// DecideComputeClass picks the fitting Autopilot compute class for a workload
// from its memory:mCPU ratio, the configured per-class limits, and any GPU or
// machine-type constraints.
func (service *PricingService) DecideComputeClass(workloadName string, machineType string, mCPU int64, memory int64, gpu int64, gpuModel string, arm64 bool) cluster.ComputeClass {
    ratio := math.Ceil(float64(memory) / float64(mCPU))

    ratioRegularMin, _ := service.Config.Section("ratios").Key("generalpurpose_min").Float64()
    ratioRegularMax, _ := service.Config.Section("ratios").Key("generalpurpose_max").Float64()
    ratioBalancedMin, _ := service.Config.Section("ratios").Key("balanced_min").Float64()
    ratioBalancedMax, _ := service.Config.Section("ratios").Key("balanced_max").Float64()
    ratioScaleoutMin, _ := service.Config.Section("ratios").Key("scaleout_min").Float64()
    ratioScaleoutMax, _ := service.Config.Section("ratios").Key("scaleout_max").Float64()
    ratioPerformanceMin, _ := service.Config.Section("ratios").Key("performance_min").Float64()
    ratioPerformanceMax, _ := service.Config.Section("ratios").Key("performance_max").Float64()

    scaleoutMcpuMax, _ := service.Config.Section("limits").Key("scaleout_mcpu_max").Int64()
    scaleoutMemoryMax, _ := service.Config.Section("limits").Key("scaleout_memory_max").Int64()
    scaleoutArmMcpuMax, _ := service.Config.Section("limits").Key("scaleout_arm_mcpu_max").Int64()
    scaleoutArmMemoryMax, _ := service.Config.Section("limits").Key("scaleout_arm_memory_max").Int64()
    regularMcpuMax, _ := service.Config.Section("limits").Key("generalpurpose_mcpu_max").Int64()
    regularMemoryMax, _ := service.Config.Section("limits").Key("generalpurpose_memory_max").Int64()
    balancedMcpuMax, _ := service.Config.Section("limits").Key("balanced_mcpu_max").Int64()
    balancedMemoryMax, _ := service.Config.Section("limits").Key("balanced_memory_max").Int64()
    performanceMcpuMax, _ := service.Config.Section("limits").Key("performance_mcpu_max").Int64()
    performanceMemoryMax, _ := service.Config.Section("limits").Key("performance_memory_max").Int64()

    gpupodT4McpuMin, _ := service.Config.Section("limits").Key("gpupod_t4_mcpu_min").Int64()
    gpupodT4McpuMax, _ := service.Config.Section("limits").Key("gpupod_t4_mcpu_max").Int64()
    gpupodT4MemoryMin, _ := service.Config.Section("limits").Key("gpupod_t4_memory_min").Int64()
    gpupodT4MemoryMax, _ := service.Config.Section("limits").Key("gpupod_t4_memory_max").Int64()
    gpupodL4McpuMin, _ := service.Config.Section("limits").Key("gpupod_l4_mcpu_min").Int64()
    gpupodL4McpuMax, _ := service.Config.Section("limits").Key("gpupod_l4_mcpu_max").Int64()
    gpupodL4MemoryMin, _ := service.Config.Section("limits").Key("gpupod_l4_memory_min").Int64()
    gpupodL4MemoryMax, _ := service.Config.Section("limits").Key("gpupod_l4_memory_max").Int64()
    gpupodA10040McpuMin, _ := service.Config.Section("limits").Key("gpupod_a100_40_mcpu_min").Int64()
    gpupodA10040McpuMax, _ := service.Config.Section("limits").Key("gpupod_a100_40_mcpu_max").Int64()
    gpupodA10040MemoryMin, _ := service.Config.Section("limits").Key("gpupod_a100_40_memory_min").Int64()
    gpupodA10040MemoryMax, _ := service.Config.Section("limits").Key("gpupod_a100_40_memory_max").Int64()
    gpupodA10080McpuMin, _ := service.Config.Section("limits").Key("gpupod_a100_80_mcpu_min").Int64()
    gpupodA10080McpuMax, _ := service.Config.Section("limits").Key("gpupod_a100_80_mcpu_max").Int64()
    gpupodA10080MemoryMin, _ := service.Config.Section("limits").Key("gpupod_a100_80_memory_min").Int64()
    gpupodA10080MemoryMax, _ := service.Config.Section("limits").Key("gpupod_a100_80_memory_max").Int64()

    acceleratorMcpuMin, _ := service.Config.Section("limits").Key("accelerator_mcpu_min").Int64()
    acceleratorMemoryMin, _ := service.Config.Section("limits").Key("accelerator_memory_min").Int64()
    acceleratorH10080McpuMax, _ := service.Config.Section("limits").Key("accelerator_h100_80_mcpu_max").Int64()
    acceleratorH10080MemoryMax, _ := service.Config.Section("limits").Key("accelerator_h100_80_memory_max").Int64()

    computeOptimizedMachineTypes := strings.Split(service.Config.Section("").Key("gce_compute_optimized_prefixed").String(), ",")
    for _, computeOptimizedMachineType := range computeOptimizedMachineTypes {
        if strings.Contains(machineType, computeOptimizedMachineType) {
            return cluster.ComputeClassPerformance
        }
    }

    // If the GPU is an H100, return the Performance compute class after validating the workload against its limits.
    if gpuModel == service.Config.Section("").Key("nvidia_h100_identifier").String() {
        if ratio < ratioPerformanceMin || ratio > ratioPerformanceMax || mCPU > performanceMcpuMax || memory > performanceMemoryMax {
            log.Printf("Requested memory or CPU out of acceptable range for Performance compute class (%s) workload (%s).\n", machineType, workloadName)
        }

        return cluster.ComputeClassPerformance
    }

    acceleratorOptimizedMachineTypes := strings.Split(service.Config.Section("").Key("gce_accelerator_optimized_prefixed").String(), ",")
    for _, acceleratorOptimizedMachineType := range acceleratorOptimizedMachineTypes {
        if strings.Contains(machineType, acceleratorOptimizedMachineType) {
            switch gpuModel {
            case "nvidia-tesla-t4":
                if mCPU > gpupodT4McpuMax || mCPU < acceleratorMcpuMin || memory > gpupodT4MemoryMax || memory < acceleratorMemoryMin {
                    log.Printf("Requested memory or CPU out of acceptable range for %s Accelerator compute class (%s) workload (%s).\n", machineType, gpuModel, workloadName)
                }
            case "nvidia-l4":
                if mCPU > gpupodL4McpuMax || mCPU < acceleratorMcpuMin || memory > gpupodL4MemoryMax || memory < acceleratorMemoryMin {
                    log.Printf("Requested memory or CPU out of acceptable range for %s Accelerator compute class (%s) workload (%s).\n", machineType, gpuModel, workloadName)
                }
            case "nvidia-tesla-a100":
                if mCPU > gpupodA10040McpuMax || mCPU < acceleratorMcpuMin || memory > gpupodA10040MemoryMax || memory < acceleratorMemoryMin {
                    log.Printf("Requested memory or CPU out of acceptable range for %s Accelerator compute class (%s) workload (%s).\n", machineType, gpuModel, workloadName)
                }
            case "nvidia-a100-80gb":
                if mCPU > gpupodA10080McpuMax || mCPU < acceleratorMcpuMin || memory > gpupodA10080MemoryMax || memory < acceleratorMemoryMin {
                    log.Printf("Requested memory or CPU out of acceptable range for %s Accelerator compute class (%s) workload (%s).\n", machineType, gpuModel, workloadName)
                }
            case "nvidia-h100-80gb":
                if mCPU > acceleratorH10080McpuMax || mCPU < acceleratorMcpuMin || memory > acceleratorH10080MemoryMax || memory < acceleratorMemoryMin {
                    log.Printf("Requested memory or CPU out of acceptable range for %s Accelerator compute class (%s) workload (%s).\n", machineType, gpuModel, workloadName)
                }
            }

            return cluster.ComputeClassAccelerator
        }
    }

    // Not an Accelerator-class workload and not an H100, so a regular GPU Pod type applies.
    if gpu > 0 {
        switch gpuModel {
        case "nvidia-tesla-t4":
            if mCPU > gpupodT4McpuMax || mCPU < gpupodT4McpuMin || memory > gpupodT4MemoryMax || memory < gpupodT4MemoryMin {
                log.Printf("Requested memory or CPU out of acceptable range for %s GPU workload (%s).\n", gpuModel, workloadName)
            }
        case "nvidia-l4":
            if mCPU > gpupodL4McpuMax || mCPU < gpupodL4McpuMin || memory > gpupodL4MemoryMax || memory < gpupodL4MemoryMin {
                log.Printf("Requested memory or CPU out of acceptable range for %s GPU workload (%s).\n", gpuModel, workloadName)
            }
        case "nvidia-tesla-a100":
            if mCPU > gpupodA10040McpuMax || mCPU < gpupodA10040McpuMin || memory > gpupodA10040MemoryMax || memory < gpupodA10040MemoryMin {
                log.Printf("Requested memory or CPU out of acceptable range for %s GPU workload (%s).\n", gpuModel, workloadName)
            }
        case "nvidia-a100-80gb":
            if mCPU > gpupodA10080McpuMax || mCPU < gpupodA10080McpuMin || memory > gpupodA10080MemoryMax || memory < gpupodA10080MemoryMin {
                log.Printf("Requested memory or CPU out of acceptable range for %s GPU workload (%s).\n", gpuModel, workloadName)
            }
        }

        return cluster.ComputeClassGPUPod
    }

    // ARM64 is still experimental. For T2A machines, default to the Scale-Out
    // compute class, since it's the only one supporting them.
    if arm64 {
        if ratio < ratioScaleoutMin || ratio > ratioScaleoutMax || mCPU > scaleoutArmMcpuMax || memory > scaleoutArmMemoryMax {
            log.Printf("Requesting arm64, but the requested mCPU (%d), memory (%d), or their ratio is out of the accepted range for workload %s.\n", mCPU, memory, workloadName)
        }

        return cluster.ComputeClassScaleoutArm
    }

    if ratio >= ratioRegularMin && ratio <= ratioRegularMax && mCPU <= regularMcpuMax && memory <= regularMemoryMax {
        return cluster.ComputeClassGeneralPurpose
    }

    // If we are out of the Regular range, suggest Scale-Out
    if ratio >= ratioScaleoutMin && ratio <= ratioScaleoutMax && mCPU <= scaleoutMcpuMax && memory <= scaleoutMemoryMax {
        return cluster.ComputeClassScaleout
    }

    // If usage is more than the general-purpose limits, default to Balanced
    if ratio >= ratioBalancedMin && ratio <= ratioBalancedMax && mCPU <= balancedMcpuMax && memory <= balancedMemoryMax {
        return cluster.ComputeClassBalanced
    }

    log.Printf("Couldn't find a matching compute class for %s. Defaulting to 'General-purpose'. Please check the pricing manually.\n", workloadName)

    return cluster.ComputeClassGeneralPurpose
}
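
// Decision sketch (hypothetical ini ranges): a workload requesting 1000 mCPU
// and 4096 MB has ratio ceil(4096/1000) = 5; if that falls inside the
// configured general-purpose band and no machine-type or GPU rule fires
// first, the general-purpose class is returned.
//
//	class := svc.DecideComputeClass("my-workload", "e2-standard-4", 1000, 4096, 0, "", false)
//	// class == cluster.ComputeClassGeneralPurpose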

// TODO: implement ini file minimums

// ValidateAndRoundResources clamps the summed Pod resources to the Autopilot
// minimums and rounds the mCPU request up to the nearest 50 mCPU step.
func ValidateAndRoundResources(mCPU int64, memory int64, storage int64) (int64, int64, int64) {
    // Lowest possible mCPU request; this differs for DaemonSets, which are not yet implemented.
    if mCPU < 50 {
        mCPU = 50
    }

    // Minimum memory request; it is 1G for Scale-Out, which we don't yet account for.
    if memory < 52 {
        memory = 52
    }

    if storage < 10 {
        storage = 10
    }

    mCPUMissing := 50 - (mCPU % 50)
    if mCPUMissing == 50 {
        // Already on a 50 mCPU step; return the original values.
        return mCPU, memory, storage
    }

    // Add the missing amount to reach the nearest 50 mCPU step.
    mCPU += mCPUMissing

    return mCPU, memory, storage
}
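
// Worked example of the rounding above: 120 mCPU is above the 50 mCPU floor,
// so 50-(120%50) = 30 mCPU is added to reach the next 50 mCPU step, giving
// 150 mCPU; memory and storage are only clamped to their minimums.
//
//	cpu, mem, sto := ValidateAndRoundResources(120, 40, 0)
//	// cpu == 150, mem == 52, sto == 10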