pkg/noderesourcetopology/least_numa.go (167 lines of code) (raw):
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesourcetopology
import (
v1 "k8s.io/api/core/v1"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/scheduler/framework"
"github.com/go-logr/logr"
topologyv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
"gonum.org/v1/gonum/stat/combin"
"sigs.k8s.io/scheduler-plugins/pkg/util"
)
const (
// maxDistanceValue is the upper bound used for NUMA distances in this file.
// 255 is max value as defined by ACPI SLIT (System Locality Information Table), which means unknown/undefined.
// It doubles as the fallback cost whenever a distance cannot be read from a NUMA node's Costs,
// and as the starting point when searching for a minimum average distance.
maxDistanceValue = 255
)
// leastNUMAContainerScopeScore scores a node by looking at every container of the pod on its own.
// For each container (init containers first, then regular containers) it determines the minimal
// set of NUMA nodes able to hold the container's requests and then subtracts those requests from
// the chosen NUMA nodes so later containers see the remaining capacity.
//
// It returns:
//   - framework.MaxNodeScore when no container requests NUMA-affine resources,
//   - framework.MinNodeScore when some container cannot be fitted onto the NUMA nodes,
//   - otherwise the normalized score for the largest NUMA node count needed by any container;
//     the distance bonus is granted only if every container got a minimal-average-distance set.
//
// The returned status is always nil.
func leastNUMAContainerScopeScore(lh logr.Logger, pod *v1.Pod, zones topologyv1alpha2.ZoneList) (int64, *framework.Status) {
	nodes := createNUMANodeList(lh, zones)
	qos := v1qos.GetPodQOS(pod)

	maxNUMANodesCount := 0
	allContainersMinAvgDistance := true

	// the order how TopologyManager asks for hint is important so doing it in the same order
	// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/cm/topologymanager/scope_container.go#L52
	//
	// The two slices are walked one after another instead of using
	// append(pod.Spec.InitContainers, pod.Spec.Containers...): when InitContainers has spare
	// capacity, append writes into the pod's own backing array, mutating an object that is
	// shared with other readers of the same pod.
	for _, containers := range [][]v1.Container{pod.Spec.InitContainers, pod.Spec.Containers} {
		for _, container := range containers {
			// if a container requests only non NUMA just continue
			if onlyNonNUMAResources(nodes, container.Resources.Requests) {
				continue
			}

			numaNodes, isMinAvgDistance := numaNodesRequired(lh, qos, nodes, container.Resources.Requests)
			// container's resources can't fit onto node, return MinNodeScore for whole pod
			if numaNodes == nil {
				// score plugin should be running after resource filter plugin so we should always find sufficient amount of NUMA nodes
				lh.Info("cannot calculate how many NUMA nodes are required", "container", container.Name)
				return framework.MinNodeScore, nil
			}

			if !isMinAvgDistance {
				allContainersMinAvgDistance = false
			}

			if numaNodes.Count() > maxNUMANodesCount {
				maxNUMANodesCount = numaNodes.Count()
			}

			// subtract the resources requested by the container from the given NUMA.
			// this is necessary, so we won't allocate the same resources for the upcoming containers
			subtractFromNUMAs(container.Resources.Requests, nodes, numaNodes.GetBits()...)
		}
	}

	// every container was skipped, i.e. nothing NUMA-affine was requested
	if maxNUMANodesCount == 0 {
		return framework.MaxNodeScore, nil
	}

	return normalizeScore(maxNUMANodesCount, allContainersMinAvgDistance), nil
}
// leastNUMAPodScopeScore scores a node by treating the pod as a single unit: the pod's effective
// request must fit into one set of NUMA nodes.
//
// It returns framework.MaxNodeScore when the pod requests only non-NUMA resources,
// framework.MinNodeScore when no set of NUMA nodes can hold the request, and otherwise the
// normalized score for the number of NUMA nodes required. The returned status is always nil.
func leastNUMAPodScopeScore(lh logr.Logger, pod *v1.Pod, zones topologyv1alpha2.ZoneList) (int64, *framework.Status) {
	numaList := createNUMANodeList(lh, zones)
	podQOS := v1qos.GetPodQOS(pod)
	podRequests := util.GetPodEffectiveRequest(pod)

	// nothing NUMA-affine requested: the node is as good as it gets
	if onlyNonNUMAResources(numaList, podRequests) {
		return framework.MaxNodeScore, nil
	}

	required, optimalDistance := numaNodesRequired(lh, podQOS, numaList, podRequests)
	if required != nil {
		return normalizeScore(required.Count(), optimalDistance), nil
	}

	// the pod does not fit; the resource filter plugin is expected to run before scoring,
	// so reaching this point is unexpected and therefore logged
	lh.Info("cannot calculate how many NUMA nodes are required")
	return framework.MinNodeScore, nil
}
// normalizeScore converts the number of NUMA nodes required into a node score.
// Each NUMA node used costs one step, where a step is framework.MaxNodeScore divided by
// highestNUMAID. When isMinAvgDistance is true, half a step is added back so that nodes
// offering the optimal inter-NUMA distance are preferred.
func normalizeScore(numaNodesCount int, isMinAvgDistance bool) int64 {
	step := framework.MaxNodeScore / highestNUMAID
	base := framework.MaxNodeScore - int64(numaNodesCount)*step
	if !isMinAvgDistance {
		return base
	}
	// optimal distance between NUMA domains: make this node slightly more favorable
	return base + step/2
}
// minAvgDistanceInCombinations returns the smallest average distance found among the given
// combinations of NUMA node indexes. The search starts from maxDistanceValue, which is also
// the result when numaNodesCombination is empty or no combination averages below it.
func minAvgDistanceInCombinations(lh logr.Logger, numaNodes NUMANodeList, numaNodesCombination [][]int) float32 {
	lowest := float32(maxDistanceValue)
	for i := range numaNodesCombination {
		if candidate := nodesAvgDistance(lh, numaNodes, numaNodesCombination[i]...); candidate < lowest {
			lowest = candidate
		}
	}
	return lowest
}
// nodesAvgDistance returns the average of the costs between every ordered pair of the given
// NUMA node indexes, including each node paired with itself. A cost missing from a node's
// Costs is logged and counted as maxDistanceValue. With no indexes given the result is
// maxDistanceValue.
func nodesAvgDistance(lh logr.Logger, numaNodes NUMANodeList, nodes ...int) float32 {
	count := len(nodes)
	if count == 0 {
		return maxDistanceValue
	}

	total := 0
	for _, from := range nodes {
		costs := numaNodes[from].Costs
		for _, to := range nodes {
			distance, found := costs[numaNodes[to].NUMAID]
			if !found {
				// Costs could not be read, fall back to maxDistanceValue
				lh.Info("cannot retrieve Costs information", "nodeID", numaNodes[from].NUMAID)
				distance = maxDistanceValue
			}
			total += distance
		}
	}

	// count*count pairs were summed above
	return float32(total) / float32(count*count)
}
// combineResources returns the sum of the resources of the NUMA nodes selected by combination,
// where combination holds indexes into numaNodes. The returned list is a new map; the
// quantities stored in numaNodes are not modified.
func combineResources(numaNodes NUMANodeList, combination []int) v1.ResourceList {
	resources := v1.ResourceList{}

	for _, nodeIndex := range combination {
		for resource, quantity := range numaNodes[nodeIndex].Resources {
			if value, ok := resources[resource]; ok {
				value.Add(quantity)
				resources[resource] = value
				continue
			}
			// Store a deep copy: a plain struct copy of a Quantity shares its internal decimal
			// pointer (when one is set), so a later Add on the accumulated value would silently
			// change the quantity still held by the NUMA node.
			resources[resource] = quantity.DeepCopy()
		}
	}

	return resources
}
// numaNodesRequired looks for the smallest group of NUMA nodes that can hold resources.
// Group sizes are tried in increasing order, from a single NUMA node up to all of them, and
// the first size for which a suitable combination exists wins.
//
// The first result is a bitmask of the NUMA IDs in the chosen group, or nil when no group of
// any size fits. The second result reports whether the chosen group is optimal from the
// distance perspective.
func numaNodesRequired(lh logr.Logger, qos v1.PodQOSClass, numaNodes NUMANodeList, resources v1.ResourceList) (bitmask.BitMask, bool) {
	total := len(numaNodes)
	for size := 1; size <= total; size++ {
		candidates := combin.Combinations(total, size)
		chosen, optimal := findSuitableCombination(lh, qos, numaNodes, resources, candidates)
		if chosen == nil {
			// nothing of this size fits, try a larger group
			continue
		}

		// translate indexes into numaNodes to the NUMA IDs recorded in the mask
		mask := bitmask.NewEmptyBitMask()
		for _, idx := range chosen {
			mask.Add(numaNodes[idx].NUMAID)
		}
		return mask, optimal
	}

	return nil, false
}
// findSuitableCombination picks, out of numaNodesCombination, a combination of NUMA node
// indexes whose summed resources can satisfy resources; it returns nil when there is none.
// Among the combinations that fit, the one with the lowest average distance is returned.
// The second result is true only when the returned combination reaches the minimal average
// distance computed over all given combinations, whether or not they fit.
func findSuitableCombination(lh logr.Logger, qos v1.PodQOSClass, numaNodes NUMANodeList, resources v1.ResourceList, numaNodesCombination [][]int) ([]int, bool) {
	optimalDistance := minAvgDistanceInCombinations(lh, numaNodes, numaNodesCombination)

	var fallback []int
	// start one above maxDistanceValue so any fitting combination at or below it replaces the initial value
	fallbackDistance := float32(maxDistanceValue + 1)

	for _, candidate := range numaNodesCombination {
		if !isValidCombineResources(numaNodes, resources, candidate) {
			continue
		}
		if !checkResourcesFit(lh, qos, resources, combineResources(numaNodes, candidate)) {
			continue
		}

		candidateDistance := nodesAvgDistance(lh, numaNodes, candidate...)
		if candidateDistance == optimalDistance {
			// resources fit and the distance cannot get better: stop searching
			return candidate, true
		}
		// strict comparison keeps the earliest combination on ties; combinations are generated
		// from the lowest value, so there is no need to compare bitmask values
		if candidateDistance < fallbackDistance {
			fallback, fallbackDistance = candidate, candidateDistance
		}
	}

	return fallback, false
}
// checkResourcesFit reports whether every non-zero request in resources is satisfied by the
// matching quantity in combinationResources, as judged by isResourceSetSuitable for the given
// QoS class. Zero-quantity requests are skipped (and logged at verbosity 4). A resource absent
// from combinationResources is checked against the zero quantity.
func checkResourcesFit(lh logr.Logger, qos v1.PodQOSClass, resources v1.ResourceList, combinationResources v1.ResourceList) bool {
	for name, requested := range resources {
		if requested.IsZero() {
			lh.V(4).Info("ignoring zero-qty resource request", "resource", name)
			continue
		}

		available := combinationResources[name]
		if isResourceSetSuitable(qos, name, requested, available) {
			continue
		}
		return false
	}

	return true
}
// isValidCombineResources reports whether every NUMA node selected by combination (indexes
// into numaNodes) lists every resource name present in resources. Only the presence of the
// resource name is checked, not its quantity.
func isValidCombineResources(numaNodes NUMANodeList, resources v1.ResourceList, combination []int) bool {
	for _, idx := range combination {
		for requested := range resources {
			_, present := numaNodes[idx].Resources[requested]
			if present {
				continue
			}
			// one node in the combination does not expose this resource at all
			return false
		}
	}

	return true
}