in pkg/scheduler/health_checker.go [235:315]
// checkSchedulingContext walks every partition returned by
// schedulerContext.GetPartitionMapClone() and returns one dao.HealthCheckInfo
// entry per check, in this order:
//  1. no partition has a negative allocated or total resource
//  2. no node has a negative allocated, available, capacity or occupied resource
//  3. the total partition resource matches the sum of the node capacities,
//     as judged by resources.EqualsOrEmpty
//  4. each node's capacity equals allocated + occupied + available resource
//  5. reservation/node ratio per partition (informational, always reported as succeeded)
//  6. no allocations returned by checkNodeAllocations for any node
//  7. no allocations returned by checkAppAllocations for any application
//
// Each partition or node is listed at most once per failing check, even when
// several of its resources are negative.
func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckInfo {
	// check for negative resources
	var partitionsWithNegResources []string
	var nodesWithNegResources []string
	// total partition resource = sum of node resources
	var totalResourceMismatch []string
	// node total resource = allocated resource + occupied resource + available resource
	var nodeTotalMismatch []string
	// check reservation/node ratio
	var partitionReservationRatio []float32
	// check for orphan allocations
	orphanAllocationsOnNode := make([]*objects.Allocation, 0)
	orphanAllocationsOnApp := make([]*objects.Allocation, 0)

	for _, part := range schedulerContext.GetPartitionMapClone() {
		// a single condition so the partition name is reported only once
		if part.GetAllocatedResource().HasNegativeValue() ||
			part.GetTotalPartitionResource().HasNegativeValue() {
			partitionsWithNegResources = append(partitionsWithNegResources, part.Name)
		}

		sumNodeResources := resources.NewResource()
		sumReservation := 0
		for _, node := range part.GetNodes() {
			// read each value once so that all checks for this node work on the same values
			capacity := node.GetCapacity()
			allocated := node.GetAllocatedResource()
			occupied := node.GetOccupiedResource()
			available := node.GetAvailableResource()

			sumNodeResources.AddTo(capacity)
			sumReservation += len(node.GetReservationKeys())

			// resources.Add returns a new resource, AddTo only modifies that local result
			calculatedTotalNodeRes := resources.Add(allocated, occupied)
			calculatedTotalNodeRes.AddTo(available)
			if !resources.Equals(capacity, calculatedTotalNodeRes) {
				nodeTotalMismatch = append(nodeTotalMismatch, node.NodeID)
			}

			// a single condition so the node ID is reported only once
			if allocated.HasNegativeValue() || available.HasNegativeValue() ||
				capacity.HasNegativeValue() || occupied.HasNegativeValue() {
				nodesWithNegResources = append(nodesWithNegResources, node.NodeID)
			}

			orphanAllocationsOnNode = append(orphanAllocationsOnNode, checkNodeAllocations(node, part)...)
		}

		// check if there are allocations assigned to an app but there are missing from the nodes
		for _, app := range part.GetApplications() {
			orphanAllocationsOnApp = append(orphanAllocationsOnApp, checkAppAllocations(app, part.nodes)...)
		}

		// guard against a zero node count: dividing by it would report a
		// non-finite ratio (NaN or Inf) for the partition
		ratio := float32(0)
		if nodeCount := part.GetTotalNodeCount(); nodeCount > 0 {
			ratio = float32(sumReservation) / float32(nodeCount)
		}
		partitionReservationRatio = append(partitionReservationRatio, ratio)

		if !resources.EqualsOrEmpty(sumNodeResources, part.GetTotalPartitionResource()) {
			totalResourceMismatch = append(totalResourceMismatch, part.Name)
		}
	}

	return []dao.HealthCheckInfo{
		CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
			"Check for negative resources in the partitions",
			fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources)),
		CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
			"Check for negative resources in the nodes",
			fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources)),
		CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
			"Check if total partition resource == sum of the node resources from the partition",
			fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch)),
		CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
			"Check if node total resource = allocated resource + occupied resource + available resource",
			fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch)),
		// mark it as succeeded for a while until we will know what is not considered a normal value anymore
		CreateCheckInfo(true, "Reservation check",
			"Check the reservation nr compared to the number of nodes",
			fmt.Sprintf("Reservation/node nr ratio: %f", partitionReservationRatio)),
		CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan allocation on node check",
			"Check if there are orphan allocations on the nodes",
			fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode)),
		CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
			"Check if there are orphan allocations on the applications",
			fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp)),
	}
}