internal ScaleRecommendation MakeScaleRecommendation()

in src/DurableTask.AzureStorage/Monitoring/DisconnectedPerformanceMonitor.cs [342:423]


        /// <summary>
        /// Computes a worker scale recommendation from recent work-item and control-queue
        /// latency histories. Branches are evaluated in priority order: bootstrap the first
        /// worker, wait for full sample windows, drain an idle hub, react to high latency,
        /// then trim excess workers relative to partition/queue activity.
        /// </summary>
        /// <param name="workerCount">Current number of workers.</param>
        /// <param name="partitionCount">Number of task hub partitions (control queues).</param>
        /// <param name="workItemQueueLatencyHistory">Latency samples for the work-item queue.</param>
        /// <param name="controlQueueLatencyHistory">Latency samples, one history per control queue.</param>
        /// <returns>A <see cref="ScaleRecommendation"/> with the action, keep-alive flag, and a human-readable reason.</returns>
        internal ScaleRecommendation MakeScaleRecommendation(
            int workerCount,
            int partitionCount,
            QueueMetricHistory workItemQueueLatencyHistory,
            List<QueueMetricHistory> controlQueueLatencyHistory)
        {
            // REVIEW: Is zero latency a reliable indicator of idle?
            bool hubIsIdle = IsIdle(workItemQueueLatencyHistory) && controlQueueLatencyHistory.TrueForAll(IsIdle);

            // Bootstrap: there is pending work but no workers at all.
            if (workerCount == 0 && !hubIsIdle)
            {
                return new ScaleRecommendation(ScaleAction.AddWorker, keepWorkersAlive: true, reason: "First worker");
            }

            // Defer any concrete recommendation until every history window has filled.
            bool samplesReady = workItemQueueLatencyHistory.IsFull && controlQueueLatencyHistory.TrueForAll(h => h.IsFull);
            if (!samplesReady)
            {
                return new ScaleRecommendation(ScaleAction.None, keepWorkersAlive: !hubIsIdle, reason: "Not enough samples");
            }

            if (hubIsIdle)
            {
                // Nothing is happening anywhere; drain workers one recommendation at a time.
                ScaleAction idleAction = workerCount > 0 ? ScaleAction.RemoveWorker : ScaleAction.None;
                return new ScaleRecommendation(
                    scaleAction: idleAction,
                    keepWorkersAlive: false,
                    reason: "Task hub is idle");
            }

            if (this.IsHighLatency(workItemQueueLatencyHistory))
            {
                // Work-item queue is backed up; add capacity.
                return new ScaleRecommendation(
                    ScaleAction.AddWorker,
                    keepWorkersAlive: true,
                    reason: $"Work-item queue latency: {workItemQueueLatencyHistory.Latest} > {this.highLatencyThreshold}");
            }

            if (workerCount > partitionCount && IsIdle(workItemQueueLatencyHistory))
            {
                // More workers than partitions and no work-item traffic: shed the surplus.
                return new ScaleRecommendation(
                    ScaleAction.RemoveWorker,
                    keepWorkersAlive: true,
                    reason: $"Work-items idle, #workers > partitions ({workerCount} > {partitionCount})");
            }

            // Control queues are partitioned; only scale-out if there are more partitions than workers.
            if (workerCount < controlQueueLatencyHistory.Count(this.IsHighLatency))
            {
                // Some control queues are busy, so scale out until workerCount == partitionCount.
                QueueMetricHistory busyQueue = controlQueueLatencyHistory.First(this.IsHighLatency);
                return new ScaleRecommendation(
                    ScaleAction.AddWorker,
                    keepWorkersAlive: true,
                    reason: $"High control queue latency: {busyQueue.Latest} > {this.highLatencyThreshold}");
            }

            int activeControlQueues = controlQueueLatencyHistory.Count(h => !IsIdle(h));
            if (workerCount > activeControlQueues && IsIdle(workItemQueueLatencyHistory))
            {
                // If the work item queues are idle, scale down to the number of non-idle control queues.
                return new ScaleRecommendation(
                    ScaleAction.RemoveWorker,
                    keepWorkersAlive: controlQueueLatencyHistory.Any(IsIdle),
                    reason: $"One or more control queues idle");
            }

            if (workerCount > 1)
            {
                // If all queues are operating efficiently, it can be hard to know if we need to reduce the worker count.
                // We want to avoid the case where a constant trickle of load after a big scale-out prevents scaling back in.
                // We also want to avoid scaling in unnecessarily when we've reached optimal scale-out. To balance these
                // goals, we check for low latencies and vote to scale down 10% of the time when we see this. The thought is
                // that it's a slow scale-in that will get automatically corrected once latencies start increasing again.
                // NOTE: the feature flag is checked first so Random state is only consumed when enabled.
                bool probeScaleDown = this.EnableRandomScaleDownOnLowLatency && Random.Next(10) == 0;
                if (probeScaleDown &&
                    controlQueueLatencyHistory.TrueForAll(IsLowLatency) &&
                    workItemQueueLatencyHistory.TrueForAll(sample => sample < LowLatencyThreshold))
                {
                    return new ScaleRecommendation(
                        ScaleAction.RemoveWorker,
                        keepWorkersAlive: true,
                        reason: $"All queues are not busy");
                }
            }

            // Load exists, but none of our scale filters were triggered, so we assume that the current worker
            // assignments are close to ideal for the current workload.
            return new ScaleRecommendation(ScaleAction.None, keepWorkersAlive: true, reason: $"Queue latencies are healthy");
        }