in src/DurableTask.AzureStorage/Monitoring/DisconnectedPerformanceMonitor.cs [342:423]
/// <summary>
/// Evaluates queue latency history and produces a worker-count scale recommendation.
/// Checks are ordered by priority: bootstrap a first worker, defer until enough samples
/// exist, scale to zero when idle, scale out on high latency, and scale in conservatively
/// (including a randomized 1-in-10 scale-in when all queues report low latency).
/// </summary>
/// <param name="workerCount">The number of workers currently allocated.</param>
/// <param name="partitionCount">The number of control-queue partitions in the task hub.</param>
/// <param name="workItemQueueLatencyHistory">Recent latency samples for the work-item queue.</param>
/// <param name="controlQueueLatencyHistory">Recent latency samples, one history per control queue.</param>
/// <returns>A <see cref="ScaleRecommendation"/> describing the suggested action and rationale.</returns>
internal ScaleRecommendation MakeScaleRecommendation(
    int workerCount,
    int partitionCount,
    QueueMetricHistory workItemQueueLatencyHistory,
    List<QueueMetricHistory> controlQueueLatencyHistory)
{
    // REVIEW: Is zero latency a reliable indicator of idle?
    bool hubIsIdle =
        IsIdle(workItemQueueLatencyHistory) &&
        controlQueueLatencyHistory.TrueForAll(IsIdle);

    // Load exists but nothing is running it yet: provision the first worker immediately.
    if (workerCount == 0 && !hubIsIdle)
    {
        return new ScaleRecommendation(ScaleAction.AddWorker, keepWorkersAlive: true, reason: "First worker");
    }

    // Wait until we have enough samples before making specific recommendations.
    // Short-circuit order matters: check the work-item queue first, then each control queue.
    bool historiesAreFull =
        workItemQueueLatencyHistory.IsFull &&
        controlQueueLatencyHistory.TrueForAll(h => h.IsFull);
    if (!historiesAreFull)
    {
        return new ScaleRecommendation(ScaleAction.None, keepWorkersAlive: !hubIsIdle, reason: "Not enough samples");
    }

    // Everything is idle: drain down to zero workers (one removal per evaluation).
    if (hubIsIdle)
    {
        return new ScaleRecommendation(
            scaleAction: workerCount > 0 ? ScaleAction.RemoveWorker : ScaleAction.None,
            keepWorkersAlive: false,
            reason: "Task hub is idle");
    }

    // Work-item queue is not partitioned, so any worker can help drain it: scale out on high latency.
    if (this.IsHighLatency(workItemQueueLatencyHistory))
    {
        return new ScaleRecommendation(
            ScaleAction.AddWorker,
            keepWorkersAlive: true,
            reason: $"Work-item queue latency: {workItemQueueLatencyHistory.Latest} > {this.highLatencyThreshold}");
    }

    // More workers than partitions is only useful for work-item load; if work items are idle, trim.
    if (workerCount > partitionCount && IsIdle(workItemQueueLatencyHistory))
    {
        return new ScaleRecommendation(
            ScaleAction.RemoveWorker,
            keepWorkersAlive: true,
            reason: $"Work-items idle, #workers > partitions ({workerCount} > {partitionCount})");
    }

    // Control queues are partitioned; only scale-out if there are more partitions than workers.
    if (workerCount < controlQueueLatencyHistory.Count(this.IsHighLatency))
    {
        // Some control queues are busy, so scale out until workerCount == partitionCount.
        QueueMetricHistory busyQueue = controlQueueLatencyHistory.First(this.IsHighLatency);
        return new ScaleRecommendation(
            ScaleAction.AddWorker,
            keepWorkersAlive: true,
            reason: $"High control queue latency: {busyQueue.Latest} > {this.highLatencyThreshold}");
    }

    // If the work item queues are idle, scale down to the number of non-idle control queues.
    if (workerCount > controlQueueLatencyHistory.Count(h => !IsIdle(h)) && IsIdle(workItemQueueLatencyHistory))
    {
        return new ScaleRecommendation(
            ScaleAction.RemoveWorker,
            keepWorkersAlive: controlQueueLatencyHistory.Any(IsIdle),
            reason: $"One or more control queues idle");
    }

    if (workerCount > 1)
    {
        // If all queues are operating efficiently, it can be hard to know if we need to reduce the worker count.
        // We want to avoid the case where a constant trickle of load after a big scale-out prevents scaling back in.
        // We also want to avoid scaling in unnecessarily when we've reached optimal scale-out. To balance these
        // goals, we check for low latencies and vote to scale down 10% of the time when we see this. The thought is
        // that it's a slow scale-in that will get automatically corrected once latencies start increasing again.
        bool rollForScaleDown = this.EnableRandomScaleDownOnLowLatency && Random.Next(10) == 0;
        bool allQueuesLowLatency =
            rollForScaleDown &&
            controlQueueLatencyHistory.TrueForAll(IsLowLatency) &&
            workItemQueueLatencyHistory.TrueForAll(latency => latency < LowLatencyThreshold);
        if (allQueuesLowLatency)
        {
            return new ScaleRecommendation(
                ScaleAction.RemoveWorker,
                keepWorkersAlive: true,
                reason: $"All queues are not busy");
        }
    }

    // Load exists, but none of our scale filters were triggered, so we assume that the current worker
    // assignments are close to ideal for the current workload.
    return new ScaleRecommendation(ScaleAction.None, keepWorkersAlive: true, reason: $"Queue latencies are healthy");
}