services/MachineLearningServices/workspaces/alerts.yaml (106 lines of code) (raw):
# Metric alert on runs that failed in the workspace.
# Condition as written below: Total of "Failed Runs" GreaterThan 0.0,
# over a PT5M window evaluated every PT1M.
- name: Failed Runs
  description: >-
    Number of runs failed for this workspace. Count is updated when a run
    fails.
  type: Metric
  # This entry is flagged as not verified and is tagged auto-generated.
  verified: false
  visible: true
  tags:
    - auto-generated
    - agc-297
  properties:
    metricName: Failed Runs
    metricNamespace: Microsoft.MachineLearningServices/workspaces
    severity: 3
    windowSize: PT5M
    evaluationFrequency: PT1M
    timeAggregation: Total
    operator: GreaterThan
    criterionType: StaticThresholdCriterion
    threshold: 0.0
    enabled: true
  guid: 'c897902c-40a5-497b-a0ce-86c3eda7c61d'
# Metric alert on failed model deployments in the workspace.
# Condition as written below: Total of "Model Deploy Failed" GreaterThan 0.0,
# over a PT5M window evaluated every PT1M.
- name: Model Deploy Failed
  description: >-
    Number of model deployments that failed in this workspace.
  type: Metric
  verified: true
  visible: true
  tags:
    - manual
  properties:
    metricName: Model Deploy Failed
    metricNamespace: Microsoft.MachineLearningServices/workspaces
    severity: 3
    windowSize: PT5M
    evaluationFrequency: PT1M
    timeAggregation: Total
    operator: GreaterThan
    criterionType: StaticThresholdCriterion
    threshold: 0.0
    enabled: true
  references:
    - name: Monitor Azure Machine Learning
      # Quoted so the '#' fragment can never be read as a comment start.
      url: 'https://learn.microsoft.com/en-us/azure/machine-learning/monitor-azure-machine-learning?view=azureml-api-2#machine-learning-alert-rules'
  guid: '0337a76f-238e-4d4d-9cd1-48b205874dbb'
# Metric alert on quota utilization.
# Condition as written below: Average of "Quota Utilization Percentage"
# GreaterThan 90.0, over a PT5M window evaluated every PT1M.
# Unlike the count-based entries in this file, this one aggregates with
# Average and uses a non-zero threshold.
- name: Quota Utilization Percentage
  description: >-
    Percent of quota utilized.
  type: Metric
  verified: true
  visible: true
  tags:
    - manual
  properties:
    metricName: Quota Utilization Percentage
    metricNamespace: Microsoft.MachineLearningServices/workspaces
    severity: 3
    windowSize: PT5M
    evaluationFrequency: PT1M
    timeAggregation: Average
    operator: GreaterThan
    criterionType: StaticThresholdCriterion
    threshold: 90.0
    enabled: true
  references:
    - name: Monitor Azure Machine Learning
      # Quoted so the '#' fragment can never be read as a comment start.
      url: 'https://learn.microsoft.com/en-us/azure/machine-learning/monitor-azure-machine-learning?view=azureml-api-2#machine-learning-alert-rules'
  guid: 'be3f1bfc-c21a-4399-9b9f-a33ebdc470cb'
# Metric alert on unusable compute nodes.
# Condition as written below: Total of "Unusable Nodes" GreaterThan 0.0,
# over a PT5M window evaluated every PT1M.
- name: Unusable Nodes
  # Folded scalar: the lines below are joined with single spaces into one
  # string with no trailing newline.
  description: >-
    Number of unusable nodes. Unusable nodes are not functional due to some
    unresolvable issue. Azure will recycle these nodes.
  type: Metric
  verified: true
  visible: true
  tags:
    - manual
  properties:
    metricName: Unusable Nodes
    metricNamespace: Microsoft.MachineLearningServices/workspaces
    severity: 3
    windowSize: PT5M
    evaluationFrequency: PT1M
    timeAggregation: Total
    operator: GreaterThan
    criterionType: StaticThresholdCriterion
    threshold: 0.0
    enabled: true
  references:
    - name: Monitor Azure Machine Learning
      # Quoted so the '#' fragment can never be read as a comment start.
      url: 'https://learn.microsoft.com/en-us/azure/machine-learning/monitor-azure-machine-learning?view=azureml-api-2#machine-learning-alert-rules'
  guid: 'a171bc0c-676f-464b-a7b5-e50cd6c612a2'
# Metric alert on runs that stopped responding in the workspace.
# Condition as written below: Total of "Not Responding Runs" GreaterThan 0.0,
# over a PT5M window evaluated every PT1M.
- name: Not Responding Runs
  # Folded scalar: the lines below are joined with single spaces into one
  # string with no trailing newline.
  description: >-
    Number of runs not responding for this workspace. Count is updated when a
    run enters Not Responding state.
  type: Metric
  verified: true
  visible: true
  tags:
    - manual
  properties:
    metricName: Not Responding Runs
    metricNamespace: Microsoft.MachineLearningServices/workspaces
    severity: 3
    windowSize: PT5M
    evaluationFrequency: PT1M
    timeAggregation: Total
    operator: GreaterThan
    criterionType: StaticThresholdCriterion
    threshold: 0.0
    enabled: true
  guid: 'd3b80f22-9f1d-3038-86ff-c4dd5ba02d7a'