services/StorageCache/AmlFilesystems/alerts.yaml (407 lines of code) (raw):
# Log alert on the OSTFilesUsed / OSTFilesTotal ratio read from AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
- name: OST Files Used
  description: Log an alert if OSTFilesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 85 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 85
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: UsedRatio
        operator: Include
        values: ['*']
      - name: OSTFilesUsed
        operator: Include
        values: ['*']
      - name: OSTFilesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide used by total, and keep
    # the row only when the ratio is greater than threshold_used (0.85).
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "OSTFilesTotal" or MetricName == "OSTFilesUsed"
      | summarize
          OSTFilesTotal = maxif(Total, MetricName == "OSTFilesTotal"),
          OSTFilesUsed = maxif(Total, MetricName == "OSTFilesUsed")
      | extend UsedRatio = OSTFilesUsed / OSTFilesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, OSTFilesUsed, OSTFilesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: 9d086772-1887-4893-8b9f-7e5169398bae
  # No references are listed for this alert; the bare key parses as null.
  references:
# Log alert on the OSTFilesFree / OSTFilesTotal ratio read from AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
- name: OST Files Free
  description: Log an alert if OSTFilesFree is below 15%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 15 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 15
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: FreeRatio
        operator: Include
        values: ['*']
      - name: OSTFilesFree
        operator: Include
        values: ['*']
      - name: OSTFilesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide free by total, and keep
    # the row only when the ratio is less than threshold_free (0.15).
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_free = 0.15;
      AzureMetrics
      | where MetricName == "OSTFilesFree" or MetricName == "OSTFilesTotal"
      | summarize
          OSTFilesFree = maxif(Total, MetricName == "OSTFilesFree"),
          OSTFilesTotal = maxif(Total, MetricName == "OSTFilesTotal")
      | extend FreeRatio = OSTFilesFree / OSTFilesTotal
      | where FreeRatio < threshold_free
      | project FreeRatio, OSTFilesFree, OSTFilesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: 8f231351-c123-4e4c-8631-9978e641a3ca
  # No references are listed for this alert; the bare key parses as null.
  references:
# Log alert on the OSTBytesAvailable / OSTBytesTotal ratio read from
# AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
- name: OST Bytes Available
  description: Log an alert if OSTBytesAvailable is below 15%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 15 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 15
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: AvailableRatio
        operator: Include
        values: ['*']
      - name: OSTBytesAvailable
        operator: Include
        values: ['*']
      - name: OSTBytesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide available by total, and
    # keep the row only when the ratio is less than threshold_available (0.15).
    # The `let` name must match the name referenced in the `where` clause,
    # otherwise the query fails to resolve it.
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_available = 0.15;
      AzureMetrics
      | where MetricName == "OSTBytesAvailable" or MetricName == "OSTBytesTotal"
      | summarize
          OSTBytesAvailable = maxif(Total, MetricName == "OSTBytesAvailable"),
          OSTBytesTotal = maxif(Total, MetricName == "OSTBytesTotal")
      | extend AvailableRatio = OSTBytesAvailable / OSTBytesTotal
      | where AvailableRatio < threshold_available
      | project AvailableRatio, OSTBytesAvailable, OSTBytesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: 4eeca790-a804-4453-b339-73ea425610bc
  # No references are listed for this alert; the bare key parses as null.
  references:
# Log alert on the OSTBytesUsed / OSTBytesTotal ratio read from AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
- name: OST Bytes Used
  # The description names the metric exactly as the query does (OSTBytesUsed).
  description: Log an alert if OSTBytesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 85 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 85
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: UsedRatio
        operator: Include
        values: ['*']
      - name: OSTBytesUsed
        operator: Include
        values: ['*']
      - name: OSTBytesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide used by total, and keep
    # the row only when the ratio is greater than threshold_used (0.85).
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "OSTBytesTotal" or MetricName == "OSTBytesUsed"
      | summarize
          OSTBytesTotal = maxif(Total, MetricName == "OSTBytesTotal"),
          OSTBytesUsed = maxif(Total, MetricName == "OSTBytesUsed")
      | extend UsedRatio = OSTBytesUsed / OSTBytesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, OSTBytesUsed, OSTBytesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: 59298086-ec77-4f47-b2ef-b853b79e31cb
  # No references are listed for this alert; the bare key parses as null.
  references:
# Log alert on the MDTFilesFree / MDTFilesTotal ratio read from AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
- name: MDT Files Free
  description: Log an alert if MDTFilesFree is below 15%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 15 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 15
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: FreeRatio
        operator: Include
        values: ['*']
      - name: MDTFilesFree
        operator: Include
        values: ['*']
      - name: MDTFilesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide free by total, and keep
    # the row only when the ratio is less than threshold_free (0.15).
    # The `let` name must match the name referenced in the `where` clause,
    # otherwise the query fails to resolve it.
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_free = 0.15;
      AzureMetrics
      | where MetricName == "MDTFilesFree" or MetricName == "MDTFilesTotal"
      | summarize
          MDTFilesFree = maxif(Total, MetricName == "MDTFilesFree"),
          MDTFilesTotal = maxif(Total, MetricName == "MDTFilesTotal")
      | extend FreeRatio = MDTFilesFree / MDTFilesTotal
      | where FreeRatio < threshold_free
      | project FreeRatio, MDTFilesFree, MDTFilesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: 2feba8fd-ff1e-4f48-bc01-6e2996edafa6
  # No references are listed for this alert; the bare key parses as null.
  references:
# Log alert on the MDTFilesUsed / MDTFilesTotal ratio read from AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
- name: MDT Files Used
  description: Log an alert if MDTFilesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 85 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 85
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: UsedRatio
        operator: Include
        values: ['*']
      - name: MDTFilesUsed
        operator: Include
        values: ['*']
      - name: MDTFilesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide used by total, and keep
    # the row only when the ratio is greater than threshold_used (0.85).
    # UsedRatio is derived from the two summarized columns; the `where` and
    # `project` steps both depend on that column existing.
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "MDTFilesTotal" or MetricName == "MDTFilesUsed"
      | summarize
          MDTFilesTotal = maxif(Total, MetricName == "MDTFilesTotal"),
          MDTFilesUsed = maxif(Total, MetricName == "MDTFilesUsed")
      | extend UsedRatio = MDTFilesUsed / MDTFilesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, MDTFilesUsed, MDTFilesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: 48fc094d-8a00-4d3c-86d3-3230c7e5881a
  # No references are listed for this alert; the bare key parses as null.
  references:
# Log alert on the MDTBytesAvailable / MDTBytesTotal ratio read from
# AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
# NOTE(review): the name says "Files" but the description, dimensions and
# query all use the MDTBytes* metrics - consider renaming to
# "MDT Bytes Available" once it is confirmed nothing keys on the name.
- name: MDT Files Available
  description: Log an alert if MDTBytesAvailable is below 15%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 15 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 15
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: AvailableRatio
        operator: Include
        values: ['*']
      - name: MDTBytesAvailable
        operator: Include
        values: ['*']
      - name: MDTBytesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide available by total, and
    # keep the row only when the ratio is less than threshold_available (0.15).
    # The `let` name must match the name referenced in the `where` clause,
    # otherwise the query fails to resolve it.
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_available = 0.15;
      AzureMetrics
      | where MetricName == "MDTBytesAvailable" or MetricName == "MDTBytesTotal"
      | summarize
          MDTBytesAvailable = maxif(Total, MetricName == "MDTBytesAvailable"),
          MDTBytesTotal = maxif(Total, MetricName == "MDTBytesTotal")
      | extend AvailableRatio = MDTBytesAvailable / MDTBytesTotal
      | where AvailableRatio < threshold_available
      | project AvailableRatio, MDTBytesAvailable, MDTBytesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: ecec6f93-af7e-4071-b35d-cd70b3f16581
  # No references are listed for this alert; the bare key parses as null.
  references:
# Log alert on the MDTBytesUsed / MDTBytesTotal ratio read from AzureMetrics.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
- name: MDT Bytes Used
  description: Log an alert if MDTBytesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags: [hpc]
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    # NOTE(review): threshold is 85 and metricMeasureColumn is AggregatedValue,
    # while the query filters on a 0-1 ratio and projects no AggregatedValue
    # column - confirm how the consumer combines these.
    threshold: 85
    metricMeasureColumn: AggregatedValue
    # One dimension per column projected by the query; '*' includes all values.
    dimensions:
      - name: UsedRatio
        operator: Include
        values: ['*']
      - name: MDTBytesUsed
        operator: Include
        values: ['*']
      - name: MDTBytesTotal
        operator: Include
        values: ['*']
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # KQL: take the max Total of each metric, divide used by total, and keep
    # the row only when the ratio is greater than threshold_used (0.85).
    # The single-quoted scalar folds its line breaks into spaces.
    query: '
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "MDTBytesTotal" or MetricName == "MDTBytesUsed"
      | summarize
          MDTBytesTotal = maxif(Total, MetricName == "MDTBytesTotal"),
          MDTBytesUsed = maxif(Total, MetricName == "MDTBytesUsed")
      | extend UsedRatio = MDTBytesUsed / MDTBytesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, MDTBytesUsed, MDTBytesTotal
      '
    autoMitigate: true
    autoResolve: true
    autoResolveTime: 0:10:00
    enabled: true
  guid: ebd68fdd-9672-43e8-b7d5-6e479210535d
  # No references are listed for this alert; the bare key parses as null.
  references:
# Static-threshold metric alert on the Uptime metric.
# NOTE(review): the nesting below (which keys sit under `properties`) was
# reconstructed from un-indented text - confirm against the consuming schema.
# NOTE(review): the namespace is Microsoft.StorageCache/caches and the
# reference points at HPC Cache docs, while the file path is AmlFilesystems -
# confirm this entry targets the intended resource type.
- name: Uptime
  # Describes the rule as configured below (Total aggregation, LessThan 99,
  # 5-minute window) rather than an unrelated IOPS metric.
  description: Alert when the Uptime metric total over a 5-minute window is less than 99
  type: Metric
  verified: false
  visible: true
  tags: [hpc]
  properties:
    metricName: Uptime
    metricNamespace: Microsoft.StorageCache/caches
    severity: 1
    windowSize: PT5M
    evaluationFrequency: PT1M
    timeAggregation: Total
    operator: LessThan
    criterionType: StaticThresholdCriterion
    threshold: 99
    autoMitigate: false
    enabled: true
  references:
    - name: Monitor HPC Cache with metrics and alerts
      url: https://learn.microsoft.com/en-us/azure/hpc-cache/metrics#metrics-page
  guid: 7f951991-c6ce-4c72-9f55-7eade2c4f57c