services/StorageCache/AmlFilesystems/alerts.yaml (407 lines of code) (raw):

# Alert definitions for Lustre OST/MDT capacity metrics (Log alerts over AzureMetrics) plus one
# Metric alert on Uptime.
#
# Every Log alert below follows the same query shape:
#   1. keep only the two relevant MetricName rows from AzureMetrics,
#   2. take maxif(Total, ...) of each metric,
#   3. compute a ratio of the two values,
#   4. keep the row only when the ratio crosses the `let threshold_*` value declared in the query,
#   5. project the ratio and both raw values (these are the columns listed under `dimensions`).
#
# NOTE(review): the queries filter on a 0-1 ratio and project no `AggregatedValue` column, while
# `metricMeasureColumn` is AggregatedValue and `threshold` is 85/15 - confirm how the consuming
# template uses these fields.
# NOTE(review): the Log alerts use windowSize PT1M with evaluationFrequency PT5M (window shorter
# than the evaluation interval), the reverse of the Uptime Metric alert - confirm this is intended.

# OSTFilesUsed / OSTFilesTotal > 0.85
- name: OST Files Used
  description: Log an alert if OSTFilesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 85
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: UsedRatio
        operator: Include
        values:
          - '*'
      - name: OSTFilesUsed
        operator: Include
        values:
          - '*'
      - name: OSTFilesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # Folded scalar: the lines below are joined with single spaces into one KQL string.
    query: >-
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "OSTFilesTotal" or MetricName == "OSTFilesUsed"
      | summarize
      OSTFilesTotal = maxif(Total, MetricName == "OSTFilesTotal"),
      OSTFilesUsed = maxif(Total, MetricName == "OSTFilesUsed")
      | extend UsedRatio = OSTFilesUsed / OSTFilesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, OSTFilesUsed, OSTFilesTotal
    autoMitigate: true
    autoResolve: true
    # Quoted so no parser can interpret the digits-and-colons value as a number.
    autoResolveTime: '0:10:00'
    enabled: true
  guid: 9d086772-1887-4893-8b9f-7e5169398bae
  references:

# OSTFilesFree / OSTFilesTotal < 0.15
- name: OST Files Free
  description: Log an alert if OSTFilesFree is below 15%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 15
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: FreeRatio
        operator: Include
        values:
          - '*'
      - name: OSTFilesFree
        operator: Include
        values:
          - '*'
      - name: OSTFilesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    query: >-
      let threshold_free = 0.15;
      AzureMetrics
      | where MetricName == "OSTFilesFree" or MetricName == "OSTFilesTotal"
      | summarize
      OSTFilesFree = maxif(Total, MetricName == "OSTFilesFree"),
      OSTFilesTotal = maxif(Total, MetricName == "OSTFilesTotal")
      | extend FreeRatio = OSTFilesFree / OSTFilesTotal
      | where FreeRatio < threshold_free
      | project FreeRatio, OSTFilesFree, OSTFilesTotal
    autoMitigate: true
    autoResolve: true
    autoResolveTime: '0:10:00'
    enabled: true
  guid: 8f231351-c123-4e4c-8631-9978e641a3ca
  references:

# OSTBytesAvailable / OSTBytesTotal < 0.15
- name: OST Bytes Available
  description: Log an alert if OSTBytesAvailable is below 15%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 15
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: AvailableRatio
        operator: Include
        values:
          - '*'
      - name: OSTBytesAvailable
        operator: Include
        values:
          - '*'
      - name: OSTBytesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # The let-bound name must match the name used in the `where` filter (threshold_available).
    query: >-
      let threshold_available = 0.15;
      AzureMetrics
      | where MetricName == "OSTBytesAvailable" or MetricName == "OSTBytesTotal"
      | summarize
      OSTBytesAvailable = maxif(Total, MetricName == "OSTBytesAvailable"),
      OSTBytesTotal = maxif(Total, MetricName == "OSTBytesTotal")
      | extend AvailableRatio = OSTBytesAvailable / OSTBytesTotal
      | where AvailableRatio < threshold_available
      | project AvailableRatio, OSTBytesAvailable, OSTBytesTotal
    autoMitigate: true
    autoResolve: true
    autoResolveTime: '0:10:00'
    enabled: true
  guid: 4eeca790-a804-4453-b339-73ea425610bc
  references:

# OSTBytesUsed / OSTBytesTotal > 0.85
- name: OST Bytes Used
  description: Log an alert if OSTBytesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 85
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: UsedRatio
        operator: Include
        values:
          - '*'
      - name: OSTBytesUsed
        operator: Include
        values:
          - '*'
      - name: OSTBytesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    query: >-
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "OSTBytesTotal" or MetricName == "OSTBytesUsed"
      | summarize
      OSTBytesTotal = maxif(Total, MetricName == "OSTBytesTotal"),
      OSTBytesUsed = maxif(Total, MetricName == "OSTBytesUsed")
      | extend UsedRatio = OSTBytesUsed / OSTBytesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, OSTBytesUsed, OSTBytesTotal
    autoMitigate: true
    autoResolve: true
    autoResolveTime: '0:10:00'
    enabled: true
  guid: 59298086-ec77-4f47-b2ef-b853b79e31cb
  references:

# MDTFilesFree / MDTFilesTotal < 0.15
- name: MDT Files Free
  description: Log an alert if MDTFilesFree is below 15%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 15
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: FreeRatio
        operator: Include
        values:
          - '*'
      - name: MDTFilesFree
        operator: Include
        values:
          - '*'
      - name: MDTFilesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # The let-bound name must match the name used in the `where` filter (threshold_free).
    query: >-
      let threshold_free = 0.15;
      AzureMetrics
      | where MetricName == "MDTFilesFree" or MetricName == "MDTFilesTotal"
      | summarize
      MDTFilesFree = maxif(Total, MetricName == "MDTFilesFree"),
      MDTFilesTotal = maxif(Total, MetricName == "MDTFilesTotal")
      | extend FreeRatio = MDTFilesFree / MDTFilesTotal
      | where FreeRatio < threshold_free
      | project FreeRatio, MDTFilesFree, MDTFilesTotal
    autoMitigate: true
    autoResolve: true
    autoResolveTime: '0:10:00'
    enabled: true
  guid: 2feba8fd-ff1e-4f48-bc01-6e2996edafa6
  references:

# MDTFilesUsed / MDTFilesTotal > 0.85
- name: MDT Files Used
  description: Log an alert if MDTFilesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 85
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: UsedRatio
        operator: Include
        values:
          - '*'
      - name: MDTFilesUsed
        operator: Include
        values:
          - '*'
      - name: MDTFilesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # UsedRatio is derived from the two summarized columns; it is the column the `where` filter
    # and `project` both reference.
    query: >-
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "MDTFilesTotal" or MetricName == "MDTFilesUsed"
      | summarize
      MDTFilesTotal = maxif(Total, MetricName == "MDTFilesTotal"),
      MDTFilesUsed = maxif(Total, MetricName == "MDTFilesUsed")
      | extend UsedRatio = MDTFilesUsed / MDTFilesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, MDTFilesUsed, MDTFilesTotal
    autoMitigate: true
    autoResolve: true
    autoResolveTime: '0:10:00'
    enabled: true
  guid: 48fc094d-8a00-4d3c-86d3-3230c7e5881a
  references:

# MDTBytesAvailable / MDTBytesTotal < 0.15
# NOTE(review): the name says "Files" but the description, dimensions and query all use the
# MDTBytesAvailable metric - presumably this should be "MDT Bytes Available"; left unchanged in
# case the name is used as an identifier downstream.
- name: MDT Files Available
  description: Log an alert if MDTBytesAvailable is below 15%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: LessThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 15
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: AvailableRatio
        operator: Include
        values:
          - '*'
      - name: MDTBytesAvailable
        operator: Include
        values:
          - '*'
      - name: MDTBytesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    # The let-bound name must match the name used in the `where` filter (threshold_available).
    query: >-
      let threshold_available = 0.15;
      AzureMetrics
      | where MetricName == "MDTBytesAvailable" or MetricName == "MDTBytesTotal"
      | summarize
      MDTBytesAvailable = maxif(Total, MetricName == "MDTBytesAvailable"),
      MDTBytesTotal = maxif(Total, MetricName == "MDTBytesTotal")
      | extend AvailableRatio = MDTBytesAvailable / MDTBytesTotal
      | where AvailableRatio < threshold_available
      | project AvailableRatio, MDTBytesAvailable, MDTBytesTotal
    autoMitigate: true
    autoResolve: true
    autoResolveTime: '0:10:00'
    enabled: true
  guid: ecec6f93-af7e-4071-b35d-cd70b3f16581
  references:

# MDTBytesUsed / MDTBytesTotal > 0.85
- name: MDT Bytes Used
  description: Log an alert if MDTBytesUsed is above 85%
  type: Log
  verified: true
  visible: true
  tags:
    - hpc
  properties:
    severity: 2
    operator: GreaterThan
    timeAggregation: Average
    windowSize: PT1M
    evaluationFrequency: PT5M
    threshold: 85
    metricMeasureColumn: AggregatedValue
    dimensions:
      - name: UsedRatio
        operator: Include
        values:
          - '*'
      - name: MDTBytesUsed
        operator: Include
        values:
          - '*'
      - name: MDTBytesTotal
        operator: Include
        values:
          - '*'
    failingPeriods:
      numberOfEvaluationPeriods: 1
      minFailingPeriodsToAlert: 1
    query: >-
      let threshold_used = 0.85;
      AzureMetrics
      | where MetricName == "MDTBytesTotal" or MetricName == "MDTBytesUsed"
      | summarize
      MDTBytesTotal = maxif(Total, MetricName == "MDTBytesTotal"),
      MDTBytesUsed = maxif(Total, MetricName == "MDTBytesUsed")
      | extend UsedRatio = MDTBytesUsed / MDTBytesTotal
      | where UsedRatio > threshold_used
      | project UsedRatio, MDTBytesUsed, MDTBytesTotal
    autoMitigate: true
    autoResolve: true
    autoResolveTime: '0:10:00'
    enabled: true
  guid: ebd68fdd-9672-43e8-b7d5-6e479210535d
  references:

# Metric alert: Uptime (Total over PT5M, evaluated every PT1M) LessThan 99.
# NOTE(review): the description talks about client I/O operations per second, which does not
# match the Uptime metric - looks copy-pasted; confirm the intended wording.
# NOTE(review): metricNamespace is Microsoft.StorageCache/caches and the reference links to HPC
# Cache docs - confirm this alert belongs with the OST/MDT alerts above.
- name: Uptime
  description: Total number of client input/output operations per second
  type: Metric
  verified: false
  visible: true
  tags:
    - hpc
  properties:
    metricName: Uptime
    metricNamespace: Microsoft.StorageCache/caches
    severity: 1
    windowSize: PT5M
    evaluationFrequency: PT1M
    timeAggregation: Total
    operator: LessThan
    criterionType: StaticThresholdCriterion
    threshold: 99
    autoMitigate: false
    enabled: true
  references:
    - name: Monitor HPC Cache with metrics and alerts
      url: https://learn.microsoft.com/en-us/azure/hpc-cache/metrics#metrics-page
  guid: 7f951991-c6ce-4c72-9f55-7eade2c4f57c