in cli/src/pcluster/templates/cw_dashboard_builder.py [0:0]
def _add_custom_health_metrics(self):
"""Create custom health metric filters and outputs to cloudwatch graph."""
def _generate_metric_filter_pattern(event_type, failure_type=None):
if failure_type:
return (
f"{{ $.event-type = {event_type} && $.detail.failure-type = {failure_type} && "
'$.scheduler = "slurm" }'
)
else:
return f'{{ $.event-type = "{event_type}" && $.scheduler = "slurm" }}'
metric_value = "$.detail.count"
launch_failure_event_type = "node-launch-failure-count"
jobs_not_starting_errors = [
_CustomMetricFilter(
metric_name="IamPolicyErrors",
filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "iam-policy-errors"),
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="VcpuLimitErrors",
filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "vcpu-limit-failures"),
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="VolumeLimitErrors",
filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "volume-limit-failures"),
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="InsufficientCapacityErrors",
filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "ice-failures"),
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="OtherInstanceLaunchFailures",
filter_pattern=_generate_metric_filter_pattern(launch_failure_event_type, "other-failures"),
metric_value=metric_value,
),
]
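# Compute nodes that became unhealthy after launch: bootstrap timeouts, failed EC2 or scheduled-event
# health checks, nodes without a backing EC2 instance, and Slurm nodes marked down as not responding.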
compute_node_events = [
_CustomMetricFilter(
metric_name="InstanceBootstrapTimeoutErrors",
filter_pattern='{ $.event-type = "protected-mode-error-count" && '
'($.detail.failure-type = "static-replacement-timeout-error" || '
'$.detail.failure-type = "dynamic-resume-timeout-error" ) && '
'$.scheduler = "slurm" }',
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="EC2HealthCheckErrors",
filter_pattern=_generate_metric_filter_pattern("nodes-failing-health-check-count", "ec2_health_check"),
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="ScheduledEventHealthCheckErrors",
filter_pattern=_generate_metric_filter_pattern(
"nodes-failing-health-check-count", "scheduled_event_health_check"
),
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="NoCorrespondingInstanceErrors",
filter_pattern=_generate_metric_filter_pattern("invalid-backing-instance-count"),
metric_value=metric_value,
),
_CustomMetricFilter(
metric_name="SlurmNodeNotRespondingErrors",
filter_pattern=_generate_metric_filter_pattern("node-not-responding-down-count"),
metric_value=metric_value,
),
]
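# Track GPU health check failures only when GPU health checks are enabled for the cluster.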
if self.config.has_gpu_health_checks_enabled:
compute_node_events.append(
_CustomMetricFilter(
metric_name="GpuHealthCheckFailures",
filter_pattern='{ $.event-type = "compute-node-health-check" && $.scheduler = "slurm" && '
'$.detail.health-check-name = "Gpu" && $.detail.health-check-result != 0 }',
metric_value="1",
)
)
cluster_health_metrics = [
_HealthMetric(
"Instance Provisioning Errors",
jobs_not_starting_errors,
left_y_axis=cloudwatch.YAxisProps(min=0.0),
),
_HealthMetric(
"Unhealthy Instance Errors",
compute_node_events,
left_y_axis=cloudwatch.YAxisProps(min=0.0),
),
]
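# Custom action error filters apply only when a queue defines OnNodeStart/OnNodeConfigured scripts.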
if self.config.has_custom_actions_in_queue:
custom_action_errors = [
_CustomMetricFilter(
metric_name="OnNodeStartDownloadErrors",
filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && '
'$.detail.action = "OnNodeStart" && $.detail.stage = "downloading"}',
metric_value="1",
),
_CustomMetricFilter(
metric_name="OnNodeStartRunErrors",
filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && '
'$.detail.action = "OnNodeStart" && $.detail.stage = "executing"}',
metric_value="1",
),
_CustomMetricFilter(
metric_name="OnNodeConfiguredDownloadErrors",
filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && '
'$.detail.action = "OnNodeConfigured" && $.detail.stage = "downloading"}',
metric_value="1",
),
_CustomMetricFilter(
metric_name="OnNodeConfiguredRunErrors",
filter_pattern='{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && '
'$.detail.action = "OnNodeConfigured" && $.detail.stage = "executing"}',
metric_value="1",
),
]
cluster_health_metrics.append(
_HealthMetric(
"Custom Action Errors",
custom_action_errors,
left_y_axis=cloudwatch.YAxisProps(min=0.0),
)
)
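# Chart the longest dynamic node idle time; the annotations mark the ScaleDownIdleTime threshold, converted from minutes to seconds.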
cluster_health_metrics.append(
_HealthMetric(
"Compute Fleet Idle Time",
[
_CustomMetricFilter(
metric_name="MaxDynamicNodeIdleTime",
filter_pattern='{ $.event-type = "compute-node-idle-time" && $.scheduler = "slurm" && '
'$.detail.node-type = "dynamic"}',
metric_value="$.detail.longest-idle-time",
metric_statistic="max",
metric_unit="Seconds",
),
],
left_y_axis=cloudwatch.YAxisProps(min=0.0),
left_annotations=[
cloudwatch.HorizontalAnnotation(
value=self.config.scheduling.settings.scaledown_idletime * 60,
color=cloudwatch.Color.GREEN,
fill=cloudwatch.Shading.BELOW,
visible=True,
),
cloudwatch.HorizontalAnnotation(
value=self.config.scheduling.settings.scaledown_idletime * 60,
label="Idle Time Scaledown",
color=cloudwatch.Color.BLUE,
fill=cloudwatch.Shading.ABOVE,
visible=True,
),
],
)
)
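# Render the section: a title widget, one graph widget per metric group, and a link to the troubleshooting guide.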
self._add_text_widget("# Cluster Health Metrics")
self._add_health_metrics_graph_widgets(cluster_health_metrics)
self._add_text_widget(
"General [Troubleshooting Resources]"
"(https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting.html)"
)