def _add_custom_health_metrics()

in cli/src/pcluster/templates/cw_dashboard_builder.py [0:0]


    def _add_custom_health_metrics(self):
        """Build the cluster health metric filters and add their graphs to the dashboard.

        The filters are grouped into health metrics, in this order:

        * "Instance Provisioning Errors": one filter per node launch failure type.
        * "Unhealthy Instance Errors": bootstrap timeouts, failing health checks, nodes without a
          backing instance and non-responding nodes, plus GPU health check failures when
          ``self.config.has_gpu_health_checks_enabled`` is truthy.
        * "Custom Action Errors": only when ``self.config.has_custom_actions_in_queue`` is truthy.
        * "Compute Fleet Idle Time": the longest idle time of dynamic nodes, annotated at
          ``scaledown_idletime * 60``.

        A title text widget is added first, then the graph widgets, then a text widget linking to the
        troubleshooting documentation.
        """
        event_count_field = "$.detail.count"

        def _slurm_event_pattern(event_type, failure_type=None):
            # All clauses are AND-ed; the failure-type clause is only emitted when one is given.
            clauses = [f"$.event-type = {event_type}"]
            if failure_type:
                clauses.append(f"$.detail.failure-type = {failure_type}")
            clauses.append('$.scheduler = "slurm"')
            return "{ " + " && ".join(clauses) + " }"

        def _count_filter(name, event_type, failure_type=None):
            # Filter whose metric value is taken from the count carried by the matching event.
            return _CustomMetricFilter(
                metric_name=name,
                filter_pattern=_slurm_event_pattern(event_type, failure_type),
                metric_value=event_count_field,
            )

        # (metric name, failure type) pairs; list order is the order the filters are created in.
        launch_failure_kinds = (
            ("IamPolicyErrors", "iam-policy-errors"),
            ("VcpuLimitErrors", "vcpu-limit-failures"),
            ("VolumeLimitErrors", "volume-limit-failures"),
            ("InsufficientCapacityErrors", "ice-failures"),
            ("OtherInstanceLaunchFailures", "other-failures"),
        )
        provisioning_filters = [
            _count_filter(name, "node-launch-failure-count", failure) for name, failure in launch_failure_kinds
        ]

        # This pattern matches either of two failure types, so it cannot be built by the helper above.
        bootstrap_timeout_pattern = (
            '{ $.event-type = "protected-mode-error-count" && '
            '($.detail.failure-type = "static-replacement-timeout-error" || '
            '$.detail.failure-type = "dynamic-resume-timeout-error" ) && '
            '$.scheduler = "slurm" }'
        )
        failing_health_check_event = "nodes-failing-health-check-count"
        unhealthy_instance_filters = [
            _CustomMetricFilter(
                metric_name="InstanceBootstrapTimeoutErrors",
                filter_pattern=bootstrap_timeout_pattern,
                metric_value=event_count_field,
            ),
            _count_filter("EC2HealthCheckErrors", failing_health_check_event, "ec2_health_check"),
            _count_filter(
                "ScheduledEventHealthCheckErrors", failing_health_check_event, "scheduled_event_health_check"
            ),
            _count_filter("NoCorrespondingInstanceErrors", "invalid-backing-instance-count"),
            _count_filter("SlurmNodeNotRespondingErrors", "node-not-responding-down-count"),
        ]

        if self.config.has_gpu_health_checks_enabled:
            # Each matching event counts as one failure (constant metric value).
            gpu_failure_pattern = (
                '{ $.event-type = "compute-node-health-check" && $.scheduler = "slurm" && '
                '$.detail.health-check-name = "Gpu" && $.detail.health-check-result != 0 }'
            )
            unhealthy_instance_filters.append(
                _CustomMetricFilter(
                    metric_name="GpuHealthCheckFailures",
                    filter_pattern=gpu_failure_pattern,
                    metric_value="1",
                )
            )

        health_metric_groups = [
            _HealthMetric(
                "Instance Provisioning Errors",
                provisioning_filters,
                left_y_axis=cloudwatch.YAxisProps(min=0.0),
            ),
            _HealthMetric(
                "Unhealthy Instance Errors",
                unhealthy_instance_filters,
                left_y_axis=cloudwatch.YAxisProps(min=0.0),
            ),
        ]

        if self.config.has_custom_actions_in_queue:
            # (metric name, custom action, stage) triples; each matching event counts as one error.
            custom_action_failures = (
                ("OnNodeStartDownloadErrors", "OnNodeStart", "downloading"),
                ("OnNodeStartRunErrors", "OnNodeStart", "executing"),
                ("OnNodeConfiguredDownloadErrors", "OnNodeConfigured", "downloading"),
                ("OnNodeConfiguredRunErrors", "OnNodeConfigured", "executing"),
            )
            custom_action_filters = [
                _CustomMetricFilter(
                    metric_name=name,
                    filter_pattern=(
                        '{ $.event-type = "custom-action-error" && $.scheduler = "slurm" && '
                        f'$.detail.action = "{action}" && $.detail.stage = "{stage}"}}'
                    ),
                    metric_value="1",
                )
                for name, action, stage in custom_action_failures
            ]
            health_metric_groups.append(
                _HealthMetric(
                    "Custom Action Errors",
                    custom_action_filters,
                    left_y_axis=cloudwatch.YAxisProps(min=0.0),
                )
            )

        # Both annotations sit at the same threshold: green shading below it, blue (labelled) above it.
        # NOTE(review): presumably scaledown_idletime is in minutes, hence the * 60 to match the
        # "Seconds" unit of the metric - confirm against the config schema.
        idle_threshold = self.config.scheduling.settings.scaledown_idletime * 60
        longest_idle_filter = _CustomMetricFilter(
            metric_name="MaxDynamicNodeIdleTime",
            filter_pattern=(
                '{ $.event-type = "compute-node-idle-time" && $.scheduler = "slurm" && '
                '$.detail.node-type = "dynamic"}'
            ),
            metric_value="$.detail.longest-idle-time",
            metric_statistic="max",
            metric_unit="Seconds",
        )
        idle_threshold_annotations = [
            cloudwatch.HorizontalAnnotation(
                value=idle_threshold,
                color=cloudwatch.Color.GREEN,
                fill=cloudwatch.Shading.BELOW,
                visible=True,
            ),
            cloudwatch.HorizontalAnnotation(
                value=idle_threshold,
                label="Idle Time Scaledown",
                color=cloudwatch.Color.BLUE,
                fill=cloudwatch.Shading.ABOVE,
                visible=True,
            ),
        ]
        health_metric_groups.append(
            _HealthMetric(
                "Compute Fleet Idle Time",
                [longest_idle_filter],
                left_y_axis=cloudwatch.YAxisProps(min=0.0),
                left_annotations=idle_threshold_annotations,
            )
        )

        troubleshooting_link = (
            "General [Troubleshooting Resources]"
            "(https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting.html)"
        )
        self._add_text_widget("# Cluster Health Metrics")
        self._add_health_metrics_graph_widgets(health_metric_groups)
        self._add_text_widget(troubleshooting_link)