constructor()

in lib/monitoring/ci-alarms.ts [17:75]


  /**
   * Defines the CI monitoring alarms and surfaces each of them on a dashboard.
   *
   * Creates an `AlarmDashboard` in the given stack, declares one alarm for the
   * external load balancer and four for the Jenkins main node, then adds one
   * `AlarmWidget` per alarm to the dashboard.
   *
   * @param stack - Stack that owns the dashboard and every alarm construct.
   * @param externalLoadBalancer - Supplies the target group whose unhealthy host count is watched.
   * @param mainNode - Supplies the EC2 instance metrics (process count, CPU time, memory used).
   */
  constructor(stack: Stack, externalLoadBalancer: JenkinsExternalLoadBalancer, mainNode: JenkinsMainNode) {
    const dashboard = new Dashboard(stack, 'AlarmDashboard');

    const alarms: Alarm[] = [
      // Missing data is treated as breaching: no health information is as bad as unhealthy hosts.
      new Alarm(stack, 'ExternalLoadBalancerUnhealthyHosts', {
        alarmDescription: 'If any hosts behind the load balancer are unhealthy',
        metric: externalLoadBalancer.targetGroup.metricUnhealthyHostCount(),
        evaluationPeriods: 3,
        threshold: 1,
        comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        treatMissingData: TreatMissingData.BREACHING,
      }),

      // Strictly greater than 1: exactly one Jenkins process is the healthy state.
      new Alarm(stack, 'MainNodeTooManyJenkinsProcessesFound', {
        alarmDescription: 'Only one jenkins process should run at any given time on the main node, there might be a cloudwatch configuration issue',
        metric: mainNode.ec2InstanceMetrics.foundJenkinsProcessCount.with({ statistic: 'max' }),
        evaluationPeriods: 3,
        threshold: 1,
        comparisonOperator: ComparisonOperator.GREATER_THAN_THRESHOLD,
        treatMissingData: TreatMissingData.IGNORE,
      }),

      new Alarm(stack, 'MainNodeHighCpuUtilization', {
        alarmDescription: 'The jenkins process is using much more CPU that expected, it should be investigated for a stuck process/job',
        metric: mainNode.ec2InstanceMetrics.cpuTime.with({ statistic: 'max' }),
        evaluationPeriods: 5,
        threshold: 50,
        comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        treatMissingData: TreatMissingData.IGNORE,
      }),

      new Alarm(stack, 'MainNodeHighMemoryUtilization', {
        alarmDescription: 'The jenkins process is using more memory than expected, it should be investigated for a large number of jobs or heavy weight jobs',
        metric: mainNode.ec2InstanceMetrics.memUsed.with({ statistic: 'max' }),
        evaluationPeriods: 5,
        threshold: 50,
        comparisonOperator: ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
        treatMissingData: TreatMissingData.IGNORE,
      }),

      // Heartbeat alarm: watches how many memory datapoints arrive rather than their value.
      new Alarm(stack, 'MainNodeCloudwatchEvents', {
        alarmDescription: `Cloudwatch events have stopped being received from the main node.
Use session manager to exam the host and the /opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log`,
        metric: mainNode.ec2InstanceMetrics.memUsed.with({ statistic: 'n' }),
        evaluationPeriods: 1,
        /**
         * Memory metrics are reported every second, i.e. 60 per minute.
         * With a 5 minute period and 1 evaluation period, 300 events are expected.
         * Allowing for 20% loss, 240 events is the minimum acceptable count.
         */
        threshold: 240,
        comparisonOperator: ComparisonOperator.LESS_THAN_OR_EQUAL_TO_THRESHOLD,
        // A fully silent agent produces no datapoints at all. Treating that as
        // breaching makes the alarm fire; with MISSING it would only move to
        // INSUFFICIENT_DATA and the outage this alarm exists for would go unnoticed.
        treatMissingData: TreatMissingData.BREACHING,
      }),
    ];

    // One widget per addWidgets call, matching the order the alarms were declared in.
    for (const alarm of alarms) {
      dashboard.addWidgets(new AlarmWidget({ alarm }));
    }
  }