app/com/gu/contentapi/sanity/support/TestFailureHandlingSupport.scala (114 lines of code) (raw):
package com.gu.contentapi.sanity.support
import com.gu.contentapi.sanity.Config
import org.scalatest.concurrent.{IntegrationPatience, ScalaFutures}
import org.scalatest._
import play.api.libs.json.{JsValue, Json}
import play.api.libs.ws.WSClient
import org.scalatest.matchers.should.Matchers
import java.time.{Duration,ZonedDateTime}
/**
* Contains all the test failure handling logic:
* - counting test failures
* - sending PagerDuty alerts
*/
trait TestFailureHandlingSupport extends TestSuite {

  /** The handler that is told about every failed test in the suite mixing in this trait. */
  def testFailureHandler: TestFailureHandler

  /**
   * Runs the test through the parent fixture and, if the outcome is `Failed`,
   * passes the test name, the failure's exception and the tags registered for
   * that test name (an empty set when none are registered) to `testFailureHandler`.
   *
   * The outcome produced by the parent fixture is always returned unchanged.
   */
  override protected def withFixture(test: NoArgTest): Outcome =
    super.withFixture(test) match {
      case failure @ Failed(cause) =>
        val testTags: Set[String] = tags.getOrElse(test.name, Set.empty)
        testFailureHandler.handleTestFailure(test.name, cause, testTags)
        failure
      case other =>
        // any non-failed outcome is passed through untouched
        other
    }
}
/**
 * Callback invoked when a test has failed.
 */
trait TestFailureHandler {

  /**
   * Reacts to a single failed test.
   *
   * @param testName  name of the test that failed
   * @param exception the throwable carried by the failed outcome
   * @param tags      tags attached to the failed test (may be empty)
   */
  // Explicit `: Unit` replaces the deprecated procedure syntax (no result type),
  // which Scala 2.13 warns about and Scala 3 rejects.
  def handleTestFailure(testName: String, exception: Throwable, tags: Set[String]): Unit
}
/**
 * Base class for failure handlers that report test failures to PagerDuty.
 *
 * Subclasses decide *when* to alert by implementing `handleTestFailure`; this class
 * provides the incident-key bookkeeping and the actual HTTP call.
 */
abstract class PagerDutyAlertingTestFailureHandler(override val wsClient: WSClient)
  extends TestFailureHandler with HttpRequestSupport with Matchers with ScalaFutures with IntegrationPatience with OptionValues {

  import PagerDutyAlertingTestFailureHandler._

  /** An incident key is re-used for this many minutes before a new one is generated. */
  private val IncidentKeyReuseMinutes = 30

  /** Maximum number of characters of the error message included in the alert description. */
  private val MaxDescriptionMessageLength = 250

  /**
   * Returns the incident key to attach to the next alert.
   *
   * The key is the string form of a timestamp held in the companion object. If a key
   * exists and is less than [[IncidentKeyReuseMinutes]] minutes old it is re-used;
   * otherwise (no key yet, or the key is too old) the current time becomes the new key
   * and is stored for later calls.
   */
  protected def latestIncidentKey(): String = {
    val now = ZonedDateTime.now()
    incidentKeyDateTime match {
      case Some(keyTimeStamp) if Duration.between(keyTimeStamp, now).toMinutes < IncidentKeyReuseMinutes =>
        // re-use key if it is less than 30 minutes since it was generated
        keyTimeStamp.toString
      case _ =>
        // generate a new key on first use and once the previous one has expired
        incidentKeyDateTime = Some(now)
        now.toString
    }
  }

  /**
   * Posts a "trigger" event for the failed test to the PagerDuty generic events endpoint
   * and checks that the response reports `status == "success"` and echoes the incident key.
   *
   * Reporting is best-effort: any `Exception` raised while building, sending or checking
   * the request is printed to stderr and not rethrown.
   *
   * @param testName    name of the failed test; used in the description and details
   * @param exception   the failure; its message (or `toString` when it has no message) is reported
   * @param tags        tags of the failed test; "LowPriorityTest" selects the low-priority
   *                    service key and "CODETest" adds the environment note to the description
   * @param incidentKey key sent as `incident_key` in the event payload
   */
  protected def sendPagerDutyAlert(testName: String, exception: Throwable, tags: Set[String], incidentKey: String): Unit = {
    try {
      println("Reporting")
      val isLowPriorityTest = tags.contains("LowPriorityTest")
      val isCODETest = tags.contains("CODETest")
      val serviceKey = if (isLowPriorityTest) Config.pagerDutyServiceKeyLowPriority else Config.pagerDutyServiceKey
      // leading space so the description reads "... failed on environment CODE, ..."
      val environmentInfo = if (isCODETest) " on environment CODE" else ""
      // getMessage may be null (e.g. exceptions created without a message); without this
      // fallback the null would raise an NPE below and the alert would never be sent
      val errorMessage = Option(exception.getMessage).getOrElse(exception.toString)
      // only mark the message as truncated when it actually was
      val shortMessage =
        if (errorMessage.length > MaxDescriptionMessageLength) errorMessage.take(MaxDescriptionMessageLength) + "..."
        else errorMessage
      val description = s"$testName failed$environmentInfo, the error reported was: $shortMessage"

      val data = Json.obj(
        "service_key" -> serviceKey,
        "event_type" -> "trigger",
        "description" -> description,
        "details" -> Json.arr(
          Json.obj(
            "name" -> testName,
            "description" -> errorMessage
          )
        ),
        "client" -> "Content API Sanity Tests",
        "client_url" -> "https://github.com/guardian/content-api-sanity-tests",
        "incident_key" -> incidentKey
      )

      val httpRequest = request("https://events.pagerduty.com/generic/2010-04-15/create_event.json").post(data)

      // wait for the response and verify PagerDuty accepted the event
      whenReady(httpRequest) { result =>
        val pagerDutyResponse: JsValue = Json.parse(result.body)
        val responseStatus = (pagerDutyResponse \ "status").asOpt[String]
        val responseIncidentKey = (pagerDutyResponse \ "incident_key").asOpt[String]
        responseStatus.value should be("success")
        responseIncidentKey.value should be(incidentKey)
      }
    } catch {
      case e: Exception => Console.err.println(Console.RED + "PagerDuty reporting failed with exception: " + e.getMessage + Console.RESET)
    }
  }
}
/**
 * Shared PagerDuty incident-key state.
 *
 * The timestamp lives in this singleton rather than in a handler instance, so every
 * handler instance reads and updates the same value.
 */
object PagerDutyAlertingTestFailureHandler {
  // Timestamp the current incident key was derived from; empty until a key is first generated.
  private var incidentKeyDateTime = Option.empty[ZonedDateTime]
}
/**
 * Handler that raises a PagerDuty alert only once several test failures have occurred
 * close together in time.
 *
 * The failure count and the time of the last counted failure are kept in fields of this
 * instance, so failures only accumulate across suites and scheduler runs when the same
 * instance is reused.
 */
class FrequentScheduledTestFailureHandler(wsClient: WSClient) extends PagerDutyAlertingTestFailureHandler(wsClient) {

  /** Maximum gap, in minutes, between failures for them to count towards the same streak. */
  private val RecentIncidentWindowMinutes = 10

  /** Number of failures in a streak that triggers a PagerDuty alert. */
  private val PagerDutyThreshold = 3

  // Time of the most recently counted failure; None until the first failure is seen.
  private var lastIncidentDateTime: Option[ZonedDateTime] = None

  // Failures counted in the current streak.
  private var incidentCount = 0

  /**
   * Logs the failure and updates the streak.
   *
   * A failure extends the current streak when it is the first one ever seen or when it
   * arrives less than [[RecentIncidentWindowMinutes]] minutes after the previous one; when
   * the streak reaches [[PagerDutyThreshold]] an alert is sent and the count goes back to
   * zero. A failure arriving later than that starts a new streak with a count of one.
   */
  override def handleTestFailure(testName: String, exception: Throwable, tags: Set[String]): Unit = {
    Console.err.println(Console.RED + "Test failure: " + exception.getMessage + Console.RESET)

    // true when there is no previous failure, or the previous one is still inside the window
    val continuesStreak = lastIncidentDateTime.forall { previous =>
      Duration.between(previous, ZonedDateTime.now()).toMinutes < RecentIncidentWindowMinutes
    }

    if (!continuesStreak) {
      // previous failure is too old: this failure becomes the first of a new streak
      incidentCount = 0
      incrementIncidentCount()
    } else {
      incrementIncidentCount()
      if (incidentCount == PagerDutyThreshold) {
        // threshold reached: alert, then start counting from scratch
        sendPagerDutyAlert(testName, exception, tags, latestIncidentKey())
        incidentCount = 0
      }
    }
  }

  /** Adds one to the streak, records the current time as the last failure time and prints the count. */
  private def incrementIncidentCount(): Unit = {
    incidentCount += 1
    lastIncidentDateTime = Some(ZonedDateTime.now)
    println(s"Incident count: $incidentCount")
  }
}
/**
 * Handler that reports every single test failure to PagerDuty straight away,
 * without any counting or thresholds.
 */
class InfrequentScheduledTestsFailureHandler(wsClient: WSClient) extends PagerDutyAlertingTestFailureHandler(wsClient) {

  /** Logs the failure to stderr, then sends an alert using the current incident key. */
  override def handleTestFailure(testName: String, exception: Throwable, tags: Set[String]): Unit = {
    val failureLine = Console.RED + "Test failure: " + exception.getMessage + Console.RESET
    Console.err.println(failureLine)

    val incidentKey = latestIncidentKey()
    sendPagerDutyAlert(testName, exception, tags, incidentKey)
  }
}
/**
 * Handler that ignores test failures entirely: nothing is logged and nothing is sent.
 */
object DoNothingTestFailureHandler extends TestFailureHandler {
  override def handleTestFailure(testName: String, exception: Throwable, tags: Set[String]): Unit = ()
}