in Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/PredictorCorrector.java [275:423]
protected AnomalyDescriptor detect(AnomalyDescriptor result, IRCFComputeDescriptor lastAnomalyDescriptor,
RandomCutForest forest) {
double[] point = result.getRCFPoint();
if (point == null) {
return result;
}
double score = forest.getAnomalyScore(point);
result.setRCFScore(score);
result.setRCFPoint(point);
long internalTimeStamp = result.getInternalTimeStamp();
if (score == 0) {
return result;
}
int shingleSize = result.getShingleSize();
int baseDimensions = result.getDimension() / shingleSize;
int startPosition = (shingleSize - 1) * baseDimensions;
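// example: with shingleSize = 8 and a 2-dimensional input stream, the shingled
// point has dimension 16 and the latest input occupies positions 14 and 15,
// since startPosition = (8 - 1) * 2 = 14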
result.setThreshold(thresholder.threshold());
boolean previousIsPotentialAnomaly = thresholder.isInPotentialAnomaly();
/*
 * We first check whether the score is high enough to be considered a candidate
 * anomaly. If not, which should be the case for the vast majority of the data,
 * the computation ends here.
 */
if (thresholder.getAnomalyGrade(score, previousIsPotentialAnomaly) == 0) {
result.setAnomalyGrade(0);
result.setInHighScoreRegion(false);
thresholder.update(score, score, 0, false);
return result;
}
// the score is now high enough to be considered an anomaly
result.setInHighScoreRegion(true);
/*
 * We now check whether (1) we have another anomaly in the current shingle, (2)
 * we have predictions about what the values should have been, and (3) replacing
 * the offending values with those "should have been" values makes the anomaly
 * score of the new shingled point low enough to not be an anomaly. In that case
 * we can "explain" the high score by the past and do not need to vend an
 * anomaly, because the most recent points, on their own, would not produce an
 * anomalous shingle.
 *
 * However, the strategy can only be executed if there are (A) sufficiently many
 * observations and (B) enough data in each time point that the forecast is
 * reasonable. While forecasts can be corrected for very low shingle sizes and,
 * say, 1d input, the allure of RCF is in the multivariate case. Even for 1d, a
 * shingleSize of 4 or larger produces a reasonable forecast for the purposes
 * of anomaly detection.
 */
int gap = (int) (internalTimeStamp - lastAnomalyDescriptor.getInternalTimeStamp());
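// gap is the number of steps since the last recorded anomaly; a gap in
// [1, shingleSize] means that anomaly's shingle still overlaps the current
// one, so its expected values can be substituted in below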
// the forecast may not be reasonable with too little data
boolean reasonableForecast = result.isReasonableForecast();
if (reasonableForecast && lastAnomalyDescriptor.getRCFPoint() != null
&& lastAnomalyDescriptor.getExpectedRCFPoint() != null && gap > 0 && gap <= shingleSize) {
double[] correctedPoint = applyBasicCorrector(point, gap, shingleSize, baseDimensions,
lastAnomalyDescriptor);
double correctedScore = forest.getAnomalyScore(correctedPoint);
// we know we are looking at previous anomalies
if (thresholder.getAnomalyGrade(correctedScore, true) == 0) {
// fixing the past makes this anomaly go away; nothing to do but process the
// score. We do not change inHighScoreRegion, however, because the score was
// high regardless.
thresholder.update(score, correctedScore, 0, false);
result.setExpectedRCFPoint(correctedPoint);
result.setAnomalyGrade(0);
return result;
}
}
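/*
 * Illustration (hypothetical values): with baseDimensions = 1, shingleSize = 4
 * and gap = 2, the first two entries of the current shingle were already seen
 * as the last two entries of the shingle flagged previously; applyBasicCorrector
 * substitutes the corresponding entries of the last anomaly's expected point
 * into those overlapping positions before re-scoring.
 */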
/*
 * We now check the most egregious values seen at the current timestamp, as
 * determined by attribution. Those locations provide information about (a)
 * which attributes and (b) what the values should have been. However, such
 * imputation only makes sense when sufficient observations are available.
 */
DiVector attribution = forest.getAnomalyAttribution(point);
double[] newPoint = null;
double newScore = score;
DiVector newAttribution = null;
/*
 * We now find the time slice, relative to the current time, which is indicative
 * of the high score. relativeIndex = 0 is the current time. It is negative if
 * the most egregious attribution was due to past values in the shingle.
 */
int index = maxContribution(attribution, baseDimensions, -shingleSize) + 1;
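// maxContribution scans the shingle for the time slice with the largest
// attribution; adding 1 yields a relative index where 0 is the current time
// and -(shingleSize - 1) is the oldest slice in the shingle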
/*
 * If we are transitioning from the low score range to the high score range
 * (given by previousIsPotentialAnomaly), then we check whether the new part of
 * the input could have triggered an anomaly on its own. That decision is vended
 * by trigger(), which extrapolates a partial shingle.
 */
if (!previousIsPotentialAnomaly
&& trigger(attribution, gap, baseDimensions, null, false, lastAnomalyDescriptor)) {
result.setAnomalyGrade(thresholder.getAnomalyGrade(score, false));
result.setStartOfAnomaly(true);
thresholder.update(score, score, 0, true);
} else {
/*
 * we again check whether the new input produces an anomaly on its own
 */
if (reasonableForecast) {
newPoint = getExpectedPoint(attribution, startPosition, baseDimensions, point, forest);
if (newPoint != null) {
newAttribution = forest.getAnomalyAttribution(newPoint);
newScore = forest.getAnomalyScore(newPoint);
result.setExpectedRCFPoint(newPoint);
}
}
if (trigger(attribution, gap, baseDimensions, newAttribution, previousIsPotentialAnomaly,
lastAnomalyDescriptor) && score > newScore) {
result.setAnomalyGrade(thresholder.getAnomalyGrade(score, previousIsPotentialAnomaly));
index = 0; // current point
thresholder.update(score, newScore, 0, true);
} else {
// previousIsPotentialAnomaly is now true, but we are not calling this an
// anomaly either
thresholder.update(score, newScore, 0, true);
result.setAnomalyGrade(0);
return result;
}
}
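// reaching this point means the current input is vended as an anomaly; record
// the attribution and the time slice most responsible for the high score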
result.setAttribution(attribution);
result.setRelativeIndex(index);
if (reasonableForecast) {
// the anomaly occurred in the past and was detected late; reposition the
// computation (index 0 is the current time)
startPosition = shingleSize * baseDimensions + (result.getRelativeIndex() - 1) * baseDimensions;
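// sanity check: relativeIndex = 0 recovers the original startPosition, e.g.
// 8 * 2 + (0 - 1) * 2 = 14 in the example above, while relativeIndex = -2
// points two slices back, at 8 * 2 + (-3) * 2 = 10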
newPoint = getExpectedPoint(result.getAttribution(), startPosition, baseDimensions, point, forest);
}
result.setExpectedRCFPoint(newPoint);
return result;
}
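
This detect path is normally driven by the streaming wrapper in the same package
rather than called directly. A minimal sketch of that usage, assuming the
ThresholdedRandomCutForest builder API of this package (dimensions, shingle size
and input values below are illustrative only):

import com.amazon.randomcutforest.parkservices.AnomalyDescriptor;
import com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest;

public class DetectSketch {
    public static void main(String[] args) {
        int baseDimensions = 2; // a 2-dimensional input stream
        int shingleSize = 8;    // shingleSize >= 4 gives a reasonable forecast (see above)

        ThresholdedRandomCutForest forest = ThresholdedRandomCutForest.builder()
                .dimensions(baseDimensions * shingleSize)
                .shingleSize(shingleSize)
                .internalShinglingEnabled(true)
                .build();

        double[][] stream = { { 1.0, 2.0 }, { 1.1, 2.1 }, { 10.0, -5.0 } };
        long timestamp = 0;
        for (double[] point : stream) {
            // process() shingles the input, scores it, and applies the
            // predictor-corrector logic of detect() internally
            AnomalyDescriptor result = forest.process(point, timestamp++);
            if (result.getAnomalyGrade() > 0) {
                System.out.println("anomaly, grade " + result.getAnomalyGrade()
                        + ", score " + result.getRCFScore());
            }
        }
    }
}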