protected AnomalyDescriptor detect()

in Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/PredictorCorrector.java [275:423]


    /**
     * Core anomaly detection and correction step: scores the current shingled
     * point, consults the thresholder, and attempts to "explain away" a high
     * score that is an after-effect of a recently flagged anomaly before
     * confirming a new one.
     *
     * Exit paths (each updates the thresholder exactly once before returning):
     * 1) score is zero or below grade threshold  -> grade 0, not an anomaly;
     * 2) substituting the expected values from the last anomaly removes the
     *    high score -> grade 0, the score is attributed to the past;
     * 3) otherwise the anomaly is confirmed, and attribution, relative index,
     *    and expected (corrected) point are filled in on the result.
     *
     * @param result                descriptor for the current point; must carry the
     *                              shingled RCF point, shingle size and dimension.
     *                              Mutated in place and also returned.
     * @param lastAnomalyDescriptor state of the most recently seen anomaly, used to
     *                              correct the current shingle for lingering effects
     * @param forest                the RCF model used for scoring/attribution/imputation
     * @return the same {@code result} instance, populated with score, grade,
     *         threshold, attribution and expected point as applicable
     */
    protected AnomalyDescriptor detect(AnomalyDescriptor result, IRCFComputeDescriptor lastAnomalyDescriptor,
            RandomCutForest forest) {
        double[] point = result.getRCFPoint();
        if (point == null) {
            // nothing to score; descriptor returned untouched
            return result;
        }
        double score = forest.getAnomalyScore(point);
        result.setRCFScore(score);
        result.setRCFPoint(point);
        long internalTimeStamp = result.getInternalTimeStamp();

        if (score == 0) {
            // a zero score carries no signal (e.g. forest not ready); skip thresholding
            return result;
        }

        // the shingle is a flat array of shingleSize blocks of baseDimensions each;
        // startPosition indexes the most recent (current-time) block
        int shingleSize = result.getShingleSize();
        int baseDimensions = result.getDimension() / shingleSize;
        int startPosition = (shingleSize - 1) * baseDimensions;

        result.setThreshold(thresholder.threshold());

        // capture the thresholder state BEFORE any update below mutates it;
        // several later decisions depend on this pre-update snapshot
        boolean previousIsPotentialAnomaly = thresholder.isInPotentialAnomaly();

        /*
         * We first check if the score is high enough to be considered as a candidate
         * anomaly. If not, which is hopefully 99% of the data, the computation is short
         */
        if (thresholder.getAnomalyGrade(score, previousIsPotentialAnomaly) == 0) {
            result.setAnomalyGrade(0);
            // inHighScoreRegion = false;
            result.setInHighScoreRegion(false);
            thresholder.update(score, score, 0, false);
            return result;
        }

        // the score is now high enough to be considered an anomaly
        // inHighScoreRegion = true;
        result.setInHighScoreRegion(true);

        /*
         * We now check if (1) we have another anomaly in the current shingle (2) have
         * predictions about what the values should have been and (3) replacing by those
         * "should have been" makes the anomaly score of the new shingled point low
         * enough to not be an anomaly. In this case we can "explain" the high score is
         * due to the past and do not need to vend anomaly -- because the most recent
         * point, on their own would not produce an anomalous shingle.
         * 
         * However, the strategy is only executable if there are (A) sufficiently many
         * observations and (B) enough data in each time point such that the forecast is
         * reasonable. While forecasts can be corrected for very low shingle sizes and
         * say 1d input, the allure of RCF is in the multivariate case. Even for 1d, a
         * shingleSize of 4 or larger would produce reasonable forecast for the purposes
         * of anomaly detection.
         */

        // number of time steps since the last recorded anomaly; the last anomaly can
        // only contaminate the current shingle when 0 < gap <= shingleSize
        int gap = (int) (internalTimeStamp - lastAnomalyDescriptor.getInternalTimeStamp());

        // the forecast may not be reasonable with less data
        boolean reasonableForecast = result.isReasonableForecast();

        if (reasonableForecast && lastAnomalyDescriptor.getRCFPoint() != null
                && lastAnomalyDescriptor.getExpectedRCFPoint() != null && gap > 0 && gap <= shingleSize) {
            // splice the last anomaly's expected values into the overlapping part of the
            // current shingle and re-score; a low corrected score means the high score is
            // a residue of the prior anomaly
            double[] correctedPoint = applyBasicCorrector(point, gap, shingleSize, baseDimensions,
                    lastAnomalyDescriptor);
            double correctedScore = forest.getAnomalyScore(correctedPoint);
            // we know we are looking previous anomalies
            if (thresholder.getAnomalyGrade(correctedScore, true) == 0) {
                // fixing the past makes this anomaly go away; nothing to do but process the
                // score
                // we will not change inHighScoreRegion however, because the score has been
                // larger
                thresholder.update(score, correctedScore, 0, false);
                result.setExpectedRCFPoint(correctedPoint);
                result.setAnomalyGrade(0);
                return result;
            }
        }

        /*
         * We now check the most egregious values seen in the current timestamp, as
         * determined by attribution. Those locations provide information about (a)
         * which attributes and (b) what the values should have been. However, those
         * calculations of imputation only make sense when sufficient observations are
         * available.
         */

        DiVector attribution = forest.getAnomalyAttribution(point);

        double[] newPoint = null;
        double newScore = score;
        DiVector newAttribution = null;

        /*
         * we now find the time slice, relative to the current time, which is indicative
         * of the high score. relativeIndex = 0 is current time. It is negative if the
         * most egregious attribution was due to the past values in the shingle
         */

        // NOTE(review): maxContribution appears to return an offset in
        // [-shingleSize, -1]; the +1 maps it to a relative index in
        // [-shingleSize + 1, 0] -- confirm against maxContribution's contract
        int index = maxContribution(attribution, baseDimensions, -shingleSize) + 1;

        /*
         * if we are transitioning from low score to high score range (given by
         * inAnomaly) then we check if the new part of the input could have triggered
         * anomaly on its own That decision is vended by trigger() which extrapolates a
         * partial shingle
         */

        if (!previousIsPotentialAnomaly
                && trigger(attribution, gap, baseDimensions, null, false, lastAnomalyDescriptor)) {
            // fresh anomaly: first high-score point after a low-score stretch
            result.setAnomalyGrade(thresholder.getAnomalyGrade(score, false));
            result.setStartOfAnomaly(true);
            thresholder.update(score, score, 0, true);
        } else {
            /*
             * we again check if the new input produces an anomaly/not on its own
             */
            if (reasonableForecast) {
                // impute what the current time slice "should have been" and re-score
                newPoint = getExpectedPoint(attribution, startPosition, baseDimensions, point, forest);
                if (newPoint != null) {
                    newAttribution = forest.getAnomalyAttribution(newPoint);
                    newScore = forest.getAnomalyScore(newPoint);
                    result.setExpectedRCFPoint(newPoint);
                }
            }

            // confirm only when the trigger fires AND the imputed point genuinely
            // scores lower than the observed one
            if (trigger(attribution, gap, baseDimensions, newAttribution, previousIsPotentialAnomaly,
                    lastAnomalyDescriptor) && score > newScore) {
                result.setAnomalyGrade(thresholder.getAnomalyGrade(score, previousIsPotentialAnomaly));
                index = 0; // current point
                thresholder.update(score, newScore, 0, true);
            } else {
                // previousIsPotentialAnomaly is true now, but not calling it anomaly either
                thresholder.update(score, newScore, 0, true);
                result.setAnomalyGrade(0);
                return result;
            }
        }

        // confirmed anomaly: record attribution and where in the shingle it occurred
        result.setAttribution(attribution);
        result.setRelativeIndex(index);
        if (reasonableForecast) {
            // anomaly in the past and detected late; repositioning the computation
            // index 0 is current time
            startPosition = shingleSize * baseDimensions + (result.getRelativeIndex() - 1) * baseDimensions;
            newPoint = getExpectedPoint(result.getAttribution(), startPosition, baseDimensions, point, forest);
        }
        result.setExpectedRCFPoint(newPoint);
        return result;
    }