bool CAnomalyScore::CNormalizer::normalize()

in lib/model/CAnomalyScore.cc [274:426]


bool CAnomalyScore::CNormalizer::normalize(const CMaximumScoreScope& scope,
                                           double& score) const {
    if (score == 0.0) {
        // Nothing to do.
        return true;
    }
    if (m_RawScoreQuantileSummary.n() == 0) {
        score = 0.0;
        return true;
    }

    LOG_TRACE(<< "Normalising " << score);

    double normalizedScores[]{m_MaximumNormalizedScore, m_MaximumNormalizedScore,
                              m_MaximumNormalizedScore, m_MaximumNormalizedScore};

    std::uint32_t discreteScore = this->discreteScore(score);
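    // discreteScore buckets the raw score onto the integer grid used by
    // the quantile summary; its resolution is presumably set by
    // DISCRETIZATION_FACTOR, which reappears below when converting
    // score differences to dB.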

    // Our normalized score is the minimum of a set of different
    // score ceilings. The idea is that we have a number of factors
    // which can reduce the score based on the other score values
    // observed so far and the absolute score for some of our
    // functions. We want the following properties:
    //   1) The noise-like scores, i.e. the smallest scores which
    //      occur relatively frequently, to have low normalized scores.
    //   2) Higher resolution of the highest scores we've seen.
    //   3) The highest raw score ever observed has the highest
    //      normalized score.
    //   4) We don't want to generate significant anomalies for
    //      moderately unusual events before we have enough history
    //      to assess their probability accurately.
    //
    // To ensure 1) we use a ceiling for the score that is a
    // multiple of a low(ish) percentile score. To ensure 2) we
    // use a non-linear (actually piecewise linear) mapping from
    // percentiles to scores, i.e. low percentiles map to a small
    // normalized score range and high percentiles map to a large
    // normalized score range. Finally, to ensure 3), we use a
    // ceiling which is a linear interpolation between a raw score
    // of zero and the maximum ever raw score for the partition.
    // To achieve 4), we cap the maximum score such that probabilities
    // near the cutoff don't generate large normalized scores.
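    // For example (illustrative numbers only): if the four ceilings come
    // out at 30, 55, 70 and 100 then the normalized score is
    // min(30, 55, 70, 100) = 30, i.e. the most restrictive factor wins.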

    // Compute the noise ceiling. Note that since the scores are
    // logarithms (base e) the difference corresponds to the scaled,
    // 10 / log(10), signal strength in dB. By default a signal of
    // 75dB corresponds to a normalized score of 75. Note that if
    // the noise percentile is zero then really it is unknown,
    // since scores are truncated to zero. In this case we don't
    // want this term to constrain the normalized score. However,
    // we also want this term to be smooth, i.e. the score should
    // be nearly continuous when 100 F(0) = pn. Here, F(.) denotes the
    // c.d.f. of the score and pn the noise percentile. We achieve
    // this by adding "max score" * max(2 * min(100 F(0) / pn, 1) - 1, 0)
    // to the score.
    std::uint32_t noiseScore;
    m_RawScoreQuantileSummary.quantile(m_NoisePercentile / 100.0, noiseScore);
    auto knotPoint = std::lower_bound(m_NormalizedScoreKnotPoints.begin(),
                                      m_NormalizedScoreKnotPoints.end(),
                                      TDoubleDoublePr(m_NoisePercentile, 0.0));
    double signalStrength =
        m_NoiseMultiplier * 10.0 / DISCRETIZATION_FACTOR *
        (static_cast<double>(discreteScore) - static_cast<double>(noiseScore));
    double l0;
    double u0;
    m_RawScoreQuantileSummary.cdf(0, 0.0, l0, u0);
    normalizedScores[0] =
        knotPoint->second * std::max(1.0 + signalStrength, 0.0) +
        m_MaximumNormalizedScore *
            std::max(2.0 * std::min(50.0 * (l0 + u0) / m_NoisePercentile, 1.0) - 1.0, 0.0);
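    // The first term scales the knot point score by the signal strength
    // in excess of the noise floor; the second adds back up to the full
    // normalized score as the estimated mass at zero, (l0 + u0) / 2,
    // approaches the noise percentile, so the ceiling relaxes smoothly
    // when the noise level is effectively unknown.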
    LOG_TRACE(<< "normalizedScores[0] = " << normalizedScores[0]
              << ", knotPoint = " << knotPoint->second << ", discreteScore = " << discreteScore
              << ", noiseScore = " << noiseScore << ", l(0) = " << l0
              << ", u(0) = " << u0 << ", signalStrength = " << signalStrength);

    // Compute the raw normalized score percentile compared to historic values.
    double lowerPercentile;
    double upperPercentile;
    this->quantile(score, 70.0 /*confidence interval*/, lowerPercentile, upperPercentile);
    lowerPercentile *= 100.0;
    upperPercentile *= 100.0;
    if (lowerPercentile > upperPercentile) {
        std::swap(lowerPercentile, upperPercentile);
    }
    lowerPercentile = maths::common::CTools::truncate(lowerPercentile, 0.0, 100.0);
    upperPercentile = maths::common::CTools::truncate(upperPercentile, 0.0, 100.0);
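    // lowerPercentile and upperPercentile bracket the raw score's
    // historic percentile with a 70% confidence interval; both ends are
    // mapped through the knot points below and the results averaged.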

    std::size_t lowerKnotPoint =
        std::max(std::lower_bound(m_NormalizedScoreKnotPoints.begin(),
                                  m_NormalizedScoreKnotPoints.end(), lowerPercentile,
                                  maths::common::COrderings::SFirstLess()) -
                     m_NormalizedScoreKnotPoints.begin(),
                 std::ptrdiff_t(1));
    std::size_t upperKnotPoint =
        std::max(std::lower_bound(m_NormalizedScoreKnotPoints.begin(),
                                  m_NormalizedScoreKnotPoints.end(), upperPercentile,
                                  maths::common::COrderings::SFirstLess()) -
                     m_NormalizedScoreKnotPoints.begin(),
                 std::ptrdiff_t(1));
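    // lower_bound finds the first knot point at or above each percentile
    // and the std::max(..., 1) guard ensures there is always a knot to
    // the left to interpolate from.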
    if (lowerKnotPoint < m_NormalizedScoreKnotPoints.size()) {
        const TDoubleDoublePr& left = m_NormalizedScoreKnotPoints[lowerKnotPoint - 1];
        const TDoubleDoublePr& right = m_NormalizedScoreKnotPoints[lowerKnotPoint];
        normalizedScores[1] = maths::common::CTools::linearlyInterpolate(
            left.first, right.first, left.second, right.second, lowerPercentile);
    } else {
        normalizedScores[1] = m_MaximumNormalizedScore;
    }
    if (upperKnotPoint < m_NormalizedScoreKnotPoints.size()) {
        const TDoubleDoublePr& left = m_NormalizedScoreKnotPoints[upperKnotPoint - 1];
        const TDoubleDoublePr& right = m_NormalizedScoreKnotPoints[upperKnotPoint];
        normalizedScores[1] =
            (normalizedScores[1] + maths::common::CTools::linearlyInterpolate(
                                       left.first, right.first, left.second,
                                       right.second, upperPercentile)) /
            2.0;
    } else {
        normalizedScores[1] = (normalizedScores[1] + m_MaximumNormalizedScore) / 2.0;
    }
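    // Averaging the interpolated scores at the two percentile estimates
    // damps the effect of quantile estimation error; past the last knot
    // the maximum normalized score stands in for the interpolant.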
    LOG_TRACE(<< "normalizedScores[1] = " << normalizedScores[1] << ", lowerPercentile = "
              << lowerPercentile << ", upperPercentile = " << upperPercentile);

    double maxScore{0.0};
    bool hasValidMaxScore = this->maxScore(scope, maxScore);
    LOG_TRACE(<< "hasValidMaxScore = " << hasValidMaxScore
              << ", scope = " << scope.print());
    if (hasValidMaxScore) {
        // Compute the maximum score ceiling
        double ratio = score / maxScore;
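        // The two linear pieces cross at ratio = 0.5: scores below half
        // the historic maximum are scaled steeply (slope 1.5) and those
        // above gently (slope 0.5), reaching the full normalized score
        // at ratio = 1.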
        normalizedScores[2] = m_MaximumNormalizedScore *
                              std::min({0.0 + 1.5 * ratio, 0.5 + 0.5 * ratio});
        LOG_TRACE(<< "normalizedScores[2] = " << normalizedScores[2] << ", score = " << score
                  << ", maxScore = " << maxScore << ", ratio = " << ratio);
    }

    // Logarithmically interpolate the maximum score between the
    // largest significant and small probability.
    static const double M =
        (probabilityToScore(maths::common::SMALL_PROBABILITY) -
         probabilityToScore(maths::common::LARGEST_SIGNIFICANT_PROBABILITY)) /
        (std::log(maths::common::SMALL_PROBABILITY) -
         std::log(maths::common::LARGEST_SIGNIFICANT_PROBABILITY));
    static const double C = std::log(maths::common::LARGEST_SIGNIFICANT_PROBABILITY);
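    // M is the gradient of the score as a function of log-probability
    // and C the log of the largest significant probability; the
    // 0.95/0.05 weighting leaves a floor of 5% of the maximum
    // normalized score at that probability.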
    normalizedScores[3] = m_MaximumNormalizedScore *
                          (0.95 * M * (std::log(scoreToProbability(score)) - C) + 0.05);
    LOG_TRACE(<< "normalizedScores[3] = " << normalizedScores[3] << ", score = " << score
              << ", probability = " << scoreToProbability(score));

    score = std::min(*std::min_element(std::begin(normalizedScores), std::end(normalizedScores)),
                     m_MaximumNormalizedScore);
    LOG_TRACE(<< "normalizedScore = " << score << ", scope = " << scope.print());

    return true;
}