bool CAnomalyScore::CNormalizer::updateQuantiles()

in lib/model/CAnomalyScore.cc [509:673]


bool CAnomalyScore::CNormalizer::updateQuantiles(const CMaximumScoreScope& scope, double score) {
    using TUInt32UInt64Pr = std::pair<std::uint32_t, std::uint64_t>;
    using TUInt32UInt64PrVec = std::vector<TUInt32UInt64Pr>;

    CMaxScore& maxScore = m_MaxScores[scope.key(m_IsForMembersOfPopulation, m_Dictionary)];
    double oldMaxScore{maxScore.score()};
    maxScore.add(score);
    bool bigChange{maxScore.score() > BIG_CHANGE_FACTOR * oldMaxScore};
    LOG_TRACE(<< "updateQuantiles: scope = " << scope.print()
              << ", oldMaxScore = " << oldMaxScore << ", score = " << score
              << ", newMaxScore = " << maxScore.score() << ", bigChange = " << bigChange);

    m_Sample = std::max(m_Sample, score);
    if (++m_CountSinceLastSample < m_CountPerSample) {
        return bigChange;
    }

    std::uint32_t discreteScore = this->discreteScore(m_Sample);
    LOG_TRACE(<< "score = " << m_Sample << ", discreteScore = " << discreteScore
              << ", maxScore = " << maxScore.score() << ", scope = " << scope.print());

    m_CountSinceLastSample = 0;
    m_Sample = 0.0;

    std::uint64_t n = m_RawScoreQuantileSummary.n();
    std::uint64_t k = m_RawScoreQuantileSummary.k();
    LOG_TRACE(<< "n = " << n << ", k = " << k);

    // We are about to compress the q-digest, at the moment it comprises
    // the unique values we have seen so far. So we extract the values
    // greater than the HIGH_PERCENTILE percentile at this point to
    // initialize the fine grain high quantile summary.
    if ((n + 1) == k) {
        LOG_TRACE(<< "Initializing H");

        TUInt32UInt64PrVec L;
        m_RawScoreQuantileSummary.summary(L);
        if (L.empty()) {
            LOG_ERROR(<< "High quantile summary is empty: "
                      << m_RawScoreQuantileSummary.print());
        } else {
            auto highPercentileCount = static_cast<std::uint64_t>(
                (HIGH_PERCENTILE / 100.0) * static_cast<double>(n) + 0.5);

            // Estimate the high percentile score and update the count.
            std::size_t i = 1;
            m_HighPercentileScore = L[0].first;
            m_HighPercentileCount = L[0].second;
            for (/**/; i < L.size(); ++i) {
                if (L[i].second > highPercentileCount) {
                    m_HighPercentileScore = L[i - 1].first;
                    m_HighPercentileCount = L[i - 1].second;
                    break;
                }
            }
            if (m_HighPercentileCount > n) {
                LOG_ERROR(<< "Invalid c(H) " << m_HighPercentileCount);
                LOG_ERROR(<< "target " << highPercentileCount);
                LOG_ERROR(<< "L " << L);
                m_HighPercentileCount = n;
            }
            LOG_TRACE(<< "s(H) = " << m_HighPercentileScore
                      << ", c(H) = " << m_HighPercentileCount << ", percentile = "
                      << 100.0 * static_cast<double>(m_HighPercentileCount) /
                             static_cast<double>(n)
                      << "%, desired c(H) = " << highPercentileCount);

            // Populate the high quantile summary.
            for (/**/; i < L.size(); ++i) {
                std::uint32_t x = L[i].first;
                std::uint64_t m = L[i].second - L[i - 1].second;

                LOG_TRACE(<< "Adding (" << x << ", " << m << ") to H");
                m_RawScoreHighQuantileSummary.add(x, m);
            }
        }
    }

    m_RawScoreQuantileSummary.add(discreteScore);

    if (discreteScore <= m_HighPercentileScore) {
        ++m_HighPercentileCount;
    } else {
        m_RawScoreHighQuantileSummary.add(discreteScore);
    }
    LOG_TRACE(<< "percentile = "
              << static_cast<double>(m_HighPercentileCount) / static_cast<double>(n + 1));

    // Periodically refresh the high percentile score.
    if ((n + 1) > k && (n + 1) % k == 0) {
        LOG_TRACE(<< "Refreshing high quantile summary");

        auto highPercentileCount = static_cast<std::uint64_t>(
            (HIGH_PERCENTILE / 100.0) * static_cast<double>(n + 1) + 0.5);

        LOG_TRACE(<< "s(H) = " << m_HighPercentileScore << ", c(H) = " << m_HighPercentileCount
                  << ", desired c(H) = " << highPercentileCount);

        if (m_HighPercentileCount > highPercentileCount) {
            TUInt32UInt64PrVec L;
            m_RawScoreQuantileSummary.summary(L);
            TUInt32UInt64PrVec H;
            m_RawScoreHighQuantileSummary.summary(H);

            std::size_t i0 = std::min(
                static_cast<std::size_t>(
                    std::lower_bound(L.begin(), L.end(), highPercentileCount,
                                     maths::common::COrderings::SSecondLess()) -
                    L.begin()),
                L.size() - 1);
            std::size_t j = std::min(
                static_cast<std::size_t>(
                    std::upper_bound(H.begin(), H.end(), L[i0],
                                     maths::common::COrderings::SFirstLess()) -
                    H.begin()),
                H.size() - 1);

            std::uint64_t r = L[i0].second;
            for (std::size_t i = i0 + 1;
                 i < L.size() && L[i0].second + m_RawScoreHighQuantileSummary.n() < n + 1;
                 ++i) {
                for (/**/; j < H.size() && H[j].first <= L[i].first; ++j) {
                    r += (H[j].second - (j == 0 ? static_cast<std::uint64_t>(0)
                                                : H[j - 1].second));
                }

                std::uint32_t x = L[i].first;
                std::uint64_t m = r < L[i].second ? L[i].second - r
                                                  : static_cast<std::uint64_t>(0);
                r += m;
                if (m > 0) {
                    LOG_TRACE(<< "Adding (" << x << ',' << m << ") to H");
                    m_RawScoreHighQuantileSummary.add(x, m);
                }
            }

            m_HighPercentileScore = L[i0].first;
            m_HighPercentileCount = L[i0].second;
            if (m_HighPercentileCount > n + 1) {
                LOG_ERROR(<< "Invalid c(H) " << m_HighPercentileCount);
                LOG_ERROR(<< "target " << highPercentileCount);
                LOG_ERROR(<< "L " << L);
                m_HighPercentileCount = n;
            }

            LOG_TRACE(<< "s(H) = " << m_HighPercentileScore
                      << ", c(H) = " << m_HighPercentileCount << ", percentile = "
                      << 100.0 * static_cast<double>(m_HighPercentileCount) /
                             static_cast<double>(n + 1)
                      << "%");
        } else {
            m_RawScoreQuantileSummary.quantile(HIGH_PERCENTILE / 100.0, m_HighPercentileScore);
            double lowerBound;
            double upperBound;
            m_RawScoreQuantileSummary.cdf(m_HighPercentileScore, 0.0, lowerBound, upperBound);
            m_HighPercentileCount = static_cast<std::uint64_t>(
                static_cast<double>(n + 1) * lowerBound + 0.5);

            LOG_TRACE(<< "s(H) = " << m_HighPercentileScore << ", c(H) = " << m_HighPercentileCount
                      << ", percentile = " << 100.0 * lowerBound << "%");
        }
    }

    return bigChange;
}