void CAdaptiveBucketing::refine(core_t::TTime time)

in lib/maths/time_series/CAdaptiveBucketing.cc [395:550]
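
Redistributes the bucket end points so that each bucket carries roughly the
same share of the total averaging error at the supplied time, which
concentrates bucket resolution where the underlying function changes fastest.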


void CAdaptiveBucketing::refine(core_t::TTime time) {
    using TDoubleDoublePr = std::pair<double, double>;
    using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
    using TDoubleSizePr = std::pair<double, std::size_t>;
    using TMinMaxAccumulator = common::CBasicStatistics::CMinMax<TDoubleSizePr>;

    LOG_TRACE(<< "refining at " << time);

    if (m_Endpoints.size() < 2) {
        return;
    }

    // Check if any buckets should be split based on the large error counts.
    this->maybeSplitBucket();

    std::size_t n{m_Endpoints.size() - 1};
    double a{m_Endpoints[0]};
    double b{m_Endpoints[n]};

    // Extract the bucket means.
    TDoubleDoublePrVec values;
    values.reserve(n);
    for (std::size_t i = 0; i < n; ++i) {
        values.emplace_back(this->bucketCount(i), this->predict(i, time, m_Centres[i]));
    }
    LOG_TRACE(<< "values = " << values);

    // Compute the function range in each bucket, imposing periodic
    // boundary conditions at the start and end of the interval.
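    // For example, with n = 5 and i = 0 the five bucket window wraps
    // around to buckets {3, 4, 0, 1, 2}.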
    TDoubleVec ranges;
    ranges.reserve(n);
    for (std::size_t i = 0; i < n; ++i) {
        TDoubleDoublePr v[]{values[(n + i - 2) % n], values[(n + i - 1) % n],
                            values[(n + i + 0) % n], // centre
                            values[(n + i + 1) % n], values[(n + i + 2) % n]};

        TMinMaxAccumulator minmax;
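        // Only buckets which have actually received values contribute
        // to the range: a zero count means the bucket is empty.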
        for (std::size_t j = 0; j < std::size(v); ++j) {
            if (v[j].first > 0.0) {
                minmax.add({v[j].second, j});
            }
        }

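        // The range is weighted by the separation of its extrema in the
        // window (WEIGHTS is defined elsewhere in this file) and large
        // ranges are tempered by the 3/4 power.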
        if (minmax.initialized()) {
            ranges.push_back(WEIGHTS[minmax.max().second > minmax.min().second
                                         ? minmax.max().second - minmax.min().second
                                         : minmax.min().second - minmax.max().second] *
                             std::pow(minmax.max().first - minmax.min().first, 0.75));
        } else {
            ranges.push_back(0.0);
        }
    }

    // Smooth the ranges by convolving with a smoothing function.
    // We do this in the "time" domain because the smoothing
    // function is narrow. Estimate the averaging error in each
    // bucket by multiplying the smoothed range by the bucket width.
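    // Note WIDTH, defined elsewhere in this file, is expected to be the
    // half-width of SMOOTHING_FUNCTION so the convolution is centred on
    // bucket i.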
    TDoubleVec averagingErrors;
    averagingErrors.reserve(n);
    for (std::size_t i = 0; i < n; ++i) {
        double ai{m_Endpoints[i]};
        double bi{m_Endpoints[i + 1]};
        double error{0.0};
        for (std::size_t j = 0; j < std::size(SMOOTHING_FUNCTION); ++j) {
            error += SMOOTHING_FUNCTION[j] * ranges[(n + i + j - WIDTH) % n];
        }
        double h{bi - ai};
        error *= h / (b - a);
        averagingErrors.push_back(error);
    }
    double maxAveragingError{
        *std::max_element(averagingErrors.begin(), averagingErrors.end())};
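    // Buckets whose large error counts are significant are assigned the
    // maximum averaging error so that the refinement below concentrates
    // end points, and hence resolution, on them.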
    for (const auto& pValue : m_LargeErrorCountPValues) {
        if (pValue.first < MODERATE_SIGNIFICANCE) {
            averagingErrors[pValue.second] = maxAveragingError;
        }
    }
    double totalAveragingError{
        std::accumulate(averagingErrors.begin(), averagingErrors.end(), 0.0)};
    LOG_TRACE(<< "averagingErrors = " << averagingErrors);
    LOG_TRACE(<< "totalAveragingError = " << totalAveragingError);

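    // Each refined bucket should carry an equal share, "step", of the
    // total averaging error. The (1 - n * EPS) factor, with EPS defined
    // elsewhere in this file, shrinks the share slightly so rounding
    // errors can't leave the last interior end point unassigned.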
    double n_{static_cast<double>(n)};
    double step{(1.0 - n_ * EPS) * totalAveragingError / n_};
    TFloatVec endpoints{m_Endpoints};
    LOG_TRACE(<< "step = " << step);

    // If all the function values are identical then the end points
    // should be equidistant. We check step in case of underflow.
    if (step == 0.0) {
        m_Endpoints[0] = a;
        for (std::size_t i = 1; i < n; ++i) {
            m_Endpoints[i] = a + (b - a) * static_cast<double>(i) / n_;
        }
        m_Endpoints[n] = b;
    } else {
        // Noise in the bucket mean values creates a "high" frequency,
        // mean zero driving force on the bucket end points' desired
        // positions. Once the end points have stabilized on their
        // desired locations for the trend, we can detect this by
        // comparing the time averaged desired displacement with the
        // time averaged absolute desired displacement. The lower the
        // ratio, the more smoothing we apply. Note we want to damp
        // the noise out because the process of adjusting the bucket
        // end points loses a small amount of information; see the
        // comments at the start of refresh for more details.
        double alpha{
            ALPHA *
            (common::CBasicStatistics::mean(m_MeanAbsDesiredDisplacement) == 0.0
                 ? 1.0
                 : std::fabs(common::CBasicStatistics::mean(m_MeanDesiredDisplacement)) /
                       common::CBasicStatistics::mean(m_MeanAbsDesiredDisplacement))};
        LOG_TRACE(<< "alpha = " << alpha);
        double displacement{0.0};

        // Linearly interpolate between the current end points and points
        // separated by equal total averaging error. Interpolating is
        // equivalent to adding drag to the end point dynamics and damps
        // any oscillations.
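        // Concretely, the desired position of end point j in the bucket
        // [ai, bi] is ai + hi * ej / ei, i.e. the point at which the
        // cumulative averaging error reaches j * step assuming the error
        // accrues uniformly across the bucket, and the new position is
        // endpoints[j] + alpha * (desired - endpoints[j]).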
        double unassignedAveragingError{0.0};
        for (std::size_t i = 0, j = 1; i < n && j < n + 1; ++i) {
            double ai{endpoints[i]};
            double bi{endpoints[i + 1]};
            double hi{bi - ai};
            double ei{averagingErrors[i]};
            unassignedAveragingError += ei;
            for (double ej = step - (unassignedAveragingError - ei);
                 unassignedAveragingError >= step;
                 ej += step, unassignedAveragingError -= step, ++j) {
                double xj{hi * ej / ei};
                m_Endpoints[j] = std::max(
                    m_Endpoints[j - 1] + 1e-8 * std::fabs(m_Endpoints[j - 1]),
                    endpoints[j] + alpha * (ai + xj - endpoints[j]));
                displacement += (ai + xj) - endpoints[j];
                LOG_TRACE(<< "interval = [" << ai << "," << bi << "]"
                          << " averaging error / unit length = " << ei / hi << ", desired translation "
                          << endpoints[j] << " -> " << ai + xj);
            }
        }

        // Add a margin to guarantee we'll meet the separation constraint.
        spread(a, b, (1.0 + 1e-8) * m_MinimumBucketLength, m_Endpoints);

        // By construction, the first and last end points should be
        // close to "a" and "b", respectively, but we snap them to "a"
        // and "b" so that the total interval is unchanged.
        m_Endpoints[0] = a;
        m_Endpoints[n] = b;
        LOG_TRACE(<< "refinedEndpoints = " << m_Endpoints);

        m_MeanDesiredDisplacement.add(displacement);
        m_MeanAbsDesiredDisplacement.add(std::fabs(displacement));
    }

    this->refresh(endpoints);
}
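
For intuition, the following is a minimal standalone sketch (not part of the
library) of the core redistribution step: interior end points are placed so
that each refined bucket carries an equal share of the total averaging error,
assuming the error accrues uniformly within each old bucket. It omits the
alpha damping, the separation constraint and the EPS slack of the real
implementation, and the endpoint and error values are invented for
illustration.

#include <iostream>
#include <vector>

int main() {
    // Four buckets on [0, 100] with unevenly distributed averaging error.
    std::vector<double> endpoints{0.0, 25.0, 50.0, 75.0, 100.0};
    std::vector<double> errors{4.0, 1.0, 1.0, 2.0};

    std::size_t n{errors.size()};
    double total{0.0};
    for (double e : errors) {
        total += e;
    }
    double step{total / static_cast<double>(n)};

    std::vector<double> refined{endpoints.front()};
    double unassigned{0.0};
    for (std::size_t i = 0; i < n && refined.size() < n; ++i) {
        double ai{endpoints[i]};
        double hi{endpoints[i + 1] - ai};
        double ei{errors[i]};
        unassigned += ei;
        // Each time the cumulative error crosses a multiple of "step",
        // drop an end point a fraction ej / ei of the way through bucket i.
        for (double ej = step - (unassigned - ei);
             unassigned >= step && refined.size() < n;
             ej += step, unassigned -= step) {
            refined.push_back(ai + hi * ej / ei);
        }
    }
    // Snap the last end point to the end of the range.
    refined.push_back(endpoints.back());

    for (double x : refined) {
        std::cout << x << " "; // prints "0 12.5 25 75 100"
    }
    std::cout << '\n';
    return 0;
}

Note how the old bucket [0, 25], which carries half the total error, is split
in two while the low error buckets [25, 50] and [50, 75] merge: each refined
bucket ends up with an averaging error of exactly total / n = 2.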