void CSeasonalComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints)

in lib/maths/time_series/CSeasonalComponentAdaptiveBucketing.cc [355:505]


void CSeasonalComponentAdaptiveBucketing::refresh(const TFloatVec& oldEndpoints) {
    // Values are assigned based on their intersection with each
    // bucket in the previous configuration. The regression and
    // variance are computed using the appropriate combination
    // rules. Note that the count is weighted by the square of
    // the fractional intersection between the old and new buckets.
    // This means that the effective weight of buckets whose end
    // points change significantly is reduced. This is reasonable
    // because the periodic trend is assumed to be constant within
    // each bucket, when of course it is varying, so adjusting the
    // end points introduces error into the bucket value. We handle
    // this by reducing the value's significance in the new buckets.
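    //
    // For example, suppose an old bucket [0, 10) has count 8 and a
    // new bucket [0, 5) covers half of it. The fractional intersection
    // is w = 0.5, so the new bucket receives a count of w * w * 8 = 2
    // rather than the w * 8 = 4 it would receive if the trend really
    // were constant on [0, 10).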
    //
    // A better approximation is to assume that the trend is
    // continuous. In fact, this can be done by using a spline
    // with the constraint that the mean of the spline in each
    // interval is equal to the bucket's mean value. We can additionally
    // decompose the variance into a contribution from noise and
    // a contribution from the trend. Under these assumptions it
    // is then possible (but not trivial) to update the bucket
    // means and variances based on the new end point positions.
    // This might be worth considering at some point.

    using TDoubleMeanAccumulator = common::CBasicStatistics::SSampleMean<double>::TAccumulator;
    using TMinAccumulator = common::CBasicStatistics::SMin<core_t::TTime>::TAccumulator;

    std::size_t m{m_Buckets.size()};
    std::size_t n{oldEndpoints.size()};
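    // The m buckets are bounded by m + 1 end points, so the sizes
    // must be consistent.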
    if (m + 1 != n) {
        LOG_ERROR(<< "Inconsistent end points and regressions");
        return;
    }

    const TFloatVec& newEndpoints{this->endpoints()};
    const TFloatVec& oldCentres{this->centres()};
    const TFloatVec& oldLargeErrorCounts{this->largeErrorCounts()};

    TBucketVec buckets;
    TFloatVec newCentres;
    TFloatVec newLargeErrorCounts;
    buckets.reserve(m);
    newCentres.reserve(m);
    newLargeErrorCounts.reserve(m);

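    // Each new bucket [yl, yr] is assigned values from the old buckets
    // it intersects: a single old bucket if it is wholly contained in
    // one, otherwise a combination of partially and wholly overlapped
    // old buckets.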
    for (std::size_t i = 1; i < n; ++i) {
        double yl{newEndpoints[i - 1]};
        double yr{newEndpoints[i]};
        std::size_t r(std::lower_bound(oldEndpoints.begin(), oldEndpoints.end(), yr) -
                      oldEndpoints.begin());
        r = common::CTools::truncate(r, std::size_t(1), n - 1);

        std::size_t l(std::upper_bound(oldEndpoints.begin(), oldEndpoints.end(), yl) -
                      oldEndpoints.begin());
        l = common::CTools::truncate(l, std::size_t(1), r);
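        // After truncation, old buckets l - 1 through r - 1 are the
        // ones which intersect [yl, yr]; l == r means the new bucket
        // lies entirely within the single old bucket l - 1.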

        LOG_TRACE(<< "interval = [" << yl << "," << yr << "]");
        LOG_TRACE(<< "l = " << l << ", r = " << r);
        LOG_TRACE(<< "[x(l), x(r)] = [" << oldEndpoints[l - 1] << ","
                  << oldEndpoints[r] << "]");

        double xl{oldEndpoints[l - 1]};
        double xr{oldEndpoints[l]};
        if (l == r) {
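            // The new bucket is wholly contained in old bucket l - 1:
            // weight its count by the squared fractional intersection,
            // as per the comment at the top of this function, and clamp
            // its centre to the new interval.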
            double interval{newEndpoints[i] - newEndpoints[i - 1]};
            double w{common::CTools::truncate(interval / (xr - xl), 0.0, 1.0)};
            const SBucket& bucket{m_Buckets[l - 1]};
            buckets.emplace_back(bucket.s_Regression.scaled(w * w), bucket.s_Variance,
                                 bucket.s_FirstUpdate, bucket.s_LastUpdate);
            newCentres.push_back(common::CTools::truncate(
                static_cast<double>(oldCentres[l - 1]), yl, yr));
            newLargeErrorCounts.push_back(w * oldLargeErrorCounts[l - 1]);
        } else {
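            // The new bucket spans several old buckets: combine the
            // partial overlap with old bucket l - 1, any old buckets
            // wholly contained in the new one, and the partial overlap
            // with old bucket r - 1.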
            double interval{xr - newEndpoints[i - 1]};
            double w{common::CTools::truncate(interval / (xr - xl), 0.0, 1.0)};
            const SBucket* bucket{&m_Buckets[l - 1]};
            TMinAccumulator firstUpdate;
            TMinAccumulator lastUpdate;
            TDoubleRegression regression{bucket->s_Regression.scaled(w)};
            TDoubleMeanVarAccumulator variance{common::CBasicStatistics::momentsAccumulator(
                w * bucket->s_Regression.count(), bucket->s_Regression.mean(),
                static_cast<double>(bucket->s_Variance))};
            firstUpdate.add(bucket->s_FirstUpdate);
            lastUpdate.add(bucket->s_LastUpdate);
            TDoubleMeanAccumulator centre{common::CBasicStatistics::momentsAccumulator(
                w * bucket->s_Regression.count(), static_cast<double>(oldCentres[l - 1]))};
            double largeErrorCount{w * oldLargeErrorCounts[l - 1]};
            double count{w * w * bucket->s_Regression.count()};
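            // Accumulate the old buckets wholly contained in the new
            // one with unit weight.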
            while (++l < r) {
                bucket = &m_Buckets[l - 1];
                regression += bucket->s_Regression;
                variance += common::CBasicStatistics::momentsAccumulator(
                    bucket->s_Regression.count(), bucket->s_Regression.mean(),
                    static_cast<double>(bucket->s_Variance));
                firstUpdate.add(bucket->s_FirstUpdate);
                lastUpdate.add(bucket->s_LastUpdate);
                centre += common::CBasicStatistics::momentsAccumulator(
                    bucket->s_Regression.count(), static_cast<double>(oldCentres[l - 1]));
                largeErrorCount += oldLargeErrorCounts[l - 1];
                count += bucket->s_Regression.count();
            }
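            // Finally, add the rightmost old bucket, r - 1, weighted
            // by its fractional overlap with the new bucket.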
            xl = oldEndpoints[l - 1];
            xr = oldEndpoints[l];
            bucket = &m_Buckets[l - 1];
            interval = newEndpoints[i] - xl;
            w = common::CTools::truncate(interval / (xr - xl), 0.0, 1.0);
            regression += bucket->s_Regression.scaled(w);
            variance += common::CBasicStatistics::momentsAccumulator(
                w * bucket->s_Regression.count(), bucket->s_Regression.mean(),
                static_cast<double>(bucket->s_Variance));
            firstUpdate.add(bucket->s_FirstUpdate);
            lastUpdate.add(bucket->s_LastUpdate);
            centre += common::CBasicStatistics::momentsAccumulator(
                w * bucket->s_Regression.count(), static_cast<double>(oldCentres[l - 1]));
            largeErrorCount += w * oldLargeErrorCounts[l - 1];
            count += w * w * bucket->s_Regression.count();
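            // The regression was accumulated with weight w but the
            // count with weight w * w, so rescale the regression so
            // that its count matches the reduced weighting.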
            double scale{count == regression.count() ? 1.0 : count / regression.count()};
            buckets.emplace_back(regression.scaled(scale),
                                 common::CBasicStatistics::maximumLikelihoodVariance(variance),
                                 firstUpdate[0], lastUpdate[0]);
            newCentres.push_back(common::CTools::truncate(
                common::CBasicStatistics::mean(centre), yl, yr));
            newLargeErrorCounts.push_back(largeErrorCount);
        }
    }

    // We want all regressions to respond at the same rate to changes
    // in the trend. To achieve this we should assign them a weight
    // that is equal to the number of points they will receive in one
    // period.
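    // For example, with a total count of 100 spread over the period,
    // a bucket covering a tenth of the period is rescaled to carry a
    // count of 10.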
    double count{0.0};
    for (const auto& bucket : buckets) {
        count += bucket.s_Regression.count();
    }
    count /= (oldEndpoints[m] - oldEndpoints[0]);
    for (std::size_t i = 0; i < m; ++i) {
        double c{buckets[i].s_Regression.count()};
        if (c > 0.0) {
            buckets[i].s_Regression.scale(
                count * (oldEndpoints[i + 1] - oldEndpoints[i]) / c);
        }
    }

    LOG_TRACE(<< "old endpoints   = " << oldEndpoints);
    LOG_TRACE(<< "old centres     = " << oldCentres);
    LOG_TRACE(<< "new endpoints   = " << newEndpoints);
    LOG_TRACE(<< "new centres     = " << newCentres);
    m_Buckets.swap(buckets);
    this->centres().swap(newCentres);
    this->largeErrorCounts().swap(newLargeErrorCounts);
}
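
To illustrate the squared-intersection count weighting in isolation, here is a minimal standalone sketch. It is not part of the class: refreshedCounts is a hypothetical helper which assumes plain std::vector<double> end points and per-bucket counts in place of TFloatVec, and it finds overlaps by brute force rather than the binary searches used above.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Returns the count each new bucket receives, weighting each old
// bucket's count by the square of the fractional intersection.
std::vector<double> refreshedCounts(const std::vector<double>& oldEndpoints,
                                    const std::vector<double>& newEndpoints,
                                    const std::vector<double>& oldCounts) {
    std::vector<double> result;
    for (std::size_t i = 1; i < newEndpoints.size(); ++i) {
        double yl{newEndpoints[i - 1]};
        double yr{newEndpoints[i]};
        double count{0.0};
        for (std::size_t j = 1; j < oldEndpoints.size(); ++j) {
            double xl{oldEndpoints[j - 1]};
            double xr{oldEndpoints[j]};
            // Fractional intersection of old bucket j - 1 with [yl, yr].
            double overlap{std::max(0.0, std::min(xr, yr) - std::max(xl, yl))};
            double w{overlap / (xr - xl)};
            count += w * w * oldCounts[j - 1];
        }
        result.push_back(count);
    }
    return result;
}

int main() {
    // One old bucket [0, 10) with count 8 split into two new buckets.
    auto counts = refreshedCounts({0.0, 10.0}, {0.0, 5.0, 10.0}, {8.0});
    std::cout << counts[0] << " " << counts[1] << "\n"; // prints "2 2"
    return 0;
}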