// CSeasonalDecomposition CTimeSeriesTestForSeasonality::decompose()
//
// Excerpt from lib/maths/time_series/CTimeSeriesTestForSeasonality.cc [408:608]


CSeasonalDecomposition CTimeSeriesTestForSeasonality::decompose() const {

    // The quality of anomaly detection is sensitive to bias variance tradeoff.
    // If you care about point predictions you can get away with erring on the side
    // of overfitting slightly to avoid bias. We however care about the predicted
    // distribution and are very sensitive to prediction variance. We therefore want
    // to be careful to only model seasonality which adds significant predictive
    // value.
    //
    // This is the main entry point to decomposing a signal into its seasonal
    // components. This whole process is somewhat involved. Complications include
    // all the various data characteristics we would like to deal with automatically
    // these include: signals polluted with outliers, discontinuities in the trend,
    // discontinuities in the seasonality (i.e. scaling up or down) and missing
    // values. We'd also like very high power for detecting the most common seasonal
    // components, i.e. daily, weekly, weekday/weekend modulation and yearly
    // (predictive calendar features are handled separately). We also run this
    // continuously on various window lengths and it needs to produce a stable
    // result if the seasonality is not changing whilst still being able to detect
    // changes and to initialize new components with the right size (bias variance
    // tradeoff) and seed values.
    //
    // The high-level strategy is:
    //   1. For various trend assumptions, no trend, quadratic and piecewise linear,
    //   2. Test for the seasonalities we already model, common diurnal seasonality
    //      and the best decomposition based on serial autocorrelation and select
    //      those hypotheses for which there is strong evidence.
    //   3. Ensure there is good evidence that the signal is seasonal vs the best
    //      explanation for the values which only uses a trend.
    //   4. Given a set of statistically significant seasonal hypotheses choose
    //      the one which will lead to the best modelling and avoids churn.
    //
    // I found it was far more effective to consider each hypothesis separately.
    // The alternative pushes much more complexity into the step to actually fit
    // the model. For example, one might try and simultaneously fit piecewise linear
    // trend and seasonality, but for example determining the right break points
    // (if any) in the trend together with the appropriate scaled seasonal components
    // is a non trivial estimation problem. We take an ordered approach, first fitting
    // the trend then seasonality and selecting at each stage only to fit significant
    // effects. However, we also test simpler hypotheses, such that there is no
    // trend at all, explicitly. This is much more forgiving to the estimation
    // process since if the data doesn't have a trend not trying to fit one can
    // easily be identified as a better choice after the fact. The final selection
    // is based on a number of criterion which are geared towards our modelling
    // techniques and are described in select.

    // Bail out early rather than risk operating on inconsistent state.
    if (this->checkInvariants() == false) {
        LOG_ERROR(<< "Failed invariants. Not testing for seasonality.");
        return {};
    }

    LOG_TRACE(<< "decomposing " << m_Values.size()
              << " values, bucket length = " << m_BucketLength);

    // Shortcircuit if we can't test any periods. Period 2 is the shortest
    // conceivable seasonal component: if even that can't be tested on this
    // window then nothing can.
    if (canTestPeriod(m_Values, 0, CSignal::seasonalComponentSummary(2)) == false) {
        return {};
    }

    // If the values in the window are constant we should remove all components
    // we can test.
    //
    // The variance here is the count-weighted variance of the bucket means
    // across the whole window.
    double variance{[this] {
        TMeanVarAccumulator moments;
        for (const auto& value : m_Values) {
            moments.add(common::CBasicStatistics::mean(value),
                        common::CBasicStatistics::count(value));
        }
        return common::CBasicStatistics::variance(moments);
    }()};
    if (variance == 0.0) {
        // Constant signal: return a decomposition which keeps the trend but
        // flags every currently modelled period we are able to test (so the
        // caller can drop them).
        CSeasonalDecomposition result;
        result.add(CNewTrendSummary{m_ValuesStartTime, m_BucketLength, m_Values});
        result.add(m_ModelledPeriodsTestable);
        result.withinBucketVariance(m_SampleVariance);
        return result;
    }

    // Candidate knots for a piecewise linear trend, computed once up front and
    // shared by the piecewise linear hypothesis below.
    TSizeVec trendSegments{TSegmentation::piecewiseLinear(
        m_Values, m_SignificantPValue, m_OutlierFraction, m_MaximumNumberSegments)};
    LOG_TRACE(<< "trend segments = " << trendSegments);

    // One detrending strategy per trend assumption (none, quadratic, piecewise
    // linear). Each functor fills `values` with m_Values minus its fitted trend,
    // records the trend segment end points in `modelTrendSegments` and returns
    // whether the hypothesis is applicable to this window.
    TRemoveTrend removeTrendModels[]{
        [this](const TSeasonalComponentVec& /*periods*/,
               TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) {
            // Assume no trend: pass the values through unchanged.
            LOG_TRACE(<< "no trend");
            values = m_Values;
            modelTrendSegments.clear();
            return true;
        },
        [this](const TSeasonalComponentVec& periods,
               TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) {
            // We wish to solve argmin_{t,s}{sum_i w_i (y_i - t(i) - s(i))^2}.
            // Since t and s are linear functions of their parameters this is
            // convex. The following makes use of the fact that the minimizers
            // of sum_i w_i (y_i - t(i))^2 and sum_i w_i (y_i - s(i))^2 are
            // trivial to perform a sequence of "line searches". In practice,
            // this converges quickly so we use a fixed number of iterations.
            LOG_TRACE(<< "quadratic trend");
            using TRegression = common::CLeastSquaresOnlineRegression<2, double>;

            // A single segment spanning the whole window.
            modelTrendSegments.assign({0, values.size()});

            TRegression trend;
            TRegression::TArray parameters;
            parameters.fill(0.0);
            // Note: captures `parameters` by reference so refreshing it via
            // trend.parameters(parameters) below changes what this predicts.
            auto predictor = [&](std::size_t i) {
                return TRegression::predict(parameters, static_cast<double>(i));
            };

            if (periods.empty()) {
                // No seasonal components to alternate with: a single
                // weighted quadratic fit suffices.
                values = m_Values;
                for (std::size_t j = 0; j < values.size(); ++j) {
                    trend.add(static_cast<double>(j),
                              common::CBasicStatistics::mean(values[j]),
                              common::CBasicStatistics::count(values[j]));
                }
                // Note that parameters are referenced by predictor. Reading them
                // here refreshes the values used for prediction. Computing the
                // parameters is the bottleneck in this code and the same values
                // are used for each prediction. We optimise removePredictions by
                // reading them once upfront.
                trend.parameters(parameters);
                removePredictions(predictor, values);
                return true;
            }

            // Alternate fitting the seasonal components (on trend-removed
            // values) and the trend (on seasonality-removed values). Three
            // iterations are enough in practice (see comment above).
            for (std::size_t i = 0; i < 3; ++i) {
                values = m_Values;
                removePredictions(predictor, values);
                CSignal::fitSeasonalComponents(periods, values, m_Components);
                values = m_Values;
                removePredictions({periods, 0, periods.size()},
                                  {m_Components, 0, m_Components.size()}, values);
                trend = TRegression{};
                for (std::size_t j = 0; j < values.size(); ++j) {
                    trend.add(static_cast<double>(j),
                              common::CBasicStatistics::mean(values[j]),
                              common::CBasicStatistics::count(values[j]));
                }
                // See above.
                trend.parameters(parameters);
            }
            // Output the original values minus the final fitted trend.
            values = m_Values;
            removePredictions(predictor, values);
            return true;
        },
        [&](const TSeasonalComponentVec& periods,
            TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) {
            // Fewer than two interior break points means no piecewise
            // structure: this hypothesis doesn't apply.
            if (trendSegments.size() <= 2) {
                return false;
            }

            // We're only interested in applying segmentation when the number of
            // segments is small w.r.t. the number of repeats. In such cases we
            // can fit the trend first and suffer little loss in accuracy.
            LOG_TRACE(<< trendSegments.size() - 1 << " linear trend segments");
            modelTrendSegments = trendSegments;

            // Sum of the seasonal component predictions fitted so far.
            auto predictor = [&](std::size_t i) {
                double result{0.0};
                for (std::size_t j = 0; j < periods.size(); ++j) {
                    result += periods[j].value(m_Components[j], i);
                }
                return result;
            };

            values = m_Values;
            values = TSegmentation::removePiecewiseLinear(
                std::move(values), modelTrendSegments, m_OutlierFraction);

            if (periods.empty() == false) {
                // Refine: fit the seasonal components to the detrended values,
                // then refit the piecewise linear trend with seasonality
                // removed and finally re-add the seasonal predictions
                // (removing the negated predictor adds them back) so `values`
                // ends up as m_Values minus only the trend.
                CSignal::fitSeasonalComponents(periods, values, m_Components);
                values = m_Values;
                removePredictions(predictor, values);
                values = TSegmentation::removePiecewiseLinear(
                    std::move(values), modelTrendSegments, m_OutlierFraction);
                removePredictions([&](std::size_t j) { return -predictor(j); }, values);
            }

            return true;
        }};

    try {
        // Build the candidate decompositions for every trend hypothesis and
        // pick the best (see select for the criteria).
        TModelVec decompositions;
        // Reserve assuming up to eight candidate models per trend hypothesis.
        decompositions.reserve(8 * std::size(removeTrendModels));

        for (const auto& removeTrend : removeTrendModels) {
            this->addNotSeasonal(removeTrend, decompositions);
            this->addModelled(removeTrend, decompositions);
            this->addDiurnal(removeTrend, decompositions);
            this->addHighestAutocorrelation(removeTrend, decompositions);
        }

        return this->select(decompositions);

    } catch (const std::exception& e) {
        // Degrade gracefully: log and report "no seasonality found" rather
        // than propagate out of the test.
        LOG_ERROR(<< "Seasonal decomposition failed: " << e.what());
    }

    return {};
}