in lib/maths/time_series/CTimeSeriesTestForSeasonality.cc [408:608]
CSeasonalDecomposition CTimeSeriesTestForSeasonality::decompose() const {
// The quality of anomaly detection is sensitive to bias variance tradeoff.
// If you care about point predictions you can get away with erring on the side
// of overfitting slightly to avoid bias. We however care about the predicted
// distribution and are very sensitive to prediction variance. We therefore want
// to be careful to only model seasonality which adds significant predictive
// value.
//
// This is the main entry point to decomposing a signal into its seasonal
// components. This whole process is somewhat involved. Complications include
// all the various data characteristics we would like to deal with
// automatically. These include: signals polluted with outliers,
// discontinuities in the trend, discontinuities in the seasonality
// (i.e. scaling up or down) and missing
// values. We'd also like very high power for detecting the most common seasonal
// components, i.e. daily, weekly, weekday/weekend modulation and yearly
// (predictive calendar features are handled separately). We also run this
// continuously on various window lengths and it needs to produce a stable
// result if the seasonality is not changing whilst still being able to detect
// changes and to initialize new components with the right size (bias variance
// tradeoff) and seed values.
//
// The high-level strategy is:
// 1. Detrend the values under various assumptions: no trend, quadratic and
// piecewise linear.
// 2. Test for the seasonalities we already model, common diurnal seasonality
// and the best decomposition based on serial autocorrelation and select
// those hypotheses for which there is strong evidence.
// 3. Ensure there is good evidence that the signal is seasonal vs the best
// explanation for the values which only uses a trend.
// 4. Given a set of statistically significant seasonal hypotheses choose
// the one which will lead to the best modelling and avoids churn.
//
// I found it was far more effective to consider each hypothesis separately.
// The alternative pushes much more complexity into the step to actually fit
// the model. For example, one might try to simultaneously fit a piecewise
// linear trend and seasonality, but determining the right break points (if any)
// in the trend together with the appropriate scaled seasonal components is a
// non-trivial estimation problem. We take an ordered approach, first fitting
// the trend then the seasonality, and at each stage selecting only to fit
// significant effects. However, we also explicitly test simpler hypotheses,
// such as that there is no trend at all. This is much more forgiving to the
// estimation process since, if the data doesn't have a trend, not trying to
// fit one can easily be identified as the better choice after the fact. The
// final selection is based on a number of criteria which are geared towards
// our modelling techniques and are described in select.
if (this->checkInvariants() == false) {
LOG_ERROR(<< "Failed invariants. Not testing for seasonality.");
return {};
}
LOG_TRACE(<< "decomposing " << m_Values.size()
<< " values, bucket length = " << m_BucketLength);
// Short-circuit if we can't test any periods.
if (canTestPeriod(m_Values, 0, CSignal::seasonalComponentSummary(2)) == false) {
return {};
}
// If the values in the window are constant we should remove all components
// we can test.
double variance{[this] {
TMeanVarAccumulator moments;
for (const auto& value : m_Values) {
moments.add(common::CBasicStatistics::mean(value),
common::CBasicStatistics::count(value));
}
return common::CBasicStatistics::variance(moments);
}()};
if (variance == 0.0) {
CSeasonalDecomposition result;
result.add(CNewTrendSummary{m_ValuesStartTime, m_BucketLength, m_Values});
result.add(m_ModelledPeriodsTestable);
result.withinBucketVariance(m_SampleVariance);
return result;
}
TSizeVec trendSegments{TSegmentation::piecewiseLinear(
m_Values, m_SignificantPValue, m_OutlierFraction, m_MaximumNumberSegments)};
LOG_TRACE(<< "trend segments = " << trendSegments);
TRemoveTrend removeTrendModels[]{
[this](const TSeasonalComponentVec& /*periods*/,
TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) {
LOG_TRACE(<< "no trend");
values = m_Values;
modelTrendSegments.clear();
return true;
},
[this](const TSeasonalComponentVec& periods,
TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) {
// We wish to solve argmin_{t,s}{sum_i w_i (y_i - t(i) - s(i))^2}.
// Since t and s are linear functions of their parameters this is
// convex. The following exploits the fact that the minimizers of
// sum_i w_i (y_i - t(i))^2 and sum_i w_i (y_i - s(i))^2 are trivial
// to compute, and alternates between them, i.e. it performs a
// sequence of "line searches". In practice, this converges quickly
// so we use a fixed number of iterations.
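// Schematically, each pass of the fitting loop below computes:
//   r_i <- y_i - t(i)                        (detrend)
//   s   <- argmin_s sum_i w_i (r_i - s(i))^2 (refit seasonality)
//   r_i <- y_i - s(i)                        (deseasonalise)
//   t   <- argmin_t sum_i w_i (r_i - t(i))^2 (refit trend)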
LOG_TRACE(<< "quadratic trend");
using TRegression = common::CLeastSquaresOnlineRegression<2, double>;
modelTrendSegments.assign({0, values.size()});
TRegression trend;
TRegression::TArray parameters;
parameters.fill(0.0);
auto predictor = [&](std::size_t i) {
return TRegression::predict(parameters, static_cast<double>(i));
};
if (periods.empty()) {
values = m_Values;
for (std::size_t j = 0; j < values.size(); ++j) {
trend.add(static_cast<double>(j),
common::CBasicStatistics::mean(values[j]),
common::CBasicStatistics::count(values[j]));
}
// Note that parameters are referenced by predictor. Reading them
// here refreshes the values used for prediction. Computing the
// parameters is the bottleneck in this code and the same values
// are used for each prediction. We optimise removePredictions by
// reading them once upfront.
trend.parameters(parameters);
removePredictions(predictor, values);
return true;
}
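// Alternately refit the seasonal components and the trend a fixed
// number of times.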
for (std::size_t i = 0; i < 3; ++i) {
values = m_Values;
removePredictions(predictor, values);
CSignal::fitSeasonalComponents(periods, values, m_Components);
values = m_Values;
removePredictions({periods, 0, periods.size()},
{m_Components, 0, m_Components.size()}, values);
trend = TRegression{};
for (std::size_t j = 0; j < values.size(); ++j) {
trend.add(static_cast<double>(j),
common::CBasicStatistics::mean(values[j]),
common::CBasicStatistics::count(values[j]));
}
// See above.
trend.parameters(parameters);
}
values = m_Values;
removePredictions(predictor, values);
return true;
},
[&](const TSeasonalComponentVec& periods,
TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) {
if (trendSegments.size() <= 2) {
return false;
}
// We're only interested in applying segmentation when the number of
// segments is small w.r.t. the number of repeats. In such cases we
// can fit the trend first and suffer little loss in accuracy.
LOG_TRACE(<< trendSegments.size() - 1 << " linear trend segments");
modelTrendSegments = trendSegments;
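// Sums the currently fitted seasonal components' predictions at bucket i.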
auto predictor = [&](std::size_t i) {
double result{0.0};
for (std::size_t j = 0; j < periods.size(); ++j) {
result += periods[j].value(m_Components[j], i);
}
return result;
};
values = m_Values;
values = TSegmentation::removePiecewiseLinear(
std::move(values), modelTrendSegments, m_OutlierFraction);
if (periods.empty() == false) {
CSignal::fitSeasonalComponents(periods, values, m_Components);
values = m_Values;
removePredictions(predictor, values);
values = TSegmentation::removePiecewiseLinear(
std::move(values), modelTrendSegments, m_OutlierFraction);
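// Removing the negated predictions adds the seasonal components back:
// the result is the original values minus the piecewise linear trend,
// with that trend estimated on the deseasonalised values.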
removePredictions([&](std::size_t j) { return -predictor(j); }, values);
}
return true;
}};
try {
TModelVec decompositions;
decompositions.reserve(8 * std::size(removeTrendModels));
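// For each trend hypothesis add the candidate decompositions we consider:
// no seasonality, the periods we already model, the common diurnal periods
// and the periods with the highest serial autocorrelation.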
for (const auto& removeTrend : removeTrendModels) {
this->addNotSeasonal(removeTrend, decompositions);
this->addModelled(removeTrend, decompositions);
this->addDiurnal(removeTrend, decompositions);
this->addHighestAutocorrelation(removeTrend, decompositions);
}
return this->select(decompositions);
} catch (const std::exception& e) {
LOG_ERROR(<< "Seasonal decomposition failed: " << e.what());
}
return {};
}