in lib/maths/time_series/CTimeSeriesTestForSeasonality.cc [610:811]
CSeasonalDecomposition CTimeSeriesTestForSeasonality::select(TModelVec& decompositions) const {
// If the existing seasonality couldn't be tested, short circuit: we'll keep it.
if (std::find_if(decompositions.begin(), decompositions.end(), [](const auto& decomposition) {
return decomposition.s_AlreadyModelled && decomposition.isTestable() == false;
}) != decompositions.end()) {
LOG_TRACE(<< "Not testable");
return {};
}
// Choose the hypothesis which yields the best explanation of the values.
// Sort by increasing complexity.
std::stable_sort(decompositions.begin(), decompositions.end(),
[](const auto& lhs, const auto& rhs) {
return lhs.numberParameters() < rhs.numberParameters();
});
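// computePValue compares hypothesis H1 with every null (non-seasonal)
// hypothesis H0 and returns the largest, i.e. most conservative, p-value,
// the corresponding log p-value proxy (which doesn't underflow) and the
// index of that H0.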
auto computePValue = [&](std::size_t H1) {
double pValueMax{-1.0};
double logPValueProxyMax{-std::numeric_limits<double>::max()};
std::size_t pValueMaxHypothesis{decompositions.size()};
for (std::size_t H0 = 0; H0 < decompositions.size(); ++H0) {
if (decompositions[H0].isNull()) {
double pValue{decompositions[H1].pValue(decompositions[H0])};
double logPValueProxy{decompositions[H1].logPValueProxy(decompositions[H0])};
if (pValue > pValueMax ||
(pValue == pValueMax && logPValueProxy > logPValueProxyMax)) {
std::tie(pValueMax, logPValueProxyMax, pValueMaxHypothesis) =
std::make_tuple(pValue, logPValueProxy, H0);
}
}
}
pValueMax = std::max(pValueMax, common::CTools::smallestProbability());
return std::make_tuple(pValueMax, logPValueProxyMax, pValueMaxHypothesis);
};
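// The explained variance features are computed against both the variance of
// all the values and the variance with the outlier fraction excluded (the
// "truncated" moments); both are passed to explainedVariancePerParameter for
// each candidate below.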
double variance{common::CBasicStatistics::maximumLikelihoodVariance(
this->truncatedMoments(0.0, m_Values))};
double truncatedVariance{common::CBasicStatistics::maximumLikelihoodVariance(
this->truncatedMoments(m_OutlierFraction, m_Values))};
LOG_TRACE(<< "variance = " << variance << " truncated variance = " << truncatedVariance);
// Select the best decomposition if it is a statistically significant improvement.
std::size_t selected{decompositions.size()};
double qualitySelected{-std::numeric_limits<double>::max()};
double minPValue{1.0};
std::size_t h0ForMinPValue{0};
std::size_t h1ForMinPValue{0};
for (std::size_t H1 = 0; H1 < decompositions.size(); ++H1) {
if (decompositions[H1].isAlternative()) {
double pValueVsH0;
double logPValueProxy;
std::size_t H0;
std::tie(pValueVsH0, logPValueProxy, H0) = computePValue(H1);
double logPValue{(pValueVsH0 == 1.0 ? -std::numeric_limits<double>::min()
: std::log(pValueVsH0))};
logPValueProxy = std::min(logPValueProxy, -std::numeric_limits<double>::min());
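// Clamp so that log(-logPValue) and log(-logPValueProxy) in the quality
// score below stay finite: a p-value of exactly 1, or a proxy of zero,
// would otherwise give log(0) = -infinity.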
double pValueToAccept{decompositions[H1].s_AlreadyModelled
? m_PValueToEvict
: m_AcceptedFalsePostiveRate};
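// Already modelled decompositions are retested against the eviction
// threshold; brand new decompositions must meet the accepted false positive
// rate. We also track the smallest p-value over all alternatives: it's used
// at the end to decide whether to evict every modelled component.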
if (pValueVsH0 < minPValue) {
minPValue = pValueVsH0;
h0ForMinPValue = H0;
h1ForMinPValue = H1;
}
LOG_TRACE(<< "hypothesis = " << decompositions[H1].s_Hypotheses);
LOG_TRACE(<< "p-value vs not periodic = " << pValueVsH0
<< ", log proxy = " << logPValueProxy);
if (pValueVsH0 > pValueToAccept) {
continue;
}
// We've rejected the hypothesis that the signal is not periodic.
//
// First check if this is significantly better than the currently selected
// model from an explained variance standpoint. If the test is ambiguous
// then fall back to selecting based on a number of criteria.
//
// We therefore choose the best model based on the following criteria:
// 1. The amount of variance explained per parameter. Models with a small
// period which explain most of the variance are preferred because they
// are more accurate and robust.
// 2. Whether the components are already modelled to avoid churn on marginal
// decisions.
// 3. The log p-value vs non-seasonal. This captures information about both
// the model size and the variance with and without the model.
// 4. Standard deviations above the mean of the F-distribution. This captures
// similar information to the log p-value but won't underflow.
// 5. The total target model size. The p-value is less sensitive to model
// size as the window length increases. However, for both stability
// and efficiency considerations we strongly prefer smaller models.
// 6. The number of scalings and pieces in the trend model. These increase
// false positives, so if we have a similarly good alternative hypothesis
// we use that one.
// 7. The number of repeats of the superposition of components we've seen.
// We penalise seeing fewer than two repeats to avoid using interference
// to fit changes in the seasonal pattern.
//
// Why sum the logs, you might ask? This makes the decision dimensionless.
// Consider that sum_i{ log(f_i) } < sum_i{ log(f_i') } is equivalent to
// sum_i{ log(f_i / f_i') } < 0, so if we scale each feature by a constant
// the constants cancel and we still make the same decision.
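// For example, with features (f_1, f_2) = (0.5, 4.0) for one model and
// (0.25, 8.0) for another, log(0.5) + log(4.0) = log(0.25) + log(8.0) = log(2.0)
// and the two are tied; rescaling f_1 (or f_2) by the same factor for both
// models leaves the comparison unchanged.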
//
// One can think of this as a smooth lexicographical order with the weights
// playing the role of the order: smaller weight values "break ties".
auto explainedVariancePerParameter =
decompositions[H1].explainedVariancePerParameter(variance, truncatedVariance);
double leastCommonRepeat{decompositions[H1].leastCommonRepeat()};
double pValueVsSelected{
selected < decompositions.size()
? decompositions[H1].pValue(decompositions[selected], 1e-3, m_SampleVariance)
: 1.0};
double scalings{decompositions[H1].numberScalings()};
double segments{
std::max(static_cast<double>(decompositions[H1].s_NumberTrendParameters / 2), 1.0) -
1.0};
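// segments is the number of extra piecewise linear trend segments (each
// segment accounts for two trend parameters); together with scalings it
// penalises heavily segmented fits (criterion 6 above).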
LOG_TRACE(<< "explained variance per param = " << explainedVariancePerParameter
<< ", scalings = " << scalings << ", segments = " << segments
<< ", number parameters = " << decompositions[H1].numberParameters()
<< ", p-value H1 vs selected = " << pValueVsSelected);
LOG_TRACE(<< "residual moments = " << decompositions[H1].s_ResidualMoments
<< ", truncated residual moments = " << decompositions[H1].s_TruncatedResidualMoments
<< ", sample variance = " << m_SampleVariance);
LOG_TRACE(<< "least common repeat = " << leastCommonRepeat);
double quality{1.0 * std::log(explainedVariancePerParameter(0)) +
1.0 * std::log(explainedVariancePerParameter(1)) +
0.7 * decompositions[H1].componentsSimilarity() +
0.5 * std::log(-logPValue) + 0.2 * std::log(-logPValueProxy) -
0.5 * std::log(decompositions[H1].targetModelSize()) -
0.3 * std::log(0.2 + common::CTools::pow2(scalings)) -
0.3 * std::log(1.0 + common::CTools::pow2(segments)) -
0.3 * std::log(std::max(leastCommonRepeat, 0.5))};
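// The bar to displace the current selection is its quality; it is only
// lowered when this hypothesis is a significant improvement on the selection
// (pValueVsSelected well below 0.01).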
double qualityToAccept{
qualitySelected -
std::log(1.0 + std::max(std::log(0.01 / pValueVsSelected), 0.0))};
LOG_TRACE(<< "target size = " << decompositions[H1].targetModelSize()
<< ", modelled = " << decompositions[H1].s_AlreadyModelled);
LOG_TRACE(<< "quality = " << quality << " to accept = " << qualityToAccept);
if (quality > qualityToAccept) {
std::tie(selected, qualitySelected) = std::make_pair(H1, quality);
LOG_TRACE(<< "selected " << selected);
}
}
}
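// If the selected decomposition introduces at least one new permitted
// component return the new trend, the new seasonal components and the mask
// of existing components to remove; otherwise fall through and check whether
// all modelled components should be evicted.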
CSeasonalDecomposition result;
if (selected < decompositions.size() &&
std::count_if(decompositions[selected].s_Hypotheses.begin(), // Are there new components?
decompositions[selected].s_Hypotheses.end(), [this](const auto& hypothesis) {
return hypothesis.s_Model &&
this->permittedPeriod(hypothesis.s_Period);
}) > 0) {
LOG_TRACE(<< "selected = " << decompositions[selected].s_Hypotheses);
result.add(CNewTrendSummary{m_ValuesStartTime, m_BucketLength,
std::move(decompositions[selected].s_TrendInitialValues)});
for (auto& hypothesis : decompositions[selected].s_Hypotheses) {
if (hypothesis.s_Model && this->permittedPeriod(hypothesis.s_Period)) {
LOG_TRACE(<< "Adding " << hypothesis.s_Period.print());
result.add(this->annotationText(hypothesis.s_Period),
hypothesis.s_Period, hypothesis.s_ModelSize,
this->periodDescriptor(hypothesis.s_Period.s_Period),
m_ValuesStartTime, m_BucketsStartTime,
m_BucketLength, m_StartOfWeekTimeOverride,
std::move(hypothesis.s_InitialValues));
}
}
result.add(std::move(decompositions[selected].s_RemoveComponentsMask));
result.withinBucketVariance(m_SampleVariance);
return result;
}
// Check if we should remove all components.
std::ptrdiff_t numberModelled(m_ModelledPeriods.size());
double fractionNotMissing{static_cast<double>(observedRange(m_Values)) /
static_cast<double>(m_Values.size())};
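// fractionNotMissing is a proxy for how much of the test window we've
// actually observed; we only evict modelled components when we've seen most
// of the window and the evidence for seasonality is weak.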
LOG_TRACE(<< "p-value min = " << minPValue);
LOG_TRACE(<< "fraction not missing = " << fractionNotMissing);
if ((numberModelled > 0 && // We're modelling seasonality
decompositions[h1ForMinPValue].isEvictionPermitted() && // The window is suitable
minPValue > m_PValueToEvict) && // We have weak evidence for seasonality
(common::fuzzyGreaterThan(minPValue / m_PValueToEvict, 1.0, 0.2) &&
common::fuzzyGreaterThan(fractionNotMissing, 1.0, 0.5)) // We've observed enough of the window
.boolean()) {
result.add(CNewTrendSummary{
m_ValuesStartTime, m_BucketLength,
std::move(decompositions[h0ForMinPValue].s_TrendInitialValues)});
result.add(std::move(decompositions[h0ForMinPValue].s_RemoveComponentsMask));
result.withinBucketVariance(m_SampleVariance);
}
return result;
}