CSeasonalDecomposition CTimeSeriesTestForSeasonality::select()

in lib/maths/time_series/CTimeSeriesTestForSeasonality.cc [610:811]


CSeasonalDecomposition CTimeSeriesTestForSeasonality::select(TModelVec& decompositions) const {

    // If the existing seasonality couldn't be tested, short-circuit: we'll keep it.

    if (std::find_if(decompositions.begin(), decompositions.end(), [](const auto& decomposition) {
            return decomposition.s_AlreadyModelled && decomposition.isTestable() == false;
        }) != decompositions.end()) {
        LOG_TRACE(<< "Not testable");
        return {};
    }

    // Choose the hypothesis which yields the best explanation of the values.

    // Sort by increasing complexity.
    std::stable_sort(decompositions.begin(), decompositions.end(),
                     [](const auto& lhs, const auto& rhs) {
                         return lhs.numberParameters() < rhs.numberParameters();
                     });

    auto computePValue = [&](std::size_t H1) {
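        // Compare the alternative hypothesis H1 against every null hypothesis and
        // return the largest p-value, its log proxy and the index of the null which
        // attains it, i.e. judge H1 against the null it improves on least.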
        double pValueMax{-1.0};
        double logPValueProxyMax{-std::numeric_limits<double>::max()};
        std::size_t pValueMaxHypothesis{decompositions.size()};
        for (std::size_t H0 = 0; H0 < decompositions.size(); ++H0) {
            if (decompositions[H0].isNull()) {
                double pValue{decompositions[H1].pValue(decompositions[H0])};
                double logPValueProxy{decompositions[H1].logPValueProxy(decompositions[H0])};
                if (pValue > pValueMax ||
                    (pValue == pValueMax && logPValueProxy > logPValueProxyMax)) {
                    std::tie(pValueMax, logPValueProxyMax, pValueMaxHypothesis) =
                        std::make_tuple(pValue, logPValueProxy, H0);
                }
            }
        }
        pValueMax = std::max(pValueMax, common::CTools::smallestProbability());
        return std::make_tuple(pValueMax, logPValueProxyMax, pValueMaxHypothesis);
    };

    double variance{common::CBasicStatistics::maximumLikelihoodVariance(
        this->truncatedMoments(0.0, m_Values))};
    double truncatedVariance{common::CBasicStatistics::maximumLikelihoodVariance(
        this->truncatedMoments(m_OutlierFraction, m_Values))};
    LOG_TRACE(<< "variance = " << variance << " truncated variance = " << truncatedVariance);

    // Select the best decomposition if it is a statistically significant improvement.
    std::size_t selected{decompositions.size()};
    double qualitySelected{-std::numeric_limits<double>::max()};
    double minPValue{1.0};
    std::size_t h0ForMinPValue{0};
    std::size_t h1ForMinPValue{0};
    for (std::size_t H1 = 0; H1 < decompositions.size(); ++H1) {
        if (decompositions[H1].isAlternative()) {
            double pValueVsH0;
            double logPValueProxy;
            std::size_t H0;
            std::tie(pValueVsH0, logPValueProxy, H0) = computePValue(H1);
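            // The special case and clamp below keep -logPValue and -logPValueProxy
            // strictly positive because we take their logarithms in the quality
            // score further down.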
            double logPValue{(pValueVsH0 == 1.0 ? -std::numeric_limits<double>::min()
                                                : std::log(pValueVsH0))};
            logPValueProxy = std::min(logPValueProxy, -std::numeric_limits<double>::min());
            double pValueToAccept{decompositions[H1].s_AlreadyModelled
                                      ? m_PValueToEvict
                                      : m_AcceptedFalsePostiveRate};
            if (pValueVsH0 < minPValue) {
                minPValue = pValueVsH0;
                h0ForMinPValue = H0;
                h1ForMinPValue = H1;
            }
            LOG_TRACE(<< "hypothesis = " << decompositions[H1].s_Hypotheses);
            LOG_TRACE(<< "p-value vs not periodic = " << pValueVsH0
                      << ", log proxy = " << logPValueProxy);

            if (pValueVsH0 > pValueToAccept) {
                continue;
            }

            // We've rejected the hypothesis that the signal is not periodic.
            //
            // First check if this is significantly better than the currently selected
            // model from an explained variance standpoint. If the test is ambiguous
            // then fall back to selecting based on a number of criteria.
            //
            // We therefore choose the best model based on the following criteria:
            //   1. The amount of variance explained per parameter. Models with a small
            //      period which explain most of the variance are preferred because they
            //      are more accurate and robust.
            //   2. Whether the components are already modelled to avoid churn on marginal
            //      decisions.
            //   3. The log p-value vs non-seasonal. This captures information about both
            //      the model size and the variance with and without the model.
            //   4. Standard deviations above the mean of the F-distribution. This captures
            //      similar information to the log p-value but won't underflow.
            //   5. The total target model size. The p-value is less sensitive to model
            //      size as the window length increases. However, for both stability
            //      and efficiency considerations we strongly prefer smaller models.
            //   6. The number of scalings and pieces in the trend model. These increase
            //      false positives, so if we have a similarly good alternative
            //      hypothesis we use that one.
            //   7. The number of repeats of the superposition of components we've seen.
            //      We penalise seeing fewer than two repeats to avoid using interference
            //      to fit changes in the seasonal pattern.
            //
            // Why sum the logs, you might ask? Because it makes the decision
            // dimensionless. Consider that sum_i{ log(f_i) } < sum_i{ log(f_i') }
            // is equivalent to sum_i{ log(f_i / f_i') } < 0, so if we scale each
            // feature by a constant the scales cancel and we still make the same
            // decision.
            //
            // One can think of this as a smooth lexicographical order with the weights
            // playing the role of the order: smaller weight values "break ties".
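            //
            // As an illustrative (made up) example with unit weights: for features
            // f = (2, 8) and f' = (4, 3) we have log(2) + log(8) = log(16) >
            // log(4) + log(3) = log(12), so f wins. Rescaling the first feature of
            // both candidates by 10 gives log(160) > log(120), so the decision is
            // unchanged: the common factor cancels in the difference of the sums.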

            auto explainedVariancePerParameter =
                decompositions[H1].explainedVariancePerParameter(variance, truncatedVariance);
            double leastCommonRepeat{decompositions[H1].leastCommonRepeat()};
            double pValueVsSelected{
                selected < decompositions.size()
                    ? decompositions[H1].pValue(decompositions[selected], 1e-3, m_SampleVariance)
                    : 1.0};
            double scalings{decompositions[H1].numberScalings()};
            double segments{
                std::max(static_cast<double>(decompositions[H1].s_NumberTrendParameters / 2), 1.0) -
                1.0};
            LOG_TRACE(<< "explained variance per param = " << explainedVariancePerParameter
                      << ", scalings = " << scalings << ", segments = " << segments
                      << ", number parameters = " << decompositions[H1].numberParameters()
                      << ", p-value H1 vs selected = " << pValueVsSelected);
            LOG_TRACE(<< "residual moments = " << decompositions[H1].s_ResidualMoments
                      << ", truncated residual moments = " << decompositions[H1].s_TruncatedResidualMoments
                      << ", sample variance = " << m_SampleVariance);
            LOG_TRACE(<< "least common repeat = " << leastCommonRepeat);

            double quality{1.0 * std::log(explainedVariancePerParameter(0)) +
                           1.0 * std::log(explainedVariancePerParameter(1)) +
                           0.7 * decompositions[H1].componentsSimilarity() +
                           0.5 * std::log(-logPValue) + 0.2 * std::log(-logPValueProxy) -
                           0.5 * std::log(decompositions[H1].targetModelSize()) -
                           0.3 * std::log(0.2 + common::CTools::pow2(scalings)) -
                           0.3 * std::log(1.0 + common::CTools::pow2(segments)) -
                           0.3 * std::log(std::max(leastCommonRepeat, 0.5))};
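            // Accept H1 over the currently selected decomposition if its quality
            // is high enough; the bar is lowered when H1 explains significantly
            // more of the variance than the selected decomposition, i.e. when
            // pValueVsSelected is well below 0.01.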
            double qualityToAccept{
                qualitySelected -
                std::log(1.0 + std::max(std::log(0.01 / pValueVsSelected), 0.0))};
            LOG_TRACE(<< "target size = " << decompositions[H1].targetModelSize()
                      << ", modelled = " << decompositions[H1].s_AlreadyModelled);
            LOG_TRACE(<< "quality = " << quality << " to accept = " << qualityToAccept);

            if (quality > qualityToAccept) {
                std::tie(selected, qualitySelected) = std::make_pair(H1, quality);
                LOG_TRACE(<< "selected " << selected);
            }
        }
    }

    CSeasonalDecomposition result;

    if (selected < decompositions.size() &&
        std::count_if(decompositions[selected].s_Hypotheses.begin(), // Are there new components?
                      decompositions[selected].s_Hypotheses.end(), [this](const auto& hypothesis) {
                          return hypothesis.s_Model &&
                                 this->permittedPeriod(hypothesis.s_Period);
                      }) > 0) {
        LOG_TRACE(<< "selected = " << decompositions[selected].s_Hypotheses);

        result.add(CNewTrendSummary{m_ValuesStartTime, m_BucketLength,
                                    std::move(decompositions[selected].s_TrendInitialValues)});
        for (auto& hypothesis : decompositions[selected].s_Hypotheses) {
            if (hypothesis.s_Model && this->permittedPeriod(hypothesis.s_Period)) {
                LOG_TRACE(<< "Adding " << hypothesis.s_Period.print());
                result.add(this->annotationText(hypothesis.s_Period),
                           hypothesis.s_Period, hypothesis.s_ModelSize,
                           this->periodDescriptor(hypothesis.s_Period.s_Period),
                           m_ValuesStartTime, m_BucketsStartTime,
                           m_BucketLength, m_StartOfWeekTimeOverride,
                           std::move(hypothesis.s_InitialValues));
            }
        }
        result.add(std::move(decompositions[selected].s_RemoveComponentsMask));
        result.withinBucketVariance(m_SampleVariance);
        return result;
    }

    // Check if we should remove all components.
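    // Removal is expressed by returning a result containing only a new trend
    // summary and the mask of the existing components to remove.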

    std::ptrdiff_t numberModelled(m_ModelledPeriods.size());
    double fractionNotMissing{static_cast<double>(observedRange(m_Values)) /
                              static_cast<double>(m_Values.size())};
    LOG_TRACE(<< "p-value min = " << minPValue);
    LOG_TRACE(<< "fraction not missing = " << fractionNotMissing);

    if ((numberModelled > 0 && // We're modelling seasonality
         decompositions[h1ForMinPValue].isEvictionPermitted() && // The window is suitable
         minPValue > m_PValueToEvict) && // We have weak evidence for seasonality
        (common::fuzzyGreaterThan(minPValue / m_PValueToEvict, 1.0, 0.2) &&
         common::fuzzyGreaterThan(fractionNotMissing, 1.0, 0.5)) // We've observed enough of the window
            .boolean()) {
        result.add(CNewTrendSummary{
            m_ValuesStartTime, m_BucketLength,
            std::move(decompositions[h0ForMinPValue].s_TrendInitialValues)});
        result.add(std::move(decompositions[h0ForMinPValue].s_RemoveComponentsMask));
        result.withinBucketVariance(m_SampleVariance);
    }

    return result;
}
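
For intuition, the following standalone sketch illustrates the weighted log-sum
("smooth lexicographical") scoring pattern which the quality calculation above
uses. The Candidate struct, its feature names and the weights are hypothetical
stand-ins rather than the library's types; only the pattern of summing weighted
log features and keeping the best-scoring candidate mirrors select().

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

namespace {
// Hypothetical summary of a candidate decomposition: a few positive features.
struct Candidate {
    double explainedVariancePerParameter;
    double targetModelSize;
    double numberScalings;
};

// Higher is better. Summing logs makes the comparison dimensionless: rescaling
// any one feature by the same constant for every candidate shifts every score
// by the same amount, so the best-scoring candidate is unchanged.
double quality(const Candidate& candidate) {
    return 1.0 * std::log(candidate.explainedVariancePerParameter) -
           0.5 * std::log(candidate.targetModelSize) -
           0.3 * std::log(0.2 + candidate.numberScalings * candidate.numberScalings);
}
}

int main() {
    std::vector<Candidate> candidates{{0.8, 24.0, 0.0}, {0.9, 96.0, 2.0}};
    auto best = std::max_element(candidates.begin(), candidates.end(),
                                 [](const auto& lhs, const auto& rhs) {
                                     return quality(lhs) < quality(rhs);
                                 });
    std::cout << "selected candidate " << (best - candidates.begin()) << "\n";
    return 0;
}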