lib/maths/time_series/CTimeSeriesTestForSeasonality.cc

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #include <maths/time_series/CTimeSeriesTestForSeasonality.h> #include <core/CLogger.h> #include <core/CTimeUtils.h> #include <core/CVectorRange.h> #include <core/Constants.h> #include <maths/common/CBasicStatistics.h> #include <maths/common/CFuzzyLogic.h> #include <maths/common/CIntegerTools.h> #include <maths/common/CLeastSquaresOnlineRegression.h> #include <maths/common/CLeastSquaresOnlineRegressionDetail.h> #include <maths/common/COrderings.h> #include <maths/common/CSetTools.h> #include <maths/common/CStatisticalTests.h> #include <maths/common/CTools.h> #include <maths/common/Constants.h> #include <maths/common/MathsTypes.h> #include <maths/time_series/CSeasonalComponent.h> #include <maths/time_series/CSeasonalTime.h> #include <maths/time_series/CSignal.h> #include <maths/time_series/CTimeSeriesSegmentation.h> #include <boost/iterator/counting_iterator.hpp> #include <boost/math/distributions/binomial.hpp> #include <boost/math/distributions/fisher_f.hpp> #include <algorithm> #include <cstddef> #include <exception> #include <limits> #include <numeric> #include <random> #include <sstream> namespace ml { namespace maths { namespace time_series { namespace { bool almostDivisor(std::size_t i, std::size_t j, double eps) { if (i > j) { return false; } // Check if j mod i < eps * j, i.e. that i is a divisor of j up to epsilon // relative difference. double diff{static_cast<double>(std::min(j % i, i - (j % i))) / static_cast<double>(j)}; return diff < eps; } double interpolateOnBucketOccupancy(double f1, double f0, double occupancy) { // We use piecewise linear interpolation with slightly higher rate of test // metrics for higher occupancy. Testing showed this provided a better // precision recall tradeoff. occupancy *= 2.0; double f0p5{0.7 * f0 + 0.3 * f1}; return occupancy < 0.5 ? common::CTools::linearlyInterpolate(0.0, 0.5, f0, f0p5, occupancy) : common::CTools::linearlyInterpolate(0.5, 1.0, f0p5, f1, occupancy); } } CNewTrendSummary::CNewTrendSummary(core_t::TTime initialValuesStartTime, core_t::TTime bucketLength, TFloatMeanAccumulatorVec initialValues) : m_InitialValuesStartTime{initialValuesStartTime}, m_BucketLength{bucketLength}, m_InitialValues{std::move(initialValues)} { } core_t::TTime CNewTrendSummary::initialValuesStartTime() const { return m_InitialValuesStartTime; } core_t::TTime CNewTrendSummary::initialValuesEndTime() const { return m_InitialValuesStartTime + static_cast<core_t::TTime>(m_InitialValues.size()) * m_BucketLength; } core_t::TTime CNewTrendSummary::bucketLength() const { return m_BucketLength; } const CNewTrendSummary::TFloatMeanAccumulatorVec& CNewTrendSummary::initialValues() const { return m_InitialValues; } CNewSeasonalComponentSummary::CNewSeasonalComponentSummary( std::string annotationText, const TSeasonalComponent& period, std::size_t size, EPeriodDescriptor periodDescriptor, core_t::TTime initialValuesStartTime, core_t::TTime bucketsStartTime, core_t::TTime bucketLength, TOptionalTime startOfWeekTime, TFloatMeanAccumulatorVec initialValues) : m_AnnotationText{std::move(annotationText)}, m_Period{period}, m_Size{size}, m_PeriodDescriptor{periodDescriptor}, m_InitialValuesStartTime{initialValuesStartTime}, m_BucketsStartTime{bucketsStartTime}, m_BucketLength{bucketLength}, m_StartOfWeekTime{startOfWeekTime}, m_InitialValues{std::move(initialValues)} { } const std::string& CNewSeasonalComponentSummary::annotationText() const { return m_AnnotationText; } std::size_t CNewSeasonalComponentSummary::size() const { return m_Size; } bool CNewSeasonalComponentSummary::isOneOf(int periods) const { return (m_PeriodDescriptor & periods) != 0; } CNewSeasonalComponentSummary::TSeasonalTimeUPtr CNewSeasonalComponentSummary::seasonalTime() const { auto interval = [this](std::size_t i) { return m_BucketLength * static_cast<core_t::TTime>(i); }; if ((m_PeriodDescriptor & E_Diurnal) != 0) { core_t::TTime period{[this] { if (m_PeriodDescriptor == E_Day) { return core::constants::DAY; } if (m_PeriodDescriptor == E_Week) { return core::constants::WEEK; } LOG_ERROR(<< "Unexpected descriptor '" << m_PeriodDescriptor << "'"); return core::constants::DAY; }()}; core_t::TTime windowStart{[&] { core_t::TTime times[]{0, 2 * core::constants::DAY}; core_t::TTime start{interval(m_Period.s_Window.first)}; return *std::min_element(std::begin(times), std::end(times), [&](const auto& lhs, const auto& rhs) { return std::abs(lhs - start) < std::abs(rhs - start); }); }()}; core_t::TTime windowEnd{[&] { core_t::TTime times[]{2 * core::constants::DAY, 7 * core::constants::DAY}; core_t::TTime start{interval(m_Period.s_Window.second)}; return *std::min_element(std::begin(times), std::end(times), [&](const auto& lhs, const auto& rhs) { return std::abs(lhs - start) < std::abs(rhs - start); }); }()}; if (m_Period.windowed()) { core_t::TTime startOfWeek{ m_StartOfWeekTime ? *m_StartOfWeekTime : (m_BucketsStartTime + interval(m_Period.s_StartOfWeek)) % core::constants::WEEK}; return std::make_unique<CDiurnalTime>(startOfWeek, windowStart, windowEnd, period); } return std::make_unique<CGeneralPeriodTime>(period); } if (m_PeriodDescriptor == E_Year) { return std::make_unique<CGeneralPeriodTime>(core::constants::YEAR); } return std::make_unique<CGeneralPeriodTime>(interval(m_Period.s_Period)); } core_t::TTime CNewSeasonalComponentSummary::initialValuesStartTime() const { return m_InitialValuesStartTime; } core_t::TTime CNewSeasonalComponentSummary::initialValuesEndTime() const { return m_InitialValuesStartTime + static_cast<core_t::TTime>(m_InitialValues.size()) * m_BucketLength; } core_t::TTime CNewSeasonalComponentSummary::bucketLength() const { return m_BucketLength; } const CNewSeasonalComponentSummary::TFloatMeanAccumulatorVec& CNewSeasonalComponentSummary::initialValues() const { return m_InitialValues; } std::string CNewSeasonalComponentSummary::print() const { std::ostringstream result; result << m_BucketLength * m_Period.s_Period; if (m_Period.windowed()) { result << "/(" << m_BucketLength * m_Period.s_Window.first << "," << m_BucketLength * m_Period.s_Window.second << ")"; } return result.str(); } CSeasonalComponent CNewSeasonalComponentSummary::createSeasonalComponent(double decayRate, double bucketLength) const { auto time = this->seasonalTime(); core_t::TTime period{time->period()}; core_t::TTime startTime{this->initialValuesStartTime()}; core_t::TTime endTime{this->initialValuesEndTime()}; core_t::TTime maxTimeShiftPerPeriod{ this->isOneOf(CNewSeasonalComponentSummary::E_Day | CNewSeasonalComponentSummary::E_Week | CNewSeasonalComponentSummary::E_Year) ? 0 : this->bucketLength() / 2}; const auto& initialValues = this->initialValues(); // If we see multiple repeats of the component in the window we use // a periodic boundary condition, which ensures that the prediction // at the repeat is continuous. auto boundaryCondition = period > time->windowLength() ? common::CSplineTypes::E_Natural : common::CSplineTypes::E_Periodic; CSeasonalComponent component(*time, this->size(), decayRate, bucketLength, maxTimeShiftPerPeriod, boundaryCondition); component.initialize(startTime, endTime, initialValues); return component; } void CSeasonalDecomposition::add(CNewTrendSummary trend) { m_Trend = std::move(trend); } void CSeasonalDecomposition::add(std::string annotationText, const TSeasonalComponent& period, std::size_t size, TPeriodDescriptor periodDescriptor, core_t::TTime initialValuesStartTime, core_t::TTime bucketsStartTime, core_t::TTime bucketLength, TOptionalTime startOfWeekTime, TFloatMeanAccumulatorVec initialValues) { m_Seasonal.emplace_back(std::move(annotationText), period, size, periodDescriptor, initialValuesStartTime, bucketsStartTime, bucketLength, startOfWeekTime, std::move(initialValues)); } void CSeasonalDecomposition::add(TBoolVec seasonalToRemoveMask) { m_SeasonalToRemoveMask = std::move(seasonalToRemoveMask); } void CSeasonalDecomposition::withinBucketVariance(double variance) { m_WithinBucketVariance = variance; } bool CSeasonalDecomposition::componentsChanged() const { return m_Seasonal.empty() == false || std::count(m_SeasonalToRemoveMask.begin(), m_SeasonalToRemoveMask.end(), true) > 0; } const CNewTrendSummary* CSeasonalDecomposition::trend() const { return m_Trend != std::nullopt ? &(*m_Trend) : nullptr; } const CSeasonalDecomposition::TNewSeasonalComponentVec& CSeasonalDecomposition::seasonal() const { return m_Seasonal; } const CSeasonalDecomposition::TBoolVec& CSeasonalDecomposition::seasonalToRemoveMask() const { return m_SeasonalToRemoveMask; } double CSeasonalDecomposition::withinBucketVariance() const { return m_WithinBucketVariance; } std::string CSeasonalDecomposition::print() const { return core::CContainerPrinter::print(m_Seasonal); } CTimeSeriesTestForSeasonality::CTimeSeriesTestForSeasonality(core_t::TTime valuesStartTime, core_t::TTime bucketsStartTime, core_t::TTime bucketLength, core_t::TTime sampleInterval, TFloatMeanAccumulatorVec values, double occupancy, double outlierFraction) // We require greater evidence to model seasonality for sparse data because // we're more prone to falsely detecting seasonality in this case and when // we do it tends to produce more confusing anomaly detection. : m_MaximumNumberSegments{static_cast<std::size_t>( interpolateOnBucketOccupancy(static_cast<double>(MAXIMUM_NUMBER_SEGMENTS), 1.0, occupancy))}, m_MinimumRepeatsPerSegmentToTestVariance{ interpolateOnBucketOccupancy(3.0, 6.0, occupancy)}, m_MinimumRepeatsPerSegmentToTestAmplitude{ interpolateOnBucketOccupancy(5.0, 10.0, occupancy)}, m_LowAutocorrelation{interpolateOnBucketOccupancy(0.3, 0.5, occupancy)}, m_MediumAutocorrelation{interpolateOnBucketOccupancy(0.5, 0.65, occupancy)}, m_HighAutocorrelation{interpolateOnBucketOccupancy(0.7, 0.75, occupancy)}, m_ValuesStartTime{valuesStartTime}, m_BucketsStartTime{bucketsStartTime}, m_BucketLength{bucketLength}, m_SampleInterval{sampleInterval}, m_OutlierFraction{outlierFraction}, m_Values{std::move(values)}, m_Outliers{static_cast<std::size_t>(std::max( outlierFraction * static_cast<double>(CSignal::countNotMissing(m_Values)) + 0.5, 1.0))} { TMeanVarAccumulator moments{this->truncatedMoments(m_OutlierFraction, m_Values)}; TMeanVarAccumulator meanAbs{this->truncatedMoments( m_OutlierFraction, m_Values, [](const TFloatMeanAccumulator& value) { return std::fabs(common::CBasicStatistics::mean(value)); })}; // Note we don't bother modelling seasonality whose amplitude is too small // compared to the absolute values. We won't raise anomalies for differences // from our predictions which are smaller than this anyway. m_EpsVariance = std::max( common::CTools::pow2(1000.0 * std::numeric_limits<double>::epsilon()) * common::CBasicStatistics::maximumLikelihoodVariance(moments), common::CTools::pow2(common::MINIMUM_COEFFICIENT_OF_VARIATION * std::max(common::CBasicStatistics::mean(meanAbs), 1e-8))); LOG_TRACE(<< "eps variance = " << m_EpsVariance); } bool CTimeSeriesTestForSeasonality::canTestModelledComponent( const TFloatMeanAccumulatorVec& values, core_t::TTime bucketsStartTime, core_t::TTime bucketLength, core_t::TTime minimumPeriod, std::size_t minimumResolution, const CSeasonalTime& component) { std::size_t minimumPeriodInBuckets{ std::max(buckets(bucketLength, minimumPeriod), minimumResolution)}; return 100 * (component.period() % bucketLength) < component.period() && canTestPeriod(values, minimumPeriodInBuckets, toPeriod(bucketsStartTime, bucketLength, component)); } void CTimeSeriesTestForSeasonality::addModelledSeasonality(const CSeasonalTime& component, std::size_t minimumResolution, std::size_t size) { auto period = toPeriod(m_BucketsStartTime, m_BucketLength, component); m_ModelledPeriods.push_back(period); m_ModelledPeriodsSizes.push_back(size); m_ModelledPeriodsTestable.push_back( canTestModelledComponent(m_Values, m_BucketsStartTime, m_BucketLength, m_MinimumPeriod, minimumResolution, component)); if (period.windowed()) { m_StartOfWeekOverride = period.s_StartOfWeek; // We need the actual time in case it isn't a multiple of the bucket length // after the start of the window. m_StartOfWeekTimeOverride = component.windowRepeatStart(); } } void CTimeSeriesTestForSeasonality::modelledSeasonalityPredictor(TPredictor predictor) { m_ModelledPredictor = std::move(predictor); } void CTimeSeriesTestForSeasonality::prepareWindowForDecompose() { // Although we precondition by removing untestable modelled component predictions // there can be errors. This is problematic when the period is too short to test // and can induce spurious apparent seasonality at a multiple of its period. This // fits and removes any seasonality from the prediction error. TSeasonalComponentVec untestable; untestable.reserve(m_ModelledPeriods.size()); std::size_t minimumPeriod{buckets(m_BucketLength, m_MinimumPeriod)}; for (const auto& period : m_ModelledPeriods) { if (period.period() > 1 && periodTooShortToTest(minimumPeriod, period)) { untestable.push_back(period); } } if (untestable.empty() == false) { TMeanAccumulatorVecVec components; CSignal::fitSeasonalComponentsRobust(untestable, m_OutlierFraction, m_Values, components); removePredictions({untestable, 0, untestable.size()}, {components, 0, components.size()}, m_Values); } // Remove extremely outlying values for which reweighting is insufficient. This // is defined as a contiguous range of much smaller or larger than typical values. // We need to be careful with the definition we use because, for example, periodic // spikes can appear to be extremely outlying for many measures. The definition we // use requires that values in the interval are very different to _all_ remaining // values which is sufficient to avoid false positives. CSignal::removeExtremeOutliers(m_OutlierFraction / 2.0, m_Values); } bool CTimeSeriesTestForSeasonality::checkInvariants() const { if (m_ModelledPeriods.size() != m_ModelledPeriodsSizes.size()) { LOG_ERROR(<< "# modelled periods (" << m_ModelledPeriods << ") != # modelled period sizes (" << m_ModelledPeriodsSizes << ")"); return false; } if (m_ModelledPeriodsSizes.size() != m_ModelledPeriodsTestable.size()) { LOG_ERROR(<< "# modelled period sizes (" << m_ModelledPeriodsSizes << ") != # modelled period testable (" << m_ModelledPeriodsTestable << ")"); return false; } return true; } CSeasonalDecomposition CTimeSeriesTestForSeasonality::decompose() const { // The quality of anomaly detection is sensitive to bias variance tradeoff. // If you care about point predictions you can get away with erring on the side // of overfitting slightly to avoid bias. We however care about the predicted // distribution and are very sensitive to prediction variance. We therefore want // to be careful to only model seasonality which adds significant predictive // value. // // This is the main entry point to decomposing a signal into its seasonal // components. This whole process is somewhat involved. Complications include // all the various data characteristics we would like to deal with automatically // these include: signals polluted with outliers, discontinuities in the trend, // discontinuities in the seasonality (i.e. scaling up or down) and missing // values. We'd also like very high power for detecting the most common seasonal // components, i.e. daily, weekly, weekday/weekend modulation and yearly // (predictive calendar features are handled separately). We also run this // continuously on various window lengths and it needs to produce a stable // result if the seasonality is not changing whilst still being able to detect // changes and to initialize new components with the right size (bias variance // tradeoff) and seed values. // // The high-level strategy is: // 1. For various trend assumptions, no trend, quadratic and piecewise linear, // 2. Test for the seasonalities we already model, common diurnal seasonality // and the best decomposition based on serial autocorrelation and select // those hypotheses for which there is strong evidence. // 3. Ensure there is good evidence that the signal is seasonal vs the best // explanation for the values which only uses a trend. // 4. Given a set of statistically significant seasonal hypotheses choose // the one which will lead to the best modelling and avoids churn. // // I found it was far more effective to consider each hypothesis separately. // The alternative pushes much more complexity into the step to actually fit // the model. For example, one might try and simultaneously fit piecewise linear // trend and seasonality, but for example determining the right break points // (if any) in the trend together with the appropriate scaled seasonal components // is a non trivial estimation problem. We take an ordered approach, first fitting // the trend then seasonality and selecting at each stage only to fit significant // effects. However, we also test simpler hypotheses, such that there is no // trend at all, explicitly. This is much more forgiving to the estimation // process since if the data doesn't have a trend not trying to fit one can // easily be identified as a better choice after the fact. The final selection // is based on a number of criterion which are geared towards our modelling // techniques and are described in select. if (this->checkInvariants() == false) { LOG_ERROR(<< "Failed invariants. Not testing for seasonality."); return {}; } LOG_TRACE(<< "decomposing " << m_Values.size() << " values, bucket length = " << m_BucketLength); // Shortcircuit if we can't test any periods. if (canTestPeriod(m_Values, 0, CSignal::seasonalComponentSummary(2)) == false) { return {}; } // If the values in the window are constant we should remove all components // we can test. double variance{[this] { TMeanVarAccumulator moments; for (const auto& value : m_Values) { moments.add(common::CBasicStatistics::mean(value), common::CBasicStatistics::count(value)); } return common::CBasicStatistics::variance(moments); }()}; if (variance == 0.0) { CSeasonalDecomposition result; result.add(CNewTrendSummary{m_ValuesStartTime, m_BucketLength, m_Values}); result.add(m_ModelledPeriodsTestable); result.withinBucketVariance(m_SampleVariance); return result; } TSizeVec trendSegments{TSegmentation::piecewiseLinear( m_Values, m_SignificantPValue, m_OutlierFraction, m_MaximumNumberSegments)}; LOG_TRACE(<< "trend segments = " << trendSegments); TRemoveTrend removeTrendModels[]{ [this](const TSeasonalComponentVec& /*periods*/, TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) { LOG_TRACE(<< "no trend"); values = m_Values; modelTrendSegments.clear(); return true; }, [this](const TSeasonalComponentVec& periods, TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) { // We wish to solve argmin_{t,s}{sum_i w_i (y_i - t(i) - s(i))^2}. // Since t and s are linear functions of their parameters this is // convex. The following makes use of the fact that the minimizers // of sum_i w_i (y_i - t(i))^2 and sum_i w_i (y_i - s(i))^2 are // trivial to perform a sequence of "line searches". In practice, // this converges quickly so we use a fixed number of iterations. LOG_TRACE(<< "quadratic trend"); using TRegression = common::CLeastSquaresOnlineRegression<2, double>; modelTrendSegments.assign({0, values.size()}); TRegression trend; TRegression::TArray parameters; parameters.fill(0.0); auto predictor = [&](std::size_t i) { return TRegression::predict(parameters, static_cast<double>(i)); }; if (periods.empty()) { values = m_Values; for (std::size_t j = 0; j < values.size(); ++j) { trend.add(static_cast<double>(j), common::CBasicStatistics::mean(values[j]), common::CBasicStatistics::count(values[j])); } // Note that parameters are referenced by predictor. Reading them // here refreshes the values used for prediction. Computing the // parameters is the bottleneck in this code and the same values // are used for each prediction. We optimise removePredictions by // reading them once upfront. trend.parameters(parameters); removePredictions(predictor, values); return true; } for (std::size_t i = 0; i < 3; ++i) { values = m_Values; removePredictions(predictor, values); CSignal::fitSeasonalComponents(periods, values, m_Components); values = m_Values; removePredictions({periods, 0, periods.size()}, {m_Components, 0, m_Components.size()}, values); trend = TRegression{}; for (std::size_t j = 0; j < values.size(); ++j) { trend.add(static_cast<double>(j), common::CBasicStatistics::mean(values[j]), common::CBasicStatistics::count(values[j])); } // See above. trend.parameters(parameters); } values = m_Values; removePredictions(predictor, values); return true; }, [&](const TSeasonalComponentVec& periods, TFloatMeanAccumulatorVec& values, TSizeVec& modelTrendSegments) { if (trendSegments.size() <= 2) { return false; } // We're only interested in applying segmentation when the number of // segments is small w.r.t. the number of repeats. In such cases we // can fit the trend first and suffer little loss in accuracy. LOG_TRACE(<< trendSegments.size() - 1 << " linear trend segments"); modelTrendSegments = trendSegments; auto predictor = [&](std::size_t i) { double result{0.0}; for (std::size_t j = 0; j < periods.size(); ++j) { result += periods[j].value(m_Components[j], i); } return result; }; values = m_Values; values = TSegmentation::removePiecewiseLinear( std::move(values), modelTrendSegments, m_OutlierFraction); if (periods.empty() == false) { CSignal::fitSeasonalComponents(periods, values, m_Components); values = m_Values; removePredictions(predictor, values); values = TSegmentation::removePiecewiseLinear( std::move(values), modelTrendSegments, m_OutlierFraction); removePredictions([&](std::size_t j) { return -predictor(j); }, values); } return true; }}; try { TModelVec decompositions; decompositions.reserve(8 * std::size(removeTrendModels)); for (const auto& removeTrend : removeTrendModels) { this->addNotSeasonal(removeTrend, decompositions); this->addModelled(removeTrend, decompositions); this->addDiurnal(removeTrend, decompositions); this->addHighestAutocorrelation(removeTrend, decompositions); } return this->select(decompositions); } catch (const std::exception& e) { LOG_ERROR(<< "Seasonal decomposition failed: " << e.what()); } return {}; } CSeasonalDecomposition CTimeSeriesTestForSeasonality::select(TModelVec& decompositions) const { // If the existing seasonality couldn't be tested short circuit: we'll keep it. if (std::find_if(decompositions.begin(), decompositions.end(), [](const auto& decomposition) { return decomposition.s_AlreadyModelled && decomposition.isTestable() == false; }) != decompositions.end()) { LOG_TRACE(<< "Not testable"); return {}; } // Choose the hypothesis which yields the best explanation of the values. // Sort by increasing complexity. std::stable_sort(decompositions.begin(), decompositions.end(), [](const auto& lhs, const auto& rhs) { return lhs.numberParameters() < rhs.numberParameters(); }); auto computePValue = [&](std::size_t H1) { double pValueMax{-1.0}; double logPValueProxyMax{-std::numeric_limits<double>::max()}; std::size_t pValueMaxHypothesis{decompositions.size()}; for (std::size_t H0 = 0; H0 < decompositions.size(); ++H0) { if (decompositions[H0].isNull()) { double pValue{decompositions[H1].pValue(decompositions[H0])}; double logPValueProxy{decompositions[H1].logPValueProxy(decompositions[H0])}; if (pValue > pValueMax || (pValue == pValueMax && logPValueProxy > logPValueProxyMax)) { std::tie(pValueMax, logPValueProxyMax, pValueMaxHypothesis) = std::make_tuple(pValue, logPValueProxy, H0); } } } pValueMax = std::max(pValueMax, common::CTools::smallestProbability()); return std::make_tuple(pValueMax, logPValueProxyMax, pValueMaxHypothesis); }; double variance{common::CBasicStatistics::maximumLikelihoodVariance( this->truncatedMoments(0.0, m_Values))}; double truncatedVariance{common::CBasicStatistics::maximumLikelihoodVariance( this->truncatedMoments(m_OutlierFraction, m_Values))}; LOG_TRACE(<< "variance = " << variance << " truncated variance = " << truncatedVariance); // Select the best decomposition if it is a statistically significant improvement. std::size_t selected{decompositions.size()}; double qualitySelected{-std::numeric_limits<double>::max()}; double minPValue{1.0}; std::size_t h0ForMinPValue{0}; std::size_t h1ForMinPValue{0}; for (std::size_t H1 = 0; H1 < decompositions.size(); ++H1) { if (decompositions[H1].isAlternative()) { double pValueVsH0; double logPValueProxy; std::size_t H0; std::tie(pValueVsH0, logPValueProxy, H0) = computePValue(H1); double logPValue{(pValueVsH0 == 1.0 ? -std::numeric_limits<double>::min() : std::log(pValueVsH0))}; logPValueProxy = std::min(logPValueProxy, -std::numeric_limits<double>::min()); double pValueToAccept{decompositions[H1].s_AlreadyModelled ? m_PValueToEvict : m_AcceptedFalsePostiveRate}; if (pValueVsH0 < minPValue) { minPValue = pValueVsH0; h0ForMinPValue = H0; h1ForMinPValue = H1; } LOG_TRACE(<< "hypothesis = " << decompositions[H1].s_Hypotheses); LOG_TRACE(<< "p-value vs not periodic = " << pValueVsH0 << ", log proxy = " << logPValueProxy); if (pValueVsH0 > pValueToAccept) { continue; } // We've rejected the hypothesis that the signal is not periodic. // // First check if this is significantly better than the currently selected // model from an explained variance standpoint. If the test is ambiguous // then fallback to selecting based on a number of criteria. // // We therefore choose the best model based on the following criteria: // 1. The amount of variance explained per parameter. Models with a small // period which explain most of the variance are preferred because they // are more accurate and robust. // 2. Whether the components are already modelled to avoid churn on marginal // decisions. // 3. The log p-value vs non seasonal. This captures information about both // the model size and the variance with and without the model. // 4. Standard deviations above the mean of the F-distribution. This captures // similar information to the log p-value but won't underflow. // 5. The total target model size. The p-value is less sensitive to model // size as the window length increases. However, for both stability // and efficiency considerations we strongly prefer smaller models. // 6. The number of scalings and pieces in the trend model. These increase // false positives so if we have an alternative similarly good hypothesis // we use that one. // 7. The number of repeats of the superposition of components we've seen. // We penalise seeing fewer than two repeats to avoid using interference // to fit changes in the seasonal pattern. // // Why sum the logs you might ask. This makes the decision dimensionless. // Consider that sum_i{ log(f_i) } < sum_i{ log(f_i') } is equivalent to // sum_i{ log(f_i / f_i')} < 0 so if we scale each feature by a constant // they cancel and we still make the same decision. // // One can think of this as a smooth lexicographical order with the weights // playing the role of the order: smaller weight values "break ties". auto explainedVariancePerParameter = decompositions[H1].explainedVariancePerParameter(variance, truncatedVariance); double leastCommonRepeat{decompositions[H1].leastCommonRepeat()}; double pValueVsSelected{ selected < decompositions.size() ? decompositions[H1].pValue(decompositions[selected], 1e-3, m_SampleVariance) : 1.0}; double scalings{decompositions[H1].numberScalings()}; double segments{ std::max(static_cast<double>(decompositions[H1].s_NumberTrendParameters / 2), 1.0) - 1.0}; LOG_TRACE(<< "explained variance per param = " << explainedVariancePerParameter << ", scalings = " << scalings << ", segments = " << segments << ", number parameters = " << decompositions[H1].numberParameters() << ", p-value H1 vs selected = " << pValueVsSelected); LOG_TRACE(<< "residual moments = " << decompositions[H1].s_ResidualMoments << ", truncated residual moments = " << decompositions[H1].s_TruncatedResidualMoments << ", sample variance = " << m_SampleVariance); LOG_TRACE(<< "least common repeat = " << leastCommonRepeat); double quality{1.0 * std::log(explainedVariancePerParameter(0)) + 1.0 * std::log(explainedVariancePerParameter(1)) + 0.7 * decompositions[H1].componentsSimilarity() + 0.5 * std::log(-logPValue) + 0.2 * std::log(-logPValueProxy) - 0.5 * std::log(decompositions[H1].targetModelSize()) - 0.3 * std::log(0.2 + common::CTools::pow2(scalings)) - 0.3 * std::log(1.0 + common::CTools::pow2(segments)) - 0.3 * std::log(std::max(leastCommonRepeat, 0.5))}; double qualityToAccept{ qualitySelected - std::log(1.0 + std::max(std::log(0.01 / pValueVsSelected), 0.0))}; LOG_TRACE(<< "target size = " << decompositions[H1].targetModelSize() << ", modelled = " << decompositions[H1].s_AlreadyModelled); LOG_TRACE(<< "quality = " << quality << " to accept = " << qualityToAccept); if (quality > qualityToAccept) { std::tie(selected, qualitySelected) = std::make_pair(H1, quality); LOG_TRACE(<< "selected " << selected); } } } CSeasonalDecomposition result; if (selected < decompositions.size() && std::count_if(decompositions[selected].s_Hypotheses.begin(), // Are there new components? decompositions[selected].s_Hypotheses.end(), [this](const auto& hypothesis) { return hypothesis.s_Model && this->permittedPeriod(hypothesis.s_Period); }) > 0) { LOG_TRACE(<< "selected = " << decompositions[selected].s_Hypotheses); result.add(CNewTrendSummary{m_ValuesStartTime, m_BucketLength, std::move(decompositions[selected].s_TrendInitialValues)}); for (auto& hypothesis : decompositions[selected].s_Hypotheses) { if (hypothesis.s_Model && this->permittedPeriod(hypothesis.s_Period)) { LOG_TRACE(<< "Adding " << hypothesis.s_Period.print()); result.add(this->annotationText(hypothesis.s_Period), hypothesis.s_Period, hypothesis.s_ModelSize, this->periodDescriptor(hypothesis.s_Period.s_Period), m_ValuesStartTime, m_BucketsStartTime, m_BucketLength, m_StartOfWeekTimeOverride, std::move(hypothesis.s_InitialValues)); } } result.add(std::move(decompositions[selected].s_RemoveComponentsMask)); result.withinBucketVariance(m_SampleVariance); return result; } // Check if we should remove all components. std::ptrdiff_t numberModelled(m_ModelledPeriods.size()); double fractionNotMissing{static_cast<double>(observedRange(m_Values)) / static_cast<double>(m_Values.size())}; LOG_TRACE(<< "p-value min = " << minPValue); LOG_TRACE(<< "fraction not missing = " << fractionNotMissing); if ((numberModelled > 0 && // We're modelling seasonality decompositions[h1ForMinPValue].isEvictionPermitted() && // The window is suitable minPValue > m_PValueToEvict) && // We have weak evidence for seasonality (common::fuzzyGreaterThan(minPValue / m_PValueToEvict, 1.0, 0.2) && common::fuzzyGreaterThan(fractionNotMissing, 1.0, 0.5)) // We've observed enough of the window .boolean()) { result.add(CNewTrendSummary{ m_ValuesStartTime, m_BucketLength, std::move(decompositions[h0ForMinPValue].s_TrendInitialValues)}); result.add(std::move(decompositions[h0ForMinPValue].s_RemoveComponentsMask)); result.withinBucketVariance(m_SampleVariance); } return result; } void CTimeSeriesTestForSeasonality::addNotSeasonal(const TRemoveTrend& removeTrend, TModelVec& decompositions) const { if (removeTrend({}, m_ValuesMinusTrend, m_ModelTrendSegments)) { decompositions.emplace_back( *this, this->truncatedMoments(0.0, m_ValuesMinusTrend), this->truncatedMoments(m_OutlierFraction, m_ValuesMinusTrend), this->numberTrendParameters( m_ModelTrendSegments.empty() ? 0 : m_ModelTrendSegments.size() - 1), m_Values, THypothesisStatsVec{}, m_ModelledPeriodsTestable); } } void CTimeSeriesTestForSeasonality::addModelled(const TRemoveTrend& removeTrend, TModelVec& decompositions) const { m_CandidatePeriods = m_ModelledPeriods; this->removeIfNotTestable(m_CandidatePeriods); if (m_CandidatePeriods.empty() == false && removeTrend(m_CandidatePeriods, m_ValuesMinusTrend, m_ModelTrendSegments)) { // Already modelled seasonal components. std::stable_sort(m_CandidatePeriods.begin(), m_CandidatePeriods.end(), [](const auto& lhs, const auto& rhs) { return lhs.s_Period < rhs.s_Period; }); this->testAndAddDecomposition(m_CandidatePeriods, m_ModelTrendSegments, m_ValuesMinusTrend, true, // Already modelled false, // Is diurnal decompositions); // Already modelled plus highest serial autocorrelation seasonal components. m_Periods = m_CandidatePeriods; CSignal::fitSeasonalComponents(m_Periods, m_ValuesMinusTrend, m_Components); m_TemporaryValues = m_ValuesMinusTrend; removePredictions({m_Periods, 0, m_Periods.size()}, {m_Components, 0, m_Components.size()}, m_TemporaryValues); auto diurnal = std::make_tuple(this->day(), this->week(), this->year()); for (const auto& period : CSignal::seasonalDecomposition( m_TemporaryValues, m_OutlierFraction, diurnal, m_StartOfWeekOverride, 0.05, m_MaximumNumberComponents)) { if (std::find_if(m_Periods.begin(), m_Periods.end(), [&](const auto& modelledPeriod) { return modelledPeriod.periodAlmostEqual(period, 0.05); }) == m_Periods.end()) { m_CandidatePeriods.push_back(period); } } this->removeIfNotTestable(m_CandidatePeriods); if (this->includesNewComponents(m_CandidatePeriods) && this->onlyDiurnal(m_CandidatePeriods) == false) { this->testAndAddDecomposition(m_CandidatePeriods, m_ModelTrendSegments, m_ValuesMinusTrend, false, // Already modelled false, // Is diurnal decompositions); } } } void CTimeSeriesTestForSeasonality::addDiurnal(const TRemoveTrend& removeTrend, TModelVec& decompositions) const { // day + year. m_CandidatePeriods.assign({CSignal::seasonalComponentSummary(this->day()), CSignal::seasonalComponentSummary(this->year())}); this->removeIfNotTestable(m_CandidatePeriods); if (this->includesNewComponents(m_CandidatePeriods) && removeTrend(m_CandidatePeriods, m_ValuesMinusTrend, m_ModelTrendSegments)) { this->testAndAddDecomposition(m_CandidatePeriods, m_ModelTrendSegments, m_ValuesMinusTrend, false, // Already modelled true, // Is diurnal decompositions); } m_CandidatePeriods.assign({CSignal::seasonalComponentSummary(this->week()), CSignal::seasonalComponentSummary(this->year())}); this->removeIfNotTestable(m_CandidatePeriods); if (removeTrend(m_CandidatePeriods, m_ValuesMinusTrend, m_ModelTrendSegments)) { m_CandidatePeriods = CSignal::tradingDayDecomposition( m_ValuesMinusTrend, m_OutlierFraction, this->week(), m_StartOfWeekOverride); // weekday/weekend modulation + year. if (m_CandidatePeriods.empty() == false) { CSignal::appendSeasonalComponentSummary(this->year(), m_CandidatePeriods); this->removeIfNotTestable(m_CandidatePeriods); if (this->includesNewComponents(m_CandidatePeriods)) { this->testAndAddDecomposition(m_CandidatePeriods, m_ModelTrendSegments, m_ValuesMinusTrend, false, // Already modelled true, // Is diurnal decompositions); } } // week + year. m_CandidatePeriods.assign({CSignal::seasonalComponentSummary(this->week()), CSignal::seasonalComponentSummary(this->year())}); this->removeIfNotTestable(m_CandidatePeriods); if (this->includesNewComponents(m_CandidatePeriods)) { this->testAndAddDecomposition(m_CandidatePeriods, m_ModelTrendSegments, m_ValuesMinusTrend, false, // Already modelled true, // Is diurnal decompositions); } } } void CTimeSeriesTestForSeasonality::addHighestAutocorrelation(const TRemoveTrend& removeTrend, TModelVec& decompositions) const { // Highest serial autocorrelation components. if (removeTrend({}, m_ValuesMinusTrend, m_ModelTrendSegments)) { auto diurnal = std::make_tuple(this->day(), this->week(), this->year()); m_CandidatePeriods = CSignal::seasonalDecomposition( m_ValuesMinusTrend, m_OutlierFraction, diurnal, m_StartOfWeekOverride, 0.05, m_MaximumNumberComponents); this->removeIfNotTestable(m_CandidatePeriods); if (removeTrend(m_CandidatePeriods, m_ValuesMinusTrend, m_ModelTrendSegments) && this->includesNewComponents(m_CandidatePeriods) && this->onlyDiurnal(m_CandidatePeriods) == false) { this->testAndAddDecomposition(m_CandidatePeriods, m_ModelTrendSegments, m_ValuesMinusTrend, false, // Already modelled false, // Is diurnal decompositions); } } } void CTimeSeriesTestForSeasonality::testAndAddDecomposition( const TSeasonalComponentVec& periods, const TSizeVec& trendSegments, const TFloatMeanAccumulatorVec& valuesToTest, bool alreadyModelled, bool isDiurnal, TModelVec& decompositions) const { std::size_t numberTrendSegments{trendSegments.empty() ? 0 : trendSegments.size() - 1}; auto decomposition = this->testDecomposition(periods, numberTrendSegments, valuesToTest, alreadyModelled); if (this->considerDecompositionForSelection(decomposition, alreadyModelled, isDiurnal)) { decomposition.s_AlreadyModelled = alreadyModelled; this->removeDiscontinuities(trendSegments, decomposition.s_TrendInitialValues); decompositions.push_back(std::move(decomposition)); } } bool CTimeSeriesTestForSeasonality::considerDecompositionForSelection(const SModel& decomposition, bool alreadyModelled, bool isDiurnal) const { return decomposition.seasonal() && std::count_if(decomposition.s_Hypotheses.begin(), decomposition.s_Hypotheses.end(), [this](const auto& hypothesis) { return hypothesis.s_Period.windowed() && hypothesis.s_Period.s_Period == this->week(); }) != static_cast<std::ptrdiff_t>(decomposition.s_Hypotheses.size()) && std::count_if(decomposition.s_Hypotheses.begin(), decomposition.s_Hypotheses.end(), [&](const auto& hypothesis) { return alreadyModelled || this->isDiurnal(hypothesis.s_Period) == isDiurnal; }) > 0; } CTimeSeriesTestForSeasonality::SModel CTimeSeriesTestForSeasonality::testDecomposition(const TSeasonalComponentVec& periods, std::size_t numberTrendSegments, const TFloatMeanAccumulatorVec& valuesToTest, bool alreadyModelled) const { // The main loop schematically does the following: // 1. Remove any out-of-phase seasonal components which haven't been tested from // the values to test (these affect the tests we apply). // 2. Test the period with and without piecewise linear scaling and choose the // best alternative. // 3. If the period was accepted update the values to remove the predictions of // that component. // // Conceptually we are testing a sequence of nested hypotheses for modelling the // seasonality since any added component could be zeroed just by setting all its // predictions to zero. I chose not to express this as likelihood ratio test // because we can get away with a less powerful test and its errors are sensitive // to the distribution assumptions. // // For each component we test the significance of the variance it explains (F-test), // the cyclic autocorrelation on the window of values and the significance of an // amplitude test statistic. The amplitude test looks for frequently repeated spikes // or dips. Our model can represent such signals with seasonality in the variance // and modelling such approximate seasonality is very effective at reducing false // positives. // // The treatment of outliers is subtle. The variance and autocorrelation tests are // adversely affected by outliers. However, an unmodelled seasonality can easily // generate outliers. We compute test statistics with and without outliers and use // the most significant test. In this context, it is important to realize the outliers // need to be identified w.r.t. the model which is assumed for the data. We therefore // compute weights separately for the null hypothesis that the component is not present. // // The overall choice of whether to model a component depends on all these factors and // also the number of repeats we have observed, the number of values we observe per // period, etc (see testVariance and testAmplitude for details). Since the considerations // are heterogenous we combine them using fuzzy logic. using TMeanScaleHypothesis = std::function<bool(TFloatMeanAccumulatorVec&, const TSeasonalComponentVec&, const TMeanAccumulatorVecVec&, SHypothesisStats&)>; LOG_TRACE(<< "testing " << periods); auto meanScale = [](const TSizeVec& segmentation, const TDoubleVec& scales) { return TSegmentation::meanScale(segmentation, scales); }; TMeanScaleHypothesis constantScales[]{ [&](TFloatMeanAccumulatorVec& values, const TSeasonalComponentVec&, const TMeanAccumulatorVecVec&, SHypothesisStats& hypothesis) { hypothesis.s_ScaleSegments.assign({0, values.size()}); return true; }, [&](TFloatMeanAccumulatorVec& values, const TSeasonalComponentVec& period, const TMeanAccumulatorVecVec& component, SHypothesisStats& hypothesis) { hypothesis.s_ScaleSegments = TSegmentation::piecewiseLinearScaledSeasonal( values, [&](std::size_t i) { return period[0].value(component[0], i); }, m_SignificantPValue, m_MaximumNumberSegments); return this->constantScale(meanScale, m_Periods, hypothesis.s_ScaleSegments, values, m_ScaledComponent, m_ComponentScales); }}; TFloatMeanAccumulatorVec residuals{valuesToTest}; THypothesisStatsVec hypotheses; hypotheses.reserve(periods.size()); // If the superposition of periodicities doesn't repeat in window we need to be careful // not to use it to model non-seasonal changes. We start to penalise the selection as // lcm(periods) exceeds the window length. std::size_t leastCommonRepeat{1}; std::size_t range{observedRange(m_Values)}; for (std::size_t i = 0; i < periods.size(); ++i) { LOG_TRACE(<< "testing " << periods[i].print()); // Precondition by removing all remaining out-of-phase components. m_TemporaryValues = residuals; m_Periods.clear(); for (std::size_t j = i + 1; j < periods.size(); ++j) { if (almostDivisor(periods[j].s_Period, periods[i].s_Period, 0.05) == false && almostDivisor(periods[i].s_Period, periods[j].s_Period, 0.05) == false) { m_Periods.push_back(periods[j]); } } if (m_Periods.empty() == false) { LOG_TRACE(<< "removing " << m_Periods); CSignal::fitSeasonalComponents(m_Periods, m_TemporaryValues, m_Components); removePredictions({m_Periods, 0, m_Periods.size()}, {m_Components, 0, m_Components.size()}, m_TemporaryValues); } // Restrict to the component to test time windows. m_WindowIndices.resize(m_TemporaryValues.size()); std::iota(m_WindowIndices.begin(), m_WindowIndices.end(), 0); CSignal::restrictTo(periods[i], m_TemporaryValues); CSignal::restrictTo(periods[i], m_WindowIndices); m_ValuesToTest = m_TemporaryValues; // Compute the null hypothesis residual variance statistics. m_Periods.assign(1, CSignal::seasonalComponentSummary(1)); CSignal::fitSeasonalComponentsRobust(m_Periods, m_OutlierFraction, m_TemporaryValues, m_Components); auto H0 = this->residualVarianceStats(m_TemporaryValues); SHypothesisStats bestHypothesis{periods[i]}; bool componentAlreadyModelled{this->alreadyModelled(periods[i])}; for (const auto& constantScale : constantScales) { SHypothesisStats hypothesis{periods[i]}; auto period = CSignal::seasonalComponentSummary(periods[i].period()); m_Periods.assign(1, period); CSignal::fitSeasonalComponentsRobust(m_Periods, m_OutlierFraction, m_ValuesToTest, m_Components); if (constantScale(m_ValuesToTest, m_Periods, m_Components, hypothesis) && CSignal::countNotMissing(m_ValuesToTest) > 0) { LOG_TRACE(<< "scale segments = " << hypothesis.s_ScaleSegments); hypothesis.s_IsTestable = true; hypothesis.s_NumberTrendSegments = numberTrendSegments; hypothesis.s_NumberScaleSegments = hypothesis.s_ScaleSegments.size() - 1; hypothesis.s_MeanNumberRepeats = CSignal::meanNumberRepeatedValues(m_ValuesToTest, period); hypothesis.s_WindowRepeats = static_cast<double>(range) / static_cast<double>(periods[i].s_WindowRepeat); hypothesis.s_LeastCommonRepeat = static_cast<double>(componentAlreadyModelled ? leastCommonRepeat : common::CIntegerTools::lcm( leastCommonRepeat, periods[i].s_WindowRepeat)) / static_cast<double>(range); hypothesis.testExplainedVariance(*this, H0); hypothesis.testAutocorrelation(*this); hypothesis.testAmplitude(*this); hypothesis.s_Truth = hypothesis.varianceTestResult(*this) || hypothesis.amplitudeTestResult(*this); LOG_TRACE(<< "truth = " << hypothesis.s_Truth.print()); if (hypothesis.isBetter(bestHypothesis)) { bestHypothesis = std::move(hypothesis); } } } if (alreadyModelled && bestHypothesis.evict(*this, i)) { LOG_TRACE(<< "discarding " << periods[i].print()); bestHypothesis.s_DiscardingModel = true; hypotheses.push_back(std::move(bestHypothesis)); } else if (alreadyModelled || bestHypothesis.s_Truth.boolean()) { leastCommonRepeat = componentAlreadyModelled ? leastCommonRepeat : common::CIntegerTools::lcm( leastCommonRepeat, periods[i].s_WindowRepeat); LOG_TRACE(<< "selected " << periods[i].print()); this->updateResiduals(bestHypothesis, residuals); hypotheses.push_back(std::move(bestHypothesis)); } else if (bestHypothesis.s_Period.windowed()) { hypotheses.push_back(std::move(bestHypothesis)); } } if (alreadyModelled == false && std::count_if(hypotheses.begin(), hypotheses.end(), [this](const auto& hypothesis) { return hypothesis.s_Truth.boolean() && this->alreadyModelled(hypothesis.s_Period) == false; }) == 0) { return {}; } auto residualMoments = this->truncatedMoments(0.0, residuals); auto truncatedResidualMoments = this->truncatedMoments(m_OutlierFraction, residuals); LOG_TRACE(<< "variance = " << residualMoments << " <variance> = " << truncatedResidualMoments); TBoolVec componentsToRemoveMask{this->finalizeHypotheses( valuesToTest, alreadyModelled, hypotheses, residuals)}; for (std::size_t i = 0; i < m_Values.size(); ++i) { double offset{common::CBasicStatistics::mean(m_Values[i]) - common::CBasicStatistics::mean(valuesToTest[i])}; common::CBasicStatistics::moment<0>(residuals[i]) += offset; } return {*this, residualMoments, truncatedResidualMoments, this->numberTrendParameters(numberTrendSegments), std::move(residuals), std::move(hypotheses), std::move(componentsToRemoveMask)}; } void CTimeSeriesTestForSeasonality::updateResiduals(const SHypothesisStats& hypothesis, TFloatMeanAccumulatorVec& residuals) const { m_TemporaryValues = residuals; m_WindowIndices.resize(residuals.size()); std::iota(m_WindowIndices.begin(), m_WindowIndices.end(), 0); const TSizeVec& scaleSegments{hypothesis.s_ScaleSegments}; CSignal::restrictTo(hypothesis.s_Period, m_TemporaryValues); CSignal::restrictTo(hypothesis.s_Period, m_WindowIndices); m_Periods.assign(1, CSignal::seasonalComponentSummary(hypothesis.s_Period.period())); auto meanScale = [](const TSizeVec& segmentation, const TDoubleVec& scales) { return TSegmentation::meanScale(segmentation, scales); }; bool scale{this->constantScale(meanScale, m_Periods, scaleSegments, m_TemporaryValues, m_ScaledComponent, m_ComponentScales)}; if (scale == false) { CSignal::fitSeasonalComponentsRobust(m_Periods, m_OutlierFraction, m_TemporaryValues, m_Components); } for (std::size_t i = 0; i < m_TemporaryValues.size(); ++i) { common::CBasicStatistics::moment<0>(residuals[m_WindowIndices[i]]) -= (scale ? TSegmentation::scaleAt(i, scaleSegments, m_ComponentScales) * m_Periods[0].value(m_ScaledComponent[0], i) : m_Periods[0].value(m_Components[0], i)); } } CTimeSeriesTestForSeasonality::TBoolVec CTimeSeriesTestForSeasonality::finalizeHypotheses(const TFloatMeanAccumulatorVec& values, bool alreadyModelled, THypothesisStatsVec& hypotheses, TFloatMeanAccumulatorVec& residuals) const { auto componentsToRemoveMask = this->selectModelledHypotheses(alreadyModelled, hypotheses); auto componentsExcludedMask = componentsToRemoveMask; m_Periods.clear(); TSizeVec periodsHypotheses; periodsHypotheses.reserve(hypotheses.size()); for (std::size_t i = 0; i < hypotheses.size(); ++i) { // We always fit here even components we are already modelling because it's // a fairer comparison with new components. if (hypotheses[i].s_Model) { m_Periods.push_back(hypotheses[i].s_Period); periodsHypotheses.push_back(i); } else if (hypotheses[i].s_DiscardingModel == false && hypotheses[i].s_SimilarModelled < m_ModelledPeriodsTestable.size() && m_ModelledPeriodsTestable[hypotheses[i].s_SimilarModelled]) { componentsExcludedMask[hypotheses[i].s_SimilarModelled] = true; m_Periods.push_back(hypotheses[i].s_Period); periodsHypotheses.push_back(i); } } LOG_TRACE(<< "periods to model = " << m_Periods << ", period hypotheses = " << periodsHypotheses << ", not preconditioned = " << componentsExcludedMask); residuals = values; this->removeModelledPredictions(componentsExcludedMask, residuals); CSignal::fitSeasonalComponentsRobust(m_Periods, m_OutlierFraction, residuals, m_Components); TSeasonalComponentVec period; TMeanAccumulatorVecVec component; for (std::size_t i = 0; i < m_Periods.size(); ++i) { std::size_t j{periodsHypotheses[i]}; const TSizeVec& scaleSegments{hypotheses[j].s_ScaleSegments}; // Consider falling back to not scaling if scaling fails. for (auto scale : {scaleSegments.size() > 2, false}) { m_TemporaryValues = residuals; m_WindowIndices.resize(residuals.size()); std::iota(m_WindowIndices.begin(), m_WindowIndices.end(), 0); removePredictions({m_Periods, i + 1, m_Periods.size()}, {m_Components, i + 1, m_Components.size()}, m_TemporaryValues); CSignal::restrictTo(m_Periods[i], m_TemporaryValues); CSignal::restrictTo(m_Periods[i], m_WindowIndices); period.assign(1, CSignal::seasonalComponentSummary(m_Periods[i].period())); if (scale) { auto weightedMeanScale = [this](const TSizeVec& segmentation, const TDoubleVec& scales) { return TSegmentation::meanScale(segmentation, scales, [&](std::size_t k) { return std::pow( 0.9, static_cast<double>(m_TemporaryValues.size() - k - 1)); }); }; if (this->constantScale(weightedMeanScale, period, scaleSegments, m_TemporaryValues, component, m_ComponentScales) == false) { continue; } } else { CSignal::fitSeasonalComponents(period, m_TemporaryValues, component); } hypotheses[j].s_ModelSize = this->selectComponentSize(m_TemporaryValues, period[0]); hypotheses[j].s_InitialValues.resize(residuals.size()); component[0] = CSignal::smoothResample(hypotheses[j].s_ModelSize, std::move(component[0])); double overlapping{static_cast<double>(std::count_if( m_Periods.begin(), m_Periods.end(), [&](const auto& period_) { return period_.windowed() == false || period_.s_Window == m_Periods[i].s_Window; }))}; for (std::size_t k = 0; k < m_TemporaryValues.size(); ++k) { auto& moments = m_TemporaryValues[k]; if (common::CBasicStatistics::count(moments) > 0.0) { auto& value = common::CBasicStatistics::moment<0>(moments); double prediction{(scale ? TSegmentation::scaleAt(k, scaleSegments, m_ComponentScales) : 1.0) * period[0].value(component[0], k)}; value = prediction + (value - prediction) / overlapping; common::CBasicStatistics::moment<0>(residuals[m_WindowIndices[k]]) -= prediction; } else { // If we were unable to estimate the value of the seasonal // component here we should discard it when initialising // the trend. residuals[m_WindowIndices[k]] = TFloatMeanAccumulator{}; } hypotheses[j].s_InitialValues[m_WindowIndices[k]] = moments; } break; } } return componentsToRemoveMask; } CTimeSeriesTestForSeasonality::TBoolVec CTimeSeriesTestForSeasonality::selectModelledHypotheses(bool alreadyModelled, THypothesisStatsVec& hypotheses) const { // Ensure that we only keep "false" hypotheses which are needed because they // are the best hypothesis for their time window. // // For weekday/weekend modulation we use dedicated models for the weekend and // weekdays since it is more parsimonious than using one model for the whole // week. We do however need to ensure that we've selected a seasonal model for // say weekends if we have one for weekdays (even it is just a level prediction). // To achieve this we always add in all "windowed" seasonal components in // testDecomposition, whether they meet the conditions to be selected or not. // This prunes all those that aren't needed because we already have a model // for their window. They are kept in order of their actual truth value. for (std::size_t i = 0, removedCount = 0; alreadyModelled == false && i < hypotheses.size(); i += (removedCount > 0 ? 0 : 1)) { removedCount = 0; const auto& hypothesis = hypotheses[i]; if (hypothesis.s_Period.windowed()) { const auto& cutoff = std::max_element( hypotheses.begin(), hypotheses.end(), [&](const SHypothesisStats& lhs, const SHypothesisStats& rhs) { return common::COrderings::lexicographicalCompare( lhs.s_Period.s_Window == hypothesis.s_Period.s_Window, lhs.s_Truth, rhs.s_Period.s_Window == hypothesis.s_Period.s_Window, rhs.s_Truth); }); removedCount = hypotheses.size(); hypotheses.erase( std::remove_if(hypotheses.begin(), hypotheses.end(), [&](const SHypothesisStats& candidate) { return candidate.s_Period.s_Window == hypothesis.s_Period.s_Window && candidate.s_Truth.boolean() == false && candidate.s_Truth < cutoff->s_Truth; }), hypotheses.end()); removedCount -= hypotheses.size(); } } LOG_TRACE(<< "selecting from " << hypotheses); // Check if there are any extra unmodelled components. bool change{false}; std::size_t numberModelled{m_ModelledPeriods.size()}; for (std::size_t i = 0; i < hypotheses.size(); ++i) { const auto& period = hypotheses[i].s_Period; std::size_t similar{this->similarModelled(period)}; hypotheses[i].s_SimilarModelled = similar; change |= (similar == numberModelled) || (m_ModelledPeriods[similar].almostEqual(period, 0.05) == false); } LOG_TRACE(<< "change = " << change); std::ptrdiff_t excess{-m_MaximumNumberComponents}; for (auto& hypothesis : hypotheses) { hypothesis.s_Model = change; excess += hypothesis.s_Model ? 1 : 0; } // Check if there are any existing components to evict. TBoolVec componentsToRemoveMask(numberModelled, false); for (std::size_t i = 0; i < numberModelled; ++i) { auto hypothesis = std::find_if(hypotheses.begin(), hypotheses.end(), [&](const auto& candidate) { return candidate.s_SimilarModelled == i; }); componentsToRemoveMask[i] = m_ModelledPeriodsTestable[i] && (change || hypothesis->s_DiscardingModel); if (componentsToRemoveMask[i]) { for (std::size_t j = 0; j < hypotheses.size(); ++j) { hypotheses[j].s_DiscardingModel |= (hypotheses[j].s_SimilarModelled == i); } } excess -= componentsToRemoveMask[i] ? 1 : 0; } if (alreadyModelled && std::count(componentsToRemoveMask.begin(), componentsToRemoveMask.end(), true) > 0) { // Are we retaining any already modelled windowed components? if (*std::find_if(boost::counting_iterator<std::size_t>(0), boost::counting_iterator<std::size_t>(numberModelled), [&](auto i) { return m_ModelledPeriods[i].windowed() && componentsToRemoveMask[i] == false; }) != numberModelled) { auto retainingModelForWindow = [&](std::size_t i) { return m_ModelledPeriods[i].windowed() && *std::find_if(boost::counting_iterator<std::size_t>(0), boost::counting_iterator<std::size_t>(numberModelled), [&](auto j) { return m_ModelledPeriods[j].s_Window == m_ModelledPeriods[i].s_Window && componentsToRemoveMask[j] == false; }) < numberModelled; }; for (auto& hypothesis : hypotheses) { std::size_t similar{hypothesis.s_SimilarModelled}; if (similar < m_ModelledPeriods.size() && componentsToRemoveMask[similar] && m_ModelledPeriods[similar].windowed() && retainingModelForWindow(similar) == false) { componentsToRemoveMask[similar] = false; hypothesis.s_DiscardingModel = false; ++excess; } } } // If we're discarding some components we reinitalise the ones we keep. if (std::count(componentsToRemoveMask.begin(), componentsToRemoveMask.end(), true) > 0) { for (auto& hypothesis : hypotheses) { std::size_t similar{hypothesis.s_SimilarModelled}; if (similar < m_ModelledPeriodsTestable.size()) { bool remove{componentsToRemoveMask[similar]}; hypothesis.s_Model = m_ModelledPeriodsTestable[similar] && remove == false; componentsToRemoveMask[similar] = remove || hypothesis.s_Model; } } } LOG_TRACE(<< "components to remove = " << componentsToRemoveMask); } LOG_TRACE(<< "excess = " << excess); // Don't exceed the maximum number of components discarding excess in order // of increasing explained variance. for (/**/; excess > 0; --excess) { auto hypothesis = std::min_element( hypotheses.begin(), hypotheses.end(), [](const auto& lhs, const auto& rhs) { return lhs.s_ExplainedVariance < rhs.s_ExplainedVariance; }); hypothesis->s_Model = false; } return componentsToRemoveMask; } std::size_t CTimeSeriesTestForSeasonality::selectComponentSize(const TFloatMeanAccumulatorVec& valuesToTest, const TSeasonalComponent& period) const { auto matchingModelled = std::find_if( m_ModelledPeriods.begin(), m_ModelledPeriods.end(), [&](const auto& modelledPeriod) { return modelledPeriod.s_Period == period.s_Period; }); std::size_t modelledSize{ matchingModelled == m_ModelledPeriods.end() ? 0 : m_ModelledPeriodsSizes[matchingModelled - m_ModelledPeriods.begin()]}; return common::CTools::truncate( std::max(modelledSize, CSignal::selectComponentSize(valuesToTest, period.period())), m_MinimumModelSize, m_MaximumModelSize); } std::size_t CTimeSeriesTestForSeasonality::similarModelled(const TSeasonalComponent& period) const { std::size_t exact{*std::find_if( boost::counting_iterator<std::size_t>(0), boost::counting_iterator<std::size_t>(m_ModelledPeriods.size()), [&](const auto& j) { return m_ModelledPeriods[j].almostEqual(period, 0.05); })}; return exact < m_ModelledPeriods.size() ? exact : *std::find_if( boost::counting_iterator<std::size_t>(0), boost::counting_iterator<std::size_t>(m_ModelledPeriods.size()), [&](const auto& j) { return m_ModelledPeriods[j].periodAlmostEqual(period, 0.05); }); } void CTimeSeriesTestForSeasonality::removeModelledPredictions(const TBoolVec& componentsExcludedMask, TFloatMeanAccumulatorVec& values) const { // We've already conditioned values on any non-testable components so we // *always* exclude them from the prediction. TBoolVec mask{m_ModelledPeriodsTestable}; for (std::size_t i = 0; i < mask.size(); ++i) { mask[i] = mask[i] == false || componentsExcludedMask[i]; } auto bucketPredictor = CSignal::bucketPredictor( [&](core_t::TTime time) { return m_ModelledPredictor(time, mask); }, m_BucketsStartTime, m_BucketLength, m_ValuesStartTime - m_BucketsStartTime, m_SampleInterval); for (std::size_t i = 0; i < values.size(); ++i) { common::CBasicStatistics::moment<0>(values[i]) -= bucketPredictor(m_BucketLength * static_cast<core_t::TTime>(i)); } } void CTimeSeriesTestForSeasonality::removeDiscontinuities(const TSizeVec& trendSegments, TFloatMeanAccumulatorVec& values) const { if (trendSegments.size() > 2) { // Ignore short segments since they often fit outliers. std::size_t minimumSegmentLength{static_cast<std::size_t>(std::ceil( m_OutlierFraction * static_cast<double>(CSignal::countNotMissing(values)) / 2.0))}; for (std::size_t i = 1; i < trendSegments.size(); ++i) { if (trendSegments[i] - trendSegments[i - 1] < minimumSegmentLength) { for (std::size_t j = trendSegments[i - 1]; j < trendSegments[i]; ++j) { values[j] = TFloatMeanAccumulator{}; } } } values = TSegmentation::removePiecewiseLinearDiscontinuities( std::move(values), trendSegments, m_OutlierFraction); } } bool CTimeSeriesTestForSeasonality::constantScale(const TConstantScale& scale, const TSeasonalComponentVec& periods, const TSizeVec& scaleSegments, TFloatMeanAccumulatorVec& values, TMeanAccumulatorVecVec& components, TDoubleVec& scales) const { if (scaleSegments.size() > 2) { // Ignore short segments since they often fit outliers. std::size_t minimumSegmentLength{static_cast<std::size_t>(std::ceil( m_OutlierFraction * static_cast<double>(CSignal::countNotMissing(values)) / 2.0))}; for (std::size_t i = 1; i < scaleSegments.size(); ++i) { if (scaleSegments[i] - scaleSegments[i - 1] < minimumSegmentLength) { for (std::size_t j = scaleSegments[i - 1]; j < scaleSegments[i]; ++j) { values[j] = TFloatMeanAccumulator{}; } } } values = TSegmentation::constantScalePiecewiseLinearScaledSeasonal( values, periods, scaleSegments, scale, m_OutlierFraction, components, scales); return true; } return false; } CTimeSeriesTestForSeasonality::TVarianceStats CTimeSeriesTestForSeasonality::residualVarianceStats(const TFloatMeanAccumulatorVec& values) const { auto result = CSignal::residualVarianceStats(values, m_Periods, m_Components); result.s_ResidualVariance += m_EpsVariance; result.s_TruncatedResidualVariance += m_EpsVariance; return result; } CTimeSeriesTestForSeasonality::TMeanVarAccumulator CTimeSeriesTestForSeasonality::truncatedMoments(double outlierFraction, const TFloatMeanAccumulatorVec& residuals, const TTransform& transform) const { double cutoff{std::numeric_limits<double>::max()}; std::size_t count{CSignal::countNotMissing(residuals)}; std::size_t numberOutliers{ static_cast<std::size_t>(outlierFraction * static_cast<double>(count) + 0.5)}; if (numberOutliers > 0) { m_Outliers.clear(); m_Outliers.resize(numberOutliers); for (const auto& value : residuals) { if (common::CBasicStatistics::count(value) > 0.0) { m_Outliers.add(std::fabs(transform(value))); } } cutoff = m_Outliers.biggest(); count -= m_Outliers.count(); } LOG_TRACE(<< "cutoff = " << cutoff << ", count = " << count); TMeanVarAccumulator moments; for (const auto& value : residuals) { if (common::CBasicStatistics::count(value) > 0.0 && std::fabs(transform(value)) < cutoff) { moments.add(transform(value)); } } if (numberOutliers > 0) { moments.add(cutoff, static_cast<double>(count) - common::CBasicStatistics::count(moments)); } common::CBasicStatistics::moment<1>(moments) += m_EpsVariance; return moments; } std::size_t CTimeSeriesTestForSeasonality::numberTrendParameters(std::size_t numberTrendSegments) const { return numberTrendSegments == 1 ? 3 : 2 * numberTrendSegments; } bool CTimeSeriesTestForSeasonality::includesNewComponents(const TSeasonalComponentVec& periods) const { return periods.empty() == false && this->includesPermittedPeriod(periods) && this->alreadyModelled(periods) == false; } bool CTimeSeriesTestForSeasonality::alreadyModelled(const TSeasonalComponentVec& periods) const { for (const auto& period : periods) { if (this->alreadyModelled(period) == false) { return false; } } return true; } bool CTimeSeriesTestForSeasonality::alreadyModelled(const TSeasonalComponent& period) const { return std::find(m_ModelledPeriods.begin(), m_ModelledPeriods.end(), period) != m_ModelledPeriods.end(); } bool CTimeSeriesTestForSeasonality::onlyDiurnal(const TSeasonalComponentVec& periods) const { return std::find_if(periods.begin(), periods.end(), [this](const auto& period) { return this->isDiurnal(period) == false; }) == periods.end(); } void CTimeSeriesTestForSeasonality::removeIfNotTestable(TSeasonalComponentVec& periods) const { // We don't try to test components which aren't testable or nearly match // modelled components which aren't testable. periods.erase( std::remove_if(periods.begin(), periods.end(), [this](const auto& period) { std::size_t similar{this->similarModelled(period)}; return canTestPeriod(m_Values, buckets(m_BucketLength, m_MinimumPeriod), period) == false || (similar < m_ModelledPeriodsTestable.size() && m_ModelledPeriodsTestable[similar] == false); }), periods.end()); } CTimeSeriesTestForSeasonality::TPeriodDescriptor CTimeSeriesTestForSeasonality::periodDescriptor(std::size_t period) const { if (period == this->day()) { return CNewSeasonalComponentSummary::E_Day; } if (period == this->week()) { return CNewSeasonalComponentSummary::E_Week; } if (period == this->year()) { return CNewSeasonalComponentSummary::E_Year; } return CNewSeasonalComponentSummary::E_General; } bool CTimeSeriesTestForSeasonality::isDiurnal(std::size_t period) const { core_t::TTime periodInSeconds{static_cast<core_t::TTime>(period) * m_BucketLength}; return periodInSeconds == core::constants::DAY || periodInSeconds == core::constants::WEEK || periodInSeconds == core::constants::YEAR; } bool CTimeSeriesTestForSeasonality::isDiurnal(const TSeasonalComponent& period) const { return this->isDiurnal(period.s_Period); } bool CTimeSeriesTestForSeasonality::isWeekend(const TSeasonalComponent& period) const { return period.windowed() && period.s_Window == this->weekendWindow(); } bool CTimeSeriesTestForSeasonality::isWeekday(const TSeasonalComponent& period) const { return period.windowed() && period.s_Window == this->weekdayWindow(); } bool CTimeSeriesTestForSeasonality::permittedPeriod(const TSeasonalComponent& period) const { return m_MinimumPeriod == 0 || static_cast<core_t::TTime>(period.s_WindowRepeat) * m_BucketLength >= m_MinimumPeriod; } bool CTimeSeriesTestForSeasonality::includesPermittedPeriod(const TSeasonalComponentVec& periods) const { return m_MinimumPeriod == 0 || std::find_if(periods.begin(), periods.end(), [this](const auto& period) { return this->permittedPeriod(period); }) != periods.end(); } std::string CTimeSeriesTestForSeasonality::annotationText(const TSeasonalComponent& period) const { std::ostringstream result; result << "Detected seasonal component with period "; if (period.s_Period == this->day()) { result << core::CTimeUtils::durationToString(core::constants::DAY); } else if (period.s_Period == this->week()) { result << core::CTimeUtils::durationToString(core::constants::WEEK); } else if (period.s_Period == this->year()) { result << core::CTimeUtils::durationToString(core::constants::YEAR); } else { result << core::CTimeUtils::durationToString( m_BucketLength * static_cast<core_t::TTime>(period.s_Period)); } result << (this->isWeekend(period) ? " (weekend)" : (this->isWeekday(period) ? " (weekdays)" : "")); return result.str(); } std::size_t CTimeSeriesTestForSeasonality::day() const { return buckets(m_BucketLength, core::constants::DAY); } std::size_t CTimeSeriesTestForSeasonality::week() const { return buckets(m_BucketLength, core::constants::WEEK); } std::size_t CTimeSeriesTestForSeasonality::year() const { return buckets(m_BucketLength, core::constants::YEAR); } CTimeSeriesTestForSeasonality::TSizeSizePr CTimeSeriesTestForSeasonality::weekdayWindow() const { return {2 * this->day(), 7 * this->day()}; } CTimeSeriesTestForSeasonality::TSizeSizePr CTimeSeriesTestForSeasonality::weekendWindow() const { return {0, 2 * this->day()}; } CTimeSeriesTestForSeasonality::TSeasonalComponent CTimeSeriesTestForSeasonality::toPeriod(core_t::TTime startTime, core_t::TTime bucketLength, const CSeasonalTime& component) { std::size_t periodInBuckets{buckets(bucketLength, component.period())}; if (component.windowed()) { std::size_t startOfWindowInBuckets{ buckets(bucketLength, adjustForStartTime(startTime, component.windowRepeatStart())) % buckets(bucketLength, core::constants::WEEK)}; std::size_t windowRepeatInBuckets{buckets(bucketLength, component.windowRepeat())}; TSizeSizePr windowInBuckets{buckets(bucketLength, component.window().first), buckets(bucketLength, component.window().second)}; return {periodInBuckets, startOfWindowInBuckets, windowRepeatInBuckets, windowInBuckets}; } TSizeSizePr windowInBuckets{0, periodInBuckets}; return {periodInBuckets, 0, periodInBuckets, windowInBuckets}; } core_t::TTime CTimeSeriesTestForSeasonality::adjustForStartTime(core_t::TTime startTime, core_t::TTime startOfWeek) { return (core::constants::WEEK + startOfWeek - (startTime % core::constants::WEEK)) % core::constants::WEEK; } std::size_t CTimeSeriesTestForSeasonality::buckets(core_t::TTime bucketLength, core_t::TTime interval) { return static_cast<std::size_t>((interval + bucketLength / 2) / bucketLength); } bool CTimeSeriesTestForSeasonality::canTestPeriod(const TFloatMeanAccumulatorVec& values, std::size_t minimumPeriod, const TSeasonalComponent& period) { return periodTooShortToTest(minimumPeriod, period) == false && periodTooLongToTest(values, period) == false; } bool CTimeSeriesTestForSeasonality::periodTooLongToTest(const TFloatMeanAccumulatorVec& values, const TSeasonalComponent& period) { std::size_t range{observedRange(values)}; std::size_t gap{longestGap(values)}; return 190 * period.s_WindowRepeat + 100 * gap > 100 * range; } bool CTimeSeriesTestForSeasonality::periodTooShortToTest(std::size_t minimumPeriod, const TSeasonalComponent& period) { return period.s_Period < std::max(minimumPeriod, std::size_t{2}); } std::size_t CTimeSeriesTestForSeasonality::observedRange(const TFloatMeanAccumulatorVec& values) { std::size_t begin; std::size_t end; std::tie(begin, end) = observedInterval(values); return end - begin; } std::size_t CTimeSeriesTestForSeasonality::longestGap(const TFloatMeanAccumulatorVec& values) { std::size_t result{0}; std::size_t begin; std::size_t end; std::tie(begin, end) = observedInterval(values); for (std::size_t i = begin, j = end; j < end; i = j) { for (++j; j < end; ++j) { if (common::CBasicStatistics::count(values[j]) > 0.0) { break; } } result = std::max(result, j - i - 1); } return result; } CTimeSeriesTestForSeasonality::TSizeSizePr CTimeSeriesTestForSeasonality::observedInterval(const TFloatMeanAccumulatorVec& values) { std::size_t begin{0}; std::size_t end{values.size()}; std::size_t size{values.size()}; for (/**/; begin < size && common::CBasicStatistics::count(values[begin]) == 0.0; ++begin) { } for (/**/; end > begin && common::CBasicStatistics::count(values[end - 1]) == 0.0; --end) { } return {begin, end}; } void CTimeSeriesTestForSeasonality::removePredictions(const TSeasonalComponentCRng& periodsToRemove, const TMeanAccumulatorVecCRng& componentsToRemove, TFloatMeanAccumulatorVec& values) { removePredictions( [&](std::size_t i) { double value{0.0}; for (std::size_t j = 0; j < periodsToRemove.size(); ++j) { value += periodsToRemove[j].value(componentsToRemove[j], i); } return value; }, values); } void CTimeSeriesTestForSeasonality::removePredictions(const TBucketPredictor& predictor, TFloatMeanAccumulatorVec& values) { for (std::size_t i = 0; i < values.size(); ++i) { if (common::CBasicStatistics::count(values[i]) > 0.0) { common::CBasicStatistics::moment<0>(values[i]) -= predictor(i); } } } bool CTimeSeriesTestForSeasonality::CMinAmplitude::seenSufficientDataToTestAmplitude( std::size_t range, std::size_t period) { return range >= MINIMUM_REPEATS * period; } void CTimeSeriesTestForSeasonality::CMinAmplitude::add(std::size_t index, const TFloatMeanAccumulator& value) { if (common::CBasicStatistics::count(value) > 0.0) { std::size_t bucket{index / m_BucketLength}; if (bucket < m_BucketAmplitudes.size()) { ++m_Count; m_BucketAmplitudes[bucket].add(common::CBasicStatistics::mean(value) - m_Level); } } } double CTimeSeriesTestForSeasonality::CMinAmplitude::amplitude() const { double amplitudes[]{INF, INF}; for (const auto& bucket : m_BucketAmplitudes) { if (bucket.initialized()) { amplitudes[0] = std::min(amplitudes[0], std::max(-bucket.min(), 0.0)); amplitudes[1] = std::min(amplitudes[1], std::max(bucket.max(), 0.0)); } else { amplitudes[0] = amplitudes[1] = 0.0; break; } } return std::max(amplitudes[0], amplitudes[1]); } double CTimeSeriesTestForSeasonality::CMinAmplitude::significance(const boost::math::normal& normal) const { double amplitude{this->amplitude()}; if (amplitude == 0.0 || m_Count == 0) { return 1.0; } double twoTailPValue{2.0 * common::CTools::safeCdf(normal, -amplitude)}; if (twoTailPValue <= 0.0) { return 0.0; } boost::math::binomial binomial(static_cast<double>(m_Count), twoTailPValue); return common::CTools::safeCdfComplement( binomial, static_cast<double>(m_BucketAmplitudes.size()) - 1.0); } std::string CTimeSeriesTestForSeasonality::CMinAmplitude::print() const { auto appendBucket = [](const TMinMaxAccumulator& bucket, std::ostringstream& result) { if (bucket.initialized()) { result << "(" << bucket.min() << "," << bucket.max() << ")"; } else { result << "-"; } }; std::ostringstream result; result << "count = " << m_Count << " ["; appendBucket(m_BucketAmplitudes[0], result); for (std::size_t i = 1; i < m_BucketAmplitudes.size(); ++i) { result << ", "; appendBucket(m_BucketAmplitudes[i], result); } result << "]"; return result.str(); } void CTimeSeriesTestForSeasonality::SHypothesisStats::testExplainedVariance( const CTimeSeriesTestForSeasonality& params, const TVarianceStats& H0) { auto H1 = params.residualVarianceStats(params.m_ValuesToTest); s_FractionNotMissing = static_cast<double>(H1.s_NumberParameters) / static_cast<double>(params.m_Components[0].size()); s_ResidualVariance = H1.s_ResidualVariance; s_ExplainedVariance = common::CBasicStatistics::maximumLikelihoodVariance( std::accumulate(params.m_Components[0].begin(), params.m_Components[0].end(), TMeanVarAccumulator{}, [](auto result, const auto& value) { if (common::CBasicStatistics::count(value) > 0.0) { result.add(common::CBasicStatistics::mean(value)); } return result; })); s_NumberParametersToExplainVariance = H1.s_NumberParameters + s_NumberScaleSegments - 1; s_ExplainedVariancePValue = CSignal::nestedDecompositionPValue(H0, H1); LOG_TRACE(<< "fraction not missing = " << s_FractionNotMissing); LOG_TRACE(<< H1.print() << " vs " << H0.print()); LOG_TRACE(<< "p-value = " << s_ExplainedVariancePValue); } void CTimeSeriesTestForSeasonality::SHypothesisStats::testAutocorrelation( const CTimeSeriesTestForSeasonality& params) { CSignal::TFloatMeanAccumulatorCRng valuesToTestAutocorrelation{ params.m_ValuesToTest, 0, common::CIntegerTools::floor(params.m_ValuesToTest.size(), params.m_Periods[0].period())}; double autocorrelations[]{ CSignal::cyclicAutocorrelation( // Normal params.m_Periods[0], valuesToTestAutocorrelation, [](const TFloatMeanAccumulator& value) { return common::CBasicStatistics::mean(value); }, [](const TFloatMeanAccumulator& value) { return common::CBasicStatistics::count(value); }, params.m_EpsVariance), CSignal::cyclicAutocorrelation( // Not reweighting outliers params.m_Periods[0], valuesToTestAutocorrelation, [](const TFloatMeanAccumulator& value) { return common::CBasicStatistics::mean(value); }, [](const TFloatMeanAccumulator&) { return 1.0; }, params.m_EpsVariance), CSignal::cyclicAutocorrelation( // Absolute values params.m_Periods[0], valuesToTestAutocorrelation, [](const TFloatMeanAccumulator& value) { return std::fabs(common::CBasicStatistics::mean(value)); }, [](const TFloatMeanAccumulator& value) { return common::CBasicStatistics::count(value); }, params.m_EpsVariance), CSignal::cyclicAutocorrelation( // Not reweighting outliers and absolute values params.m_Periods[0], valuesToTestAutocorrelation, [](const TFloatMeanAccumulator& value) { return std::fabs(common::CBasicStatistics::mean(value)); }, [](const TFloatMeanAccumulator&) { return 1.0; }, params.m_EpsVariance)}; LOG_TRACE(<< "autocorrelations = " << autocorrelations); s_Autocorrelation = *std::max_element(std::begin(autocorrelations), std::begin(autocorrelations) + 2); s_AutocorrelationUpperBound = *std::max_element(std::begin(autocorrelations), std::end(autocorrelations)); LOG_TRACE(<< "autocorrelation = " << s_Autocorrelation << ", autocorrelation upper bound = " << s_AutocorrelationUpperBound); } void CTimeSeriesTestForSeasonality::SHypothesisStats::testAmplitude(const CTimeSeriesTestForSeasonality& params) { using TMeanAccumulator = common::CBasicStatistics::SSampleMean<double>::TAccumulator; s_SeenSufficientDataToTestAmplitude = CMinAmplitude::seenSufficientDataToTestAmplitude( observedRange(params.m_ValuesToTest), params.m_Periods[0].s_Period); if (s_SeenSufficientDataToTestAmplitude == false) { return; } double level{common::CBasicStatistics::mean(std::accumulate( params.m_ValuesToTest.begin(), params.m_ValuesToTest.end(), TMeanAccumulator{}, [](TMeanAccumulator partialLevel, const TFloatMeanAccumulator& value) { partialLevel.add(common::CBasicStatistics::mean(value), common::CBasicStatistics::count(value)); return partialLevel; }))}; params.m_Amplitudes.assign(params.m_Periods[0].period(), {params.m_ValuesToTest.size(), s_MeanNumberRepeats, level}); for (std::size_t i = 0; i < params.m_ValuesToTest.size(); ++i) { if (params.m_Periods[0].contains(i)) { params.m_Amplitudes[params.m_Periods[0].offset(i)].add( i, params.m_ValuesToTest[i]); } } double pValue{1.0}; if (s_ResidualVariance <= 0.0) { pValue = std::find_if(params.m_Amplitudes.begin(), params.m_Amplitudes.end(), [](const auto& amplitude) { return amplitude.amplitude() > 0.0; }) != params.m_Amplitudes.end() ? 0.0 : 1.0; } else { boost::math::normal normal(0.0, std::sqrt(s_ResidualVariance)); for (const auto& amplitude : params.m_Amplitudes) { if (amplitude.amplitude() >= 2.0 * boost::math::standard_deviation(normal)) { pValue = std::min(pValue, amplitude.significance(normal)); } } } s_AmplitudePValue = std::max( common::CTools::oneMinusPowOneMinusX( pValue, static_cast<double>(std::count_if( params.m_Amplitudes.begin(), params.m_Amplitudes.end(), [](const auto& amplitude) { return amplitude.amplitude() > 0.0; }))), common::CTools::smallestProbability()); LOG_TRACE(<< "amplitude p-value = " << s_AmplitudePValue); } common::CFuzzyTruthValue CTimeSeriesTestForSeasonality::SHypothesisStats::varianceTestResult( const CTimeSeriesTestForSeasonality& params) const { // We have the following hard constraints: // 1. We need to see at least m_MinimumRepeatsPerSegmentToTestVariance // repeats of the seasonality. // 2. The test p-value needs to be less than m_SignificantPValue. // 3. The autocorrelation needs to be higher than m_MediumAutocorrelation. // // We also get more confident the more non-missing values we see per repeat, // if we have very high signficance or very high autocorrelation and less // confident if we've seen very few repeats or have low autocorrelation. // // In order to make final decision we soften the hard constraints using a fuzzy // logic approach. This uses a standard form with a logistic function to represent // the truth value of a proposition and multiplicative AND. This has the effect // that missing any constraint significantly means the test fails, but we can // still take advantage of a constraint which nearly meets if some other one // is comfortably satisfied. In this context, the width of the fuzzy region is // relatively small, typically 10% of the constraint value. // // The other considerations are one-sided, i.e. they either *only* increase // or decrease the truth value of the overall proposition. This is done by // setting them to the max or min of the constraint value and the decision // boundary. double minimumRepeatsPerSegment{params.m_MinimumRepeatsPerSegmentToTestVariance}; double mediumAutocorrelation{params.m_MediumAutocorrelation}; double lowAutocorrelation{params.m_LowAutocorrelation}; double highAutocorrelation{params.m_HighAutocorrelation}; double logSignificantPValue{std::log(params.m_SignificantPValue)}; double logVerySignificantPValue{std::log(params.m_VerySignificantPValue)}; std::size_t segments{std::max(s_NumberTrendSegments, std::size_t{1}) + s_NumberScaleSegments - 1}; double repeatsPerSegment{s_MeanNumberRepeats / static_cast<double>(segments)}; double windowRepeatsPerSegment{ segments > 1 ? s_WindowRepeats / static_cast<double>(segments) : 2.0}; double logPValue{std::log(std::max(s_ExplainedVariancePValue, std::numeric_limits<double>::min()))}; return common::fuzzyGreaterThan(repeatsPerSegment / minimumRepeatsPerSegment, 1.0, 0.3) && common::fuzzyGreaterThan(std::min(repeatsPerSegment / 2.0, 1.0), 1.0, 0.1) && common::fuzzyGreaterThan(std::min(windowRepeatsPerSegment / 2.0, 1.0), 1.0, 0.1) && common::fuzzyLessThan(std::max(s_LeastCommonRepeat / 0.5, 1.0), 1.0, 0.5) && common::fuzzyGreaterThan(s_FractionNotMissing, 1.0, 0.5) && common::fuzzyGreaterThan(logPValue / logSignificantPValue, 1.0, 0.1) && common::fuzzyGreaterThan( std::max(logPValue / logVerySignificantPValue, 1.0), 1.0, 0.1) && common::fuzzyGreaterThan(s_Autocorrelation / mediumAutocorrelation, 1.0, 0.2) && common::fuzzyGreaterThan( std::min(s_Autocorrelation / lowAutocorrelation, 1.0), 1.0, 0.1) && common::fuzzyGreaterThan( std::max(s_Autocorrelation / highAutocorrelation, 1.0), 1.0, 0.1); } common::CFuzzyTruthValue CTimeSeriesTestForSeasonality::SHypothesisStats::amplitudeTestResult( const CTimeSeriesTestForSeasonality& params) const { if (s_SeenSufficientDataToTestAmplitude == false) { return common::CFuzzyTruthValue::OR_UNDETERMINED_VALUE; } // Compare with the discussion in testVariance. double minimumRepeatsPerSegment{params.m_MinimumRepeatsPerSegmentToTestAmplitude}; double lowAutocorrelation{params.m_LowAutocorrelation}; double logSignificantPValue{std::log(params.m_SignificantPValue)}; double logVerySignificantPValue{std::log(params.m_VerySignificantPValue)}; std::size_t segments{std::max(s_NumberTrendSegments, std::size_t{1}) + s_NumberScaleSegments - 1}; double repeatsPerSegment{s_MeanNumberRepeats / static_cast<double>(segments)}; double windowRepeatsPerSegment{ segments > 1 ? s_WindowRepeats / static_cast<double>(segments) : 2.0}; double autocorrelation{s_AutocorrelationUpperBound}; double logPValue{ std::log(std::max(s_AmplitudePValue, std::numeric_limits<double>::min()))}; return common::fuzzyGreaterThan(repeatsPerSegment / minimumRepeatsPerSegment, 1.0, 0.1) && common::fuzzyGreaterThan(std::min(repeatsPerSegment / 2.0, 1.0), 1.0, 0.1) && common::fuzzyGreaterThan(std::min(windowRepeatsPerSegment / 2.0, 1.0), 1.0, 0.1) && common::fuzzyLessThan(std::max(s_LeastCommonRepeat / 0.5, 1.0), 1.0, 0.5) && common::fuzzyGreaterThan(s_FractionNotMissing, 1.0, 0.5) && common::fuzzyGreaterThan(autocorrelation / lowAutocorrelation, 1.0, 0.2) && common::fuzzyGreaterThan(logPValue / logSignificantPValue, 1.0, 0.1) && common::fuzzyGreaterThan( std::max(logPValue / logVerySignificantPValue, 1.0), 1.0, 0.1); } bool CTimeSeriesTestForSeasonality::SHypothesisStats::isBetter(const SHypothesisStats& other) const { // We check (lexicographically): // 1. "is testable" which is equivalent to if the stats have been initialized, // 2. The truth value, // 3. The amount of variance the hypothesis explains, which doesn't saturate // like the truth value. double min{std::numeric_limits<double>::min()}; return common::COrderings::lexicographicalCompare( other.s_IsTestable, other.s_Truth.boolean(), 1.0 * std::log(std::max(other.s_Truth.value(), min)) + 0.5 * std::log(-std::log(std::max(other.s_ExplainedVariancePValue, min))), s_IsTestable, s_Truth.boolean(), 1.0 * std::log(std::max(s_Truth.value(), min)) + 0.5 * std::log(-std::log(std::max(s_ExplainedVariancePValue, min)))); } bool CTimeSeriesTestForSeasonality::SHypothesisStats::evict(const CTimeSeriesTestForSeasonality& params, std::size_t modelledIndex) const { return s_ExplainedVariancePValue > params.m_PValueToEvict && s_AmplitudePValue > params.m_PValueToEvict && this->isEvictionPermitted(params, modelledIndex); } bool CTimeSeriesTestForSeasonality::SHypothesisStats::isEvictionPermitted( const CTimeSeriesTestForSeasonality& params, std::size_t modelledIndex) const { std::size_t range{params.m_ModelledPeriods[modelledIndex].fractionInWindow( observedRange(params.m_Values))}; std::size_t period{params.m_ModelledPeriods[modelledIndex].period()}; return params.m_ModelledPeriodsTestable[modelledIndex] && 4 * period >= params.m_ModelledPeriodsSizes[modelledIndex] && CMinAmplitude::seenSufficientDataToTestAmplitude(range, period); } double CTimeSeriesTestForSeasonality::SHypothesisStats::weight() const { return s_ExplainedVariance * static_cast<double>(s_Period.s_Window.second - s_Period.s_Window.first) / static_cast<double>(s_Period.s_WindowRepeat); } std::string CTimeSeriesTestForSeasonality::SHypothesisStats::print() const { return s_Period.print() + "/" + std::to_string(s_NumberScaleSegments) + "/" + std::to_string(s_NumberTrendSegments); } bool CTimeSeriesTestForSeasonality::SModel::isTestable() const { for (const auto& hypothesis : s_Hypotheses) { if (hypothesis.s_IsTestable == false) { return false; } } return true; } bool CTimeSeriesTestForSeasonality::SModel::isNull() const { return s_Hypotheses.empty() && common::CBasicStatistics::count(s_ResidualMoments) > this->numberParameters(); } bool CTimeSeriesTestForSeasonality::SModel::isAlternative() const { return this->isNull() == false && common::CBasicStatistics::count(s_ResidualMoments) > this->numberParameters(); } double CTimeSeriesTestForSeasonality::SModel::componentsSimilarity() const { if (s_AlreadyModelled == false) { return 0.0; } for (const auto& hypothesis : s_Hypotheses) { if (hypothesis.s_DiscardingModel) { return 0.5; } } return 1.0; } bool CTimeSeriesTestForSeasonality::SModel::isEvictionPermitted() const { if (s_AlreadyModelled == false) { return false; } for (std::size_t i = 0; i < s_Hypotheses.size(); ++i) { if (s_Hypotheses[i].isEvictionPermitted(*s_Params, i) == false) { return false; } } return true; } double CTimeSeriesTestForSeasonality::SModel::pValue(const SModel& H0, double minimumRelativeTruncatedVariance, double unexplainedVariance) const { double n[]{common::CBasicStatistics::count(H0.s_ResidualMoments), common::CBasicStatistics::count(H0.s_TruncatedResidualMoments)}; double v0[]{common::CBasicStatistics::maximumLikelihoodVariance(H0.s_ResidualMoments) + unexplainedVariance, common::CBasicStatistics::maximumLikelihoodVariance(H0.s_TruncatedResidualMoments) + unexplainedVariance}; double v1[]{std::max(common::CBasicStatistics::maximumLikelihoodVariance(s_ResidualMoments), std::numeric_limits<double>::epsilon() * v0[0]) + unexplainedVariance, std::max(common::CBasicStatistics::maximumLikelihoodVariance(s_TruncatedResidualMoments), std::numeric_limits<double>::epsilon() * v0[1]) + unexplainedVariance}; double df0[]{n[0] - H0.numberParameters(), n[1] - H0.numberParameters()}; double df1[]{common::CBasicStatistics::count(s_ResidualMoments) - this->numberParameters(), common::CBasicStatistics::count(s_TruncatedResidualMoments) - this->numberParameters()}; v0[1] += minimumRelativeTruncatedVariance * v0[0]; v1[1] += minimumRelativeTruncatedVariance * v1[0]; return std::max( std::min(common::CStatisticalTests::rightTailFTest(v0[0], v1[0], df0[0], df1[0]), common::CStatisticalTests::rightTailFTest(v0[1], v1[1], df0[1], df1[1])), common::CTools::smallestProbability()); } double CTimeSeriesTestForSeasonality::SModel::logPValueProxy(const SModel& H0) const { // We use minus the number of standard deviations above the mean of the F-distribution. double v0[]{common::CBasicStatistics::maximumLikelihoodVariance(H0.s_ResidualMoments), common::CBasicStatistics::maximumLikelihoodVariance(H0.s_TruncatedResidualMoments)}; double v1[]{std::max(common::CBasicStatistics::maximumLikelihoodVariance(s_ResidualMoments), 1e-3 * v0[0]), std::max(common::CBasicStatistics::maximumLikelihoodVariance(s_TruncatedResidualMoments), 1e-3 * v0[1])}; double df0[]{common::CBasicStatistics::count(H0.s_ResidualMoments) - H0.numberParameters(), common::CBasicStatistics::count(H0.s_TruncatedResidualMoments) - H0.numberParameters()}; double df1[]{common::CBasicStatistics::count(s_ResidualMoments) - this->numberParameters(), common::CBasicStatistics::count(s_TruncatedResidualMoments) - this->numberParameters()}; double result{0.0}; for (auto i : {0, 1}) { // d2 needs to be > 4 for finite variance. We can happily use 0.0 for the log(p-value) // proxy if this condition is not satisfied. if (df1[i] > 4.0 && df0[i] > 0.0) { boost::math::fisher_f f{df0[i], df1[i]}; double mean{boost::math::mean(f)}; double sd{boost::math::standard_deviation(f)}; result = std::max(result, ((v0[i] * df1[i]) / (v1[i] * df0[i]) - mean) / sd); } } return -result; }; CTimeSeriesTestForSeasonality::TVector2x1 CTimeSeriesTestForSeasonality::SModel::explainedVariancePerParameter(double variance, double truncatedVariance) const { TVector2x1 explainedVariance; explainedVariance(0) = variance - common::CBasicStatistics::maximumLikelihoodVariance(s_ResidualMoments); explainedVariance(1) = truncatedVariance - common::CBasicStatistics::maximumLikelihoodVariance( s_TruncatedResidualMoments); TVector2x1 result{0.0}; double Z{0.0}; for (const auto& hypothesis : s_Hypotheses) { if (hypothesis.s_Model || hypothesis.s_DiscardingModel == false) { double weight{hypothesis.weight()}; result += weight * explainedVariance / static_cast<double>(hypothesis.s_NumberParametersToExplainVariance); Z += weight; } } return max(result == TVector2x1{0.0} ? result : result / Z, TVector2x1{std::numeric_limits<double>::min()}); } double CTimeSeriesTestForSeasonality::SModel::numberParameters() const { return static_cast<double>(std::accumulate( s_Hypotheses.begin(), s_Hypotheses.end(), s_NumberTrendParameters + 1, [this](std::size_t partialNumber, const auto& hypothesis) { auto i = std::find_if( s_Hypotheses.begin(), s_Hypotheses.end(), [&](const auto& hypothesis_) { return hypothesis.s_Period.nested(hypothesis_.s_Period); }); return partialNumber + ((hypothesis.s_Model || hypothesis.s_DiscardingModel == false) && i == s_Hypotheses.end() ? hypothesis.s_NumberParametersToExplainVariance : 0); })); } double CTimeSeriesTestForSeasonality::SModel::targetModelSize() const { return std::accumulate( s_Hypotheses.begin(), s_Hypotheses.end(), 0.0, [&](auto partialSize, const auto& hypothesis) { if (hypothesis.s_Model) { partialSize += static_cast<double>(hypothesis.s_ModelSize); } else if (hypothesis.s_DiscardingModel == false) { partialSize += static_cast<double>( s_Params->m_ModelledPeriodsSizes[hypothesis.s_SimilarModelled]); } return partialSize; }); } double CTimeSeriesTestForSeasonality::SModel::numberScalings() const { std::size_t segments{0}; for (const auto& hypothesis : s_Hypotheses) { if (hypothesis.s_Model || hypothesis.s_DiscardingModel == false) { segments += hypothesis.s_NumberScaleSegments - 1; } } return static_cast<double>(segments); } double CTimeSeriesTestForSeasonality::SModel::autocorrelation() const { double result{0.0}; double Z{0.0}; for (const auto& hypothesis : s_Hypotheses) { if (hypothesis.s_Model || hypothesis.s_DiscardingModel == false) { double weight{hypothesis.weight()}; result += weight * hypothesis.s_Autocorrelation; Z += weight; } } return result == 0.0 ? 0.0 : result / Z; } double CTimeSeriesTestForSeasonality::SModel::leastCommonRepeat() const { std::size_t result{1}; for (const auto& hypothesis : s_Hypotheses) { if (hypothesis.s_Model || hypothesis.s_DiscardingModel == false) { result = common::CIntegerTools::lcm(result, hypothesis.s_Period.s_WindowRepeat); } } return static_cast<double>(result) / static_cast<double>(observedRange(s_Params->m_Values)); } } } }

lib/maths/time_series/CTimeSeriesTestForSeasonality.cc (1,842 lines of code) (raw):