in lib/maths/time_series/CCalendarCyclicTest.cc [216:346]
CCalendarCyclicTest::TFeatureTimePrVec CCalendarCyclicTest::test() const {
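    // Nothing can be tested until we've accumulated some prediction error statistics.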
if (m_ErrorQuantiles.count() == 0) {
return {};
}
// The statistics we need in order to be able to test for calendar features.
using TMaxDoubleAccumulator =
common::CBasicStatistics::SMax<double, MINIMUM_REPEATS>::TAccumulator;
struct SStats {
unsigned int s_Repeats{0};
unsigned int s_RepeatsWithLargeErrors{0};
double s_ErrorSum{0.0};
double s_ErrorCount{0.0};
double s_LargeErrorCount{0.0};
double s_VeryLargeErrorCount{0.0};
double s_PValue{0.0};
TMaxDoubleAccumulator s_Cdf;
};
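    // Statistics are aggregated per (calendar feature, candidate time zone offset) pair.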
using TFeatureStatsUMap =
boost::unordered_map<std::pair<CCalendarFeature, core_t::TTime>, SStats, SHashAndOffsetFeature>;
TErrorStatsVec errors{this->inflate()};
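    // Candidate features are ordered by decreasing error sum, breaking ties in
    // favour of the smaller absolute time zone offset.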
struct SFeatureAndError {
bool operator<(const SFeatureAndError& rhs) const {
return common::COrderings::lexicographicalCompare(
-s_ErrorSum, std::fabs(s_Offset), -rhs.s_ErrorSum, std::fabs(rhs.s_Offset));
}
CCalendarFeature s_Feature;
core_t::TTime s_Offset;
double s_ErrorSum;
};
using TMaxErrorFeatureAccumulator =
common::CBasicStatistics::COrderStatisticsStack<SFeatureAndError, 3>;
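    // Retains the three candidate features with the largest error sums.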
TMaxErrorFeatureAccumulator largestErrorFeatures;
double errorThreshold;
m_ErrorQuantiles.quantile(50.0, errorThreshold);
errorThreshold *= 2.0;
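    // A feature is only worth modelling if its mean large error exceeds twice the
    // median of all errors (see the check on the error sum below).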
TFeatureStatsUMap stats{TIME_ZONE_OFFSETS.size() * errors.size()};
// Note that the current index points to the next bucket to overwrite,
// i.e. the earliest bucket error statistics we have. The start of
// this bucket is WINDOW before the start time of the current partial
// bucket.
for (core_t::TTime i = m_CurrentBucketIndex, time = m_CurrentBucketTime - WINDOW;
time < m_CurrentBucketTime; i = (i + 1) % SIZE, time += BUCKET) {
if (errors[i].s_Count > 0) {
double n{static_cast<double>(errors[i].s_Count)};
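            // The large and very large error counts are packed into a single
            // integer: the low bits (modulo 2^17) hold the large error count
            // and the high bits hold the very large error count.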
double nl{static_cast<double>(errors[i].s_LargeErrorCount % (1 << 17))};
double nv{static_cast<double>(errors[i].s_LargeErrorCount / (1 << 17))};
double pValue{this->errorsPValue(n, nl, nv)};
            // The maximum value of a set is at least as large as its mean so
            // the cdf we compute here is a lower bound for the cdf of the
            // largest error, i.e. one minus it is a conservative upper bound
            // for the largest error's right tail probability.
double cdf;
m_ErrorQuantiles.cdf(this->errorAdj(errors[i].s_LargeErrorSum / n), cdf);
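            // For each candidate time zone offset, attribute this bucket's errors
            // to the calendar features which apply at the bucket midpoint in that
            // time zone.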
for (auto offset : TIME_ZONE_OFFSETS) {
core_t::TTime midpoint{time + BUCKET / 2 + offset};
for (auto feature : CCalendarFeature::features(midpoint)) {
if (feature.testForTimeZoneOffset(offset)) {
SStats& stat = stats[std::make_pair(feature, offset)];
stat.s_Repeats += 1;
stat.s_RepeatsWithLargeErrors += nl > 0 ? 1 : 0;
stat.s_ErrorSum += errors[i].s_LargeErrorSum;
stat.s_ErrorCount += n;
stat.s_LargeErrorCount += nl;
stat.s_VeryLargeErrorCount += nv;
stat.s_PValue = std::max(stat.s_PValue, pValue);
stat.s_Cdf.add(cdf);
}
}
}
}
}
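    // Decide which candidate features to report. A feature needs enough repeats
    // and a large enough error sum, and must either be significant on every
    // repeat individually or, for long buckets with enough large-error repeats,
    // jointly significant over the whole window.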
for (const auto& stat : stats) {
        auto [feature, offset] = stat.first;
unsigned int repeats{stat.second.s_Repeats};
unsigned int repeatsWithLargeErrors{stat.second.s_RepeatsWithLargeErrors};
double n{stat.second.s_ErrorCount};
double nl{stat.second.s_LargeErrorCount};
double nv{stat.second.s_VeryLargeErrorCount};
double sl{stat.second.s_ErrorSum};
double maxBucketPValue{stat.second.s_PValue};
double cdf{stat.second.s_Cdf.biggest()};
if (repeats < MINIMUM_REPEATS || // Insufficient repeats
sl <= errorThreshold * nl) { // Error too small to bother modelling
continue;
}
if (std::pow(maxBucketPValue, repeats) < MAXIMUM_P_VALUE) { // High significance for each repeat
largestErrorFeatures.add(SFeatureAndError{feature, offset, sl});
continue;
}
if (m_BucketLength < LONG_BUCKET || // Short raw data buckets
repeatsWithLargeErrors < MINIMUM_REPEATS) { // Too few repeats with large errors
continue;
}
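        // Fall back to a joint test over the whole window, combining the p-value
        // of the window's large error counts with binomialPValueAdj (presumably a
        // corrected binomial tail probability for seeing this many errors at
        // least as extreme).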
double windowPValue{std::min(this->errorsPValue(n, nl, nv),
binomialPValueAdj(n, 1.0 - cdf, repeats))};
if (windowPValue < MAXIMUM_P_VALUE) { // High joint significance for all repeats
largestErrorFeatures.add(SFeatureAndError{feature, offset, sl});
}
}
// Extract features and offsets in descending error sum order.
TFeatureTimePrVec result;
largestErrorFeatures.sort();
std::transform(largestErrorFeatures.begin(), largestErrorFeatures.end(),
std::back_inserter(result), [](const auto& featureAndError) {
return std::make_pair(featureAndError.s_Feature,
featureAndError.s_Offset);
});
// Enforce a single time zone.
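    // Keep only features whose offset matches that of the most significant
    // feature, i.e. the one with the largest error sum.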
if (result.size() > 1) {
auto offset = result[0].second;
result.erase(std::remove_if(result.begin(), result.end(),
[&](const auto& featureAndOffset) {
return featureAndOffset.second != offset;
}),
result.end());
}
return result;
}