CCalendarCyclicTest::TFeatureTimePrVec CCalendarCyclicTest::test() const

in lib/maths/time_series/CCalendarCyclicTest.cc [216:346]


CCalendarCyclicTest::TFeatureTimePrVec CCalendarCyclicTest::test() const {

    if (m_ErrorQuantiles.count() == 0) {
        return {};
    }

    // The statistics we need to test for calendar features.
    using TMaxDoubleAccumulator =
        common::CBasicStatistics::SMax<double, MINIMUM_REPEATS>::TAccumulator;
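    // Per feature and time zone offset statistics: the number of repeats of
    // the feature with errors, the number of repeats with large errors, the
    // error sums and counts, the largest per-bucket p-value and the largest
    // error c.d.f. values.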
    struct SStats {
        unsigned int s_Repeats{0};
        unsigned int s_RepeatsWithLargeErrors{0};
        double s_ErrorSum{0.0};
        double s_ErrorCount{0.0};
        double s_LargeErrorCount{0.0};
        double s_VeryLargeErrorCount{0.0};
        double s_PValue{0.0};
        TMaxDoubleAccumulator s_Cdf;
    };
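    // The test statistics keyed by (calendar feature, time zone offset).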
    using TFeatureStatsUMap =
        boost::unordered_map<std::pair<CCalendarFeature, core_t::TTime>, SStats, SHashAndOffsetFeature>;

    TErrorStatsVec errors{this->inflate()};
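    // Note that inflate decompresses the bucket error statistics, which are
    // stored compressed.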

    struct SFeatureAndError {
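        // Order by decreasing error sum breaking ties by increasing absolute
        // time zone offset.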
        bool operator<(const SFeatureAndError& rhs) const {
            return common::COrderings::lexicographicalCompare(
                -s_ErrorSum, std::fabs(s_Offset), -rhs.s_ErrorSum, std::fabs(rhs.s_Offset));
        }
        CCalendarFeature s_Feature;
        core_t::TTime s_Offset;
        double s_ErrorSum;
    };
    using TMaxErrorFeatureAccumulator =
        common::CBasicStatistics::COrderStatisticsStack<SFeatureAndError, 3>;

    TMaxErrorFeatureAccumulator largestErrorFeatures;

    double errorThreshold;
    m_ErrorQuantiles.quantile(50.0, errorThreshold);
    errorThreshold *= 2.0;
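    // That is, twice the median error: features whose mean large error does
    // not exceed this are not worth modelling.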

    TFeatureStatsUMap stats{TIME_ZONE_OFFSETS.size() * errors.size()};

    // Note that the current index points to the next bucket to overwrite,
    // i.e. the earliest bucket error statistics we have. The start of
    // this bucket is WINDOW before the start time of the current partial
    // bucket.
    for (core_t::TTime i = m_CurrentBucketIndex, time = m_CurrentBucketTime - WINDOW;
         time < m_CurrentBucketTime; i = (i + 1) % SIZE, time += BUCKET) {
        if (errors[i].s_Count > 0) {
            double n{static_cast<double>(errors[i].s_Count)};
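            // The large and very large error counts are packed into a single
            // integer: the low 17 bits hold the large error count and the
            // remaining high bits the very large error count.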
            double nl{static_cast<double>(errors[i].s_LargeErrorCount % (1 << 17))};
            double nv{static_cast<double>(errors[i].s_LargeErrorCount / (1 << 17))};
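            // The p-value for seeing this many large and very large errors
            // among n samples in a single bucket.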
            double pValue{this->errorsPValue(n, nl, nv)};
            // The maximum value of a set is at least as large as its mean,
            // so the c.d.f. at the mean lower bounds the c.d.f. of the
            // largest error. This gives a conservative, i.e. upper bound,
            // estimate of the right tail probability of the largest error.
            double cdf;
            m_ErrorQuantiles.cdf(this->errorAdj(errors[i].s_LargeErrorSum / n), cdf);

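            // Update the statistics of every calendar feature, for each
            // candidate time zone offset, which matches this bucket.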
            for (auto offset : TIME_ZONE_OFFSETS) {
                core_t::TTime midpoint{time + BUCKET / 2 + offset};
                for (auto feature : CCalendarFeature::features(midpoint)) {
                    if (feature.testForTimeZoneOffset(offset)) {
                        SStats& stat = stats[std::make_pair(feature, offset)];
                        stat.s_Repeats += 1;
                        stat.s_RepeatsWithLargeErrors += nl > 0 ? 1 : 0;
                        stat.s_ErrorSum += errors[i].s_LargeErrorSum;
                        stat.s_ErrorCount += n;
                        stat.s_LargeErrorCount += nl;
                        stat.s_VeryLargeErrorCount += nv;
                        stat.s_PValue = std::max(stat.s_PValue, pValue);
                        stat.s_Cdf.add(cdf);
                    }
                }
            }
        }
    }

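    // Select up to three features, preferring the largest error sums, which
    // repeat often enough and whose errors are statistically significant.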
    for (const auto& stat : stats) {
        auto [feature, offset] = stat.first;
        unsigned int repeats{stat.second.s_Repeats};
        unsigned int repeatsWithLargeErrors{stat.second.s_RepeatsWithLargeErrors};
        double n{stat.second.s_ErrorCount};
        double nl{stat.second.s_LargeErrorCount};
        double nv{stat.second.s_VeryLargeErrorCount};
        double sl{stat.second.s_ErrorSum};
        double maxBucketPValue{stat.second.s_PValue};
        double cdf{stat.second.s_Cdf.biggest()};

        if (repeats < MINIMUM_REPEATS || // Insufficient repeats
            sl <= errorThreshold * nl) { // Error too small to bother modelling
            continue;
        }
        if (std::pow(maxBucketPValue, repeats) < MAXIMUM_P_VALUE) { // High significance for each repeat
            largestErrorFeatures.add(SFeatureAndError{feature, offset, sl});
            continue;
        }
        if (m_BucketLength < LONG_BUCKET || // Short raw data buckets
            repeatsWithLargeErrors < MINIMUM_REPEATS) { // Too few repeats with large errors
            continue;
        }
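        // The joint significance of all repeats: the smaller of the p-value
        // of the window's large error counts and the (adjusted) binomial
        // p-value for the number of repeats whose largest errors are at
        // least this extreme.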
        double windowPValue{std::min(this->errorsPValue(n, nl, nv),
                                     binomialPValueAdj(n, 1.0 - cdf, repeats))};
        if (windowPValue < MAXIMUM_P_VALUE) { // High joint significance for all repeats
            largestErrorFeatures.add(SFeatureAndError{feature, offset, sl});
        }
    }

    // Extract features and offsets in descending error sum order.
    TFeatureTimePrVec result;
    largestErrorFeatures.sort();
    std::transform(largestErrorFeatures.begin(), largestErrorFeatures.end(),
                   std::back_inserter(result), [](const auto& featureAndError) {
                       return std::make_pair(featureAndError.s_Feature,
                                             featureAndError.s_Offset);
                   });

    // Enforce a single time zone: keep only features whose offset matches
    // that of the feature with the largest error sum.
    if (result.size() > 1) {
        auto offset = result[0].second;
        result.erase(std::remove_if(result.begin(), result.end(),
                                    [&](const auto& featureAndOffset) {
                                        return featureAndOffset.second != offset;
                                    }),
                     result.end());
    }

    return result;
}
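
For context, binomialPValueAdj is a helper defined elsewhere in this file. A
minimal sketch of the unadjusted quantity it presumably refines, the right
tail probability of a binomial count, might look as follows; the helper name,
signature and use of boost::math are illustrative assumptions, not this
file's implementation:

#include <boost/math/distributions/binomial.hpp>

// Right tail p-value P(X >= k) for X ~ Binomial(n, p): the chance of seeing
// at least k errors as extreme as observed among n samples when each one is
// that extreme with probability p. (Hypothetical sketch; binomialPValueAdj
// applies an additional adjustment not shown here.)
double binomialRightTailPValue(double n, double p, double k) {
    if (k <= 0.0) {
        return 1.0;
    }
    boost::math::binomial binomial{n, p};
    return boost::math::cdf(boost::math::complement(binomial, k - 1.0));
}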