void CTimeSeriesSegmentation::fitTopDownPiecewiseLinear()

in lib/maths/time_series/CTimeSeriesSegmentation.cc [319:447]


void CTimeSeriesSegmentation::fitTopDownPiecewiseLinear(ITR begin,
                                                        ITR end,
                                                        std::size_t depth,
                                                        std::size_t offset,
                                                        std::size_t maxDepth,
                                                        double pValueToSegment,
                                                        double outlierFraction,
                                                        TSizeVec& segmentation,
                                                        TDoubleDoublePrVec& depthAndPValue,
                                                        TFloatMeanAccumulatorVec& reweighted) {

    // We want to find the partition of [begin, end) into linear models which
    // efficiently predicts the values. To this end we minimize residual variance
    // whilst maintaining a parsimonious model. We achieve the later objective by
    // doing model selection for each candidate segment using a significance test
    // of the explained variance.

    auto range = std::distance(begin, end);
    std::ptrdiff_t initialStep{4};
    if (range < 2 * initialStep || depth > maxDepth) {
        return;
    }

    LOG_TRACE(<< "Considering splitting [" << offset << "," << offset + range << ")");

    double startTime{static_cast<double>(offset)};
    TMeanVarAccumulator residualMoments{[&] {
        reweighted.assign(begin, end);
        TRegression model{fitLinearModel(reweighted.cbegin(), reweighted.cend(), startTime)};
        auto predict = [&](std::size_t i) {
            double time{startTime + static_cast<double>(i)};
            return model.predict(time);
        };
        CSignal::reweightOutliers(predict, outlierFraction, reweighted);
        return centredResidualMoments(reweighted.cbegin(), reweighted.cend(), startTime);
    }()};

    if (common::CBasicStatistics::variance(residualMoments) > 0.0) {

        // We iterate through every possible binary split of the data into
        // contiguous ranges looking for the split which minimizes the total
        // variance of the values minus linear model predictions. Outliers
        // are removed based on an initial best split.

        TMinAccumulator<ITR> minResidualVariance;
        reweighted.assign(begin, end);

        for (auto reweights : {0, 1}) {
            LOG_TRACE(<< "  reweights = " << reweights);

            minResidualVariance = minimumResidualVarianceSplit(
                reweighted.cbegin(), reweighted.cend(), initialStep, startTime, reweighted);
            ITR split{minResidualVariance[0].second};
            minResidualVariance = minimumResidualVarianceSplit(
                split - std::min(split - reweighted.cbegin(), initialStep),
                split + std::min(reweighted.cend() - split, initialStep),
                1 /*unit step*/, startTime, reweighted);

            if (outlierFraction < 0.0 || outlierFraction >= 1.0) {
                LOG_ERROR(<< "Ignoring bad value for outlier fraction " << outlierFraction
                          << ". This must be in the range [0,1).");
                break;
            }
            if (reweights == 1 || outlierFraction == 0.0) {
                break;
            }

            split = minResidualVariance[0].second;
            double splitTime{static_cast<double>(std::distance(reweighted.cbegin(), split))};
            TRegression leftModel{fitLinearModel(reweighted.cbegin(), split, startTime)};
            TRegression rightModel{fitLinearModel(split, reweighted.cend(), splitTime)};
            TRegressionArray leftParameters;
            TRegressionArray rightParameters;
            leftModel.parameters(leftParameters);
            rightModel.parameters(rightParameters);
            auto predict = [&](std::size_t j) {
                double time{startTime + static_cast<double>(j)};
                return time < splitTime ? TRegression::predict(leftParameters, time)
                                        : TRegression::predict(rightParameters, time);
            };

            reweighted.assign(begin, end);
            CSignal::reweightOutliers(predict, outlierFraction, reweighted);
        }

        ITR split{minResidualVariance[0].second};
        std::size_t splitOffset(std::distance(reweighted.cbegin(), split));
        double splitTime{static_cast<double>(splitOffset)};
        LOG_TRACE(<< "  candidate = " << offset + splitOffset
                  << ", variance = " << minResidualVariance[0].first);

        TMeanVarAccumulator leftResidualMoments{
            centredResidualMoments(reweighted.cbegin(), split, startTime)};
        TMeanVarAccumulator rightResidualMoments{
            centredResidualMoments(split, reweighted.cend(), splitTime)};
        TMeanAccumulator meanAbsValues{std::accumulate(
            begin, end, TMeanAccumulator{}, [](auto partialMean, const auto& value) {
                partialMean.add(std::fabs(common::CBasicStatistics::mean(value)),
                                common::CBasicStatistics::count(value));
                return partialMean;
            })};
        LOG_TRACE(<< "  total moments = " << residualMoments
                  << ", left moments = " << leftResidualMoments << ", right moments = "
                  << rightResidualMoments << ", mean abs = " << meanAbsValues);

        double v[]{common::CBasicStatistics::maximumLikelihoodVariance(residualMoments),
                   common::CBasicStatistics::maximumLikelihoodVariance(
                       leftResidualMoments + rightResidualMoments) +
                       common::MINIMUM_COEFFICIENT_OF_VARIATION *
                           common::CBasicStatistics::mean(meanAbsValues)};
        double df[]{4.0 - 2.0, static_cast<double>(range - 4)};
        double pValue{common::CTools::oneMinusPowOneMinusX(
            common::CStatisticalTests::rightTailFTest(std::max(v[0] - v[1], 0.0),
                                                      v[1], df[0], df[1]),
            static_cast<double>(reweighted.size() / initialStep))};
        LOG_TRACE(<< "  p-value = " << pValue);

        if (pValue < pValueToSegment) {
            fitTopDownPiecewiseLinear(begin, begin + splitOffset, depth + 1, offset,
                                      maxDepth, pValueToSegment, outlierFraction,
                                      segmentation, depthAndPValue, reweighted);
            fitTopDownPiecewiseLinear(begin + splitOffset, end, depth + 1, offset + splitOffset,
                                      maxDepth, pValueToSegment, outlierFraction,
                                      segmentation, depthAndPValue, reweighted);
            segmentation.push_back(offset + splitOffset);
            depthAndPValue.emplace_back(static_cast<double>(depth), pValue);
        }
    }
}