in lib/maths/time_series/CTimeSeriesSegmentation.cc [319:447]
void CTimeSeriesSegmentation::fitTopDownPiecewiseLinear(ITR begin,
ITR end,
std::size_t depth,
std::size_t offset,
std::size_t maxDepth,
double pValueToSegment,
double outlierFraction,
TSizeVec& segmentation,
TDoubleDoublePrVec& depthAndPValue,
TFloatMeanAccumulatorVec& reweighted) {
// We want to find the partition of [begin, end) into linear models which
// efficiently predicts the values. To this end we minimize residual variance
// whilst maintaining a parsimonious model. We achieve the later objective by
// doing model selection for each candidate segment using a significance test
// of the explained variance.
auto range = std::distance(begin, end);
std::ptrdiff_t initialStep{4};
if (range < 2 * initialStep || depth > maxDepth) {
return;
}
LOG_TRACE(<< "Considering splitting [" << offset << "," << offset + range << ")");
double startTime{static_cast<double>(offset)};
TMeanVarAccumulator residualMoments{[&] {
reweighted.assign(begin, end);
TRegression model{fitLinearModel(reweighted.cbegin(), reweighted.cend(), startTime)};
auto predict = [&](std::size_t i) {
double time{startTime + static_cast<double>(i)};
return model.predict(time);
};
CSignal::reweightOutliers(predict, outlierFraction, reweighted);
return centredResidualMoments(reweighted.cbegin(), reweighted.cend(), startTime);
}()};
if (common::CBasicStatistics::variance(residualMoments) > 0.0) {
// We iterate through every possible binary split of the data into
// contiguous ranges looking for the split which minimizes the total
// variance of the values minus linear model predictions. Outliers
// are removed based on an initial best split.
TMinAccumulator<ITR> minResidualVariance;
reweighted.assign(begin, end);
for (auto reweights : {0, 1}) {
LOG_TRACE(<< " reweights = " << reweights);
minResidualVariance = minimumResidualVarianceSplit(
reweighted.cbegin(), reweighted.cend(), initialStep, startTime, reweighted);
ITR split{minResidualVariance[0].second};
minResidualVariance = minimumResidualVarianceSplit(
split - std::min(split - reweighted.cbegin(), initialStep),
split + std::min(reweighted.cend() - split, initialStep),
1 /*unit step*/, startTime, reweighted);
if (outlierFraction < 0.0 || outlierFraction >= 1.0) {
LOG_ERROR(<< "Ignoring bad value for outlier fraction " << outlierFraction
<< ". This must be in the range [0,1).");
break;
}
if (reweights == 1 || outlierFraction == 0.0) {
break;
}
split = minResidualVariance[0].second;
double splitTime{static_cast<double>(std::distance(reweighted.cbegin(), split))};
TRegression leftModel{fitLinearModel(reweighted.cbegin(), split, startTime)};
TRegression rightModel{fitLinearModel(split, reweighted.cend(), splitTime)};
TRegressionArray leftParameters;
TRegressionArray rightParameters;
leftModel.parameters(leftParameters);
rightModel.parameters(rightParameters);
auto predict = [&](std::size_t j) {
double time{startTime + static_cast<double>(j)};
return time < splitTime ? TRegression::predict(leftParameters, time)
: TRegression::predict(rightParameters, time);
};
reweighted.assign(begin, end);
CSignal::reweightOutliers(predict, outlierFraction, reweighted);
}
ITR split{minResidualVariance[0].second};
std::size_t splitOffset(std::distance(reweighted.cbegin(), split));
double splitTime{static_cast<double>(splitOffset)};
LOG_TRACE(<< " candidate = " << offset + splitOffset
<< ", variance = " << minResidualVariance[0].first);
TMeanVarAccumulator leftResidualMoments{
centredResidualMoments(reweighted.cbegin(), split, startTime)};
TMeanVarAccumulator rightResidualMoments{
centredResidualMoments(split, reweighted.cend(), splitTime)};
TMeanAccumulator meanAbsValues{std::accumulate(
begin, end, TMeanAccumulator{}, [](auto partialMean, const auto& value) {
partialMean.add(std::fabs(common::CBasicStatistics::mean(value)),
common::CBasicStatistics::count(value));
return partialMean;
})};
LOG_TRACE(<< " total moments = " << residualMoments
<< ", left moments = " << leftResidualMoments << ", right moments = "
<< rightResidualMoments << ", mean abs = " << meanAbsValues);
double v[]{common::CBasicStatistics::maximumLikelihoodVariance(residualMoments),
common::CBasicStatistics::maximumLikelihoodVariance(
leftResidualMoments + rightResidualMoments) +
common::MINIMUM_COEFFICIENT_OF_VARIATION *
common::CBasicStatistics::mean(meanAbsValues)};
double df[]{4.0 - 2.0, static_cast<double>(range - 4)};
double pValue{common::CTools::oneMinusPowOneMinusX(
common::CStatisticalTests::rightTailFTest(std::max(v[0] - v[1], 0.0),
v[1], df[0], df[1]),
static_cast<double>(reweighted.size() / initialStep))};
LOG_TRACE(<< " p-value = " << pValue);
if (pValue < pValueToSegment) {
fitTopDownPiecewiseLinear(begin, begin + splitOffset, depth + 1, offset,
maxDepth, pValueToSegment, outlierFraction,
segmentation, depthAndPValue, reweighted);
fitTopDownPiecewiseLinear(begin + splitOffset, end, depth + 1, offset + splitOffset,
maxDepth, pValueToSegment, outlierFraction,
segmentation, depthAndPValue, reweighted);
segmentation.push_back(offset + splitOffset);
depthAndPValue.emplace_back(static_cast<double>(depth), pValue);
}
}
}