lib/maths/common/CXMeansOnline1d.cc
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <maths/common/CXMeansOnline1d.h>
#include <core/CLogger.h>
#include <core/CMemoryDef.h>
#include <core/CSmallVector.h>
#include <core/CStatePersistInserter.h>
#include <core/CStateRestoreTraverser.h>
#include <core/Constants.h>
#include <core/RestoreMacros.h>
#include <maths/common/CBasicStatistics.h>
#include <maths/common/CBasicStatisticsPersist.h>
#include <maths/common/CChecksum.h>
#include <maths/common/CNormalMeanPrecConjugate.h>
#include <maths/common/CPrior.h>
#include <maths/common/CRestoreParams.h>
#include <maths/common/CTools.h>
#include <maths/common/Constants.h>
#include <maths/common/MathsTypes.h>
#include <boost/math/constants/constants.hpp>
#include <boost/math/distributions/normal.hpp>
#include <boost/math/special_functions/digamma.hpp>
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <numeric>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace ml {
namespace maths {
namespace common {
namespace {
using TDouble1Vec = core::CSmallVector<double, 1>;
using TDoubleDoublePr = std::pair<double, double>;
using TSizeVec = std::vector<std::size_t>;
using TTuple = CNaturalBreaksClassifier::TTuple;
using TTupleVec = CNaturalBreaksClassifier::TTupleVec;
namespace detail {
using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
//! \brief Orders two clusters by their centres.
struct SClusterCentreLess {
bool operator()(const CXMeansOnline1d::CCluster& lhs,
const CXMeansOnline1d::CCluster& rhs) const {
return lhs.centre() < rhs.centre();
}
bool operator()(double lhs, const CXMeansOnline1d::CCluster& rhs) const {
return lhs < rhs.centre();
}
bool operator()(const CXMeansOnline1d::CCluster& lhs, double rhs) const {
return lhs.centre() < rhs;
}
};
//! Get the minimum of \p x, \p y and \p z.
double min(double x, double y, double z) {
return std::min(std::min(x, y), z);
}
//! Get the log of the likelihood that \p point is from the \p normal.
maths_t::EFloatingPointErrorStatus logLikelihoodFromCluster(double point,
const CNormalMeanPrecConjugate& normal,
double probability,
double& result) {
result = core::constants::LOG_MIN_DOUBLE - 1.0;
double likelihood;
maths_t::EFloatingPointErrorStatus status = normal.jointLogMarginalLikelihood(
{point}, maths_t::CUnitWeights::SINGLE_UNIT, likelihood);
if (status & maths_t::E_FpFailed) {
LOG_ERROR(<< "Unable to compute likelihood for: " << point);
return status;
}
if (status & maths_t::E_FpOverflowed) {
result = likelihood;
return status;
}
result = likelihood + std::log(probability);
return status;
}
//! Get the moments of \p categories in the range [\p start, \p end)
//! and of its split into [\p start, \p split) and [\p split, \p end).
void candidates(const TTupleVec& categories,
std::size_t start,
std::size_t split,
std::size_t end,
TMeanVarAccumulator& mv,
TMeanVarAccumulator& mvl,
TMeanVarAccumulator& mvr) {
LOG_TRACE(<< "categories = "
<< core::CContainerPrinter::print(categories.begin() + start,
categories.begin() + end));
LOG_TRACE(<< "split at = " << split);
for (std::size_t i = start; i < split; ++i) {
mv += categories[i];
mvl += categories[i];
}
for (std::size_t i = split; i < end; ++i) {
mv += categories[i];
mvr += categories[i];
}
LOG_TRACE(<< "mv = " << mv << ", mvl = " << mvl << ", mvr = " << mvr);
}
//! Compute the mean of \p category.
double mean(maths_t::EDataType dataType, const TTuple& category) {
double result = CBasicStatistics::mean(category);
switch (dataType) {
case maths_t::E_DiscreteData:
break;
case maths_t::E_IntegerData:
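// Integer valued data are treated as uniformly smeared over the unit
// interval [x, x + 1), which shifts the mean by 1/2.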
result += 0.5;
break;
case maths_t::E_ContinuousData:
break;
case maths_t::E_MixedData:
break;
}
return result;
}
//! Compute the variance of \p category.
double variance(maths_t::EDataType dataType, const TTuple& category) {
double n = CBasicStatistics::count(category);
double result = (1.0 + 1.0 / n) * CBasicStatistics::maximumLikelihoodVariance(category);
switch (dataType) {
case maths_t::E_DiscreteData:
break;
case maths_t::E_IntegerData:
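// The same unit interval smearing adds the variance of U(0, 1), i.e. 1/12.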
result += 1.0 / 12.0;
break;
case maths_t::E_ContinuousData:
break;
case maths_t::E_MixedData:
break;
}
return result;
}
//! Computes the gain in the Bayes Information Criterion (BIC) of
//! splitting versus not splitting.
//!
//! This considers splitting the data at \p split such that the
//! first cluster is [\p start, \p split) and the second is
//! [\p split, \p end) and computes the information gain
//! (strictly \f$\max(-2\log(L(\{x_i\}|M_1)) + 2\log(L(\{x_i\}|M_2)) + (n_1 - n_2) \log(n), 0)\f$
//! where \f$M_1\f$ is the one cluster model, \f$M_2\f$ is the
//! two cluster model, \f$n_1\f$ is the number of parameters in
//! the one cluster model, \f$n_2\f$ is the number of parameters
//! in the two cluster model and \f$n\f$ is the total number of
//! points). Various one and two cluster models are considered.
void BICGain(maths_t::EDataType dataType,
CAvailableModeDistributions distributions,
double smallest,
const TTupleVec& categories,
std::size_t start,
std::size_t split,
std::size_t end,
double& distance,
double& nl,
double& nr) {
// The basic idea is to compute the difference between the
// Bayes Information Criterion (BIC) for one and two clusters
// for the sketch defined by the categories passed to this
// function. The exact form of BIC for the mixture (i.e. the
// two cluster case) depends on what one assumes about
// the model. For example, if one assumes that the categories
// are labeled with their correct cluster then their likelihood
// is their likelihood of coming from the corresponding cluster
// multiplied by the cluster weight. Alternatively, if one
// assumes that they have equal prior probability of coming
// from either cluster, their likelihood is the weighted sum
// of their likelihoods for both clusters. In this second case
// the exact BIC (even modeling the modes as normals) can't be
// computed in terms of the statistics we maintain in the sketch.
// We could for example compute the expected BIC over a mixture
// model of the sketch. However, we will assume the former case
// (as they do in x-means). In this case, the BIC for the single
// cluster and normal distribution is
//
// \sum_i{ ni * (log(2 * pi * v) + (vi + (mi - m)^2) / v) } + 2.0 * log(n)
//
// where ni is the count, mi is the mean and vi is the variance
// of the i'th category, and n, m and v are the overall count
// mean and variance. For the two cluster normal distribution
// case it is
//
// \sum_{i in l}{ ni * (log(2 * pi * vl / wl^2) + (vi + (mi - ml)^2) / vl) }
// + \sum_{i in r}{ ni * (log(2 * pi * vr / wr^2) + (vi + (mi - mr)^2) / vr) }
// + 5.0 * log(n)
//
// where wl, ml and vl are the weight, mean and variance of one
// cluster and wr, mr and vr are the weight, mean and variance
// of the other cluster.
//
// We also consider the log-normal and gamma distributions and
// the corresponding two cluster mixtures. We don't maintain the
// sufficient statistics needed to estimate the exact BIC for
// these distributions, so instead we compute the best
// approximation available from the statistics we do have.
// Specifically, we estimate the distribution parameters using
// the method of moments rather than maximum likelihood and
// compute the BIC using Taylor expansions of log(xi) and
// log(xi)^2. (An illustrative sketch of the normal-only gain
// follows this function.)
static const double MIN_RELATIVE_VARIANCE = 1e-10;
static const double MIN_ABSOLUTE_VARIANCE = 1.0;
TMeanVarAccumulator mv;
TMeanVarAccumulator mvl;
TMeanVarAccumulator mvr;
candidates(categories, start, split, end, mv, mvl, mvr);
double logNormalOffset = std::max(0.0, LOG_NORMAL_OFFSET_MARGIN - smallest);
double gammaOffset = std::max(0.0, GAMMA_OFFSET_MARGIN - smallest);
for (std::size_t i = start; i < end; ++i) {
double x = mean(dataType, categories[i]);
logNormalOffset = std::max(logNormalOffset, LOG_NORMAL_OFFSET_MARGIN - x);
gammaOffset = std::max(gammaOffset, GAMMA_OFFSET_MARGIN - x);
}
LOG_TRACE(<< "offsets = [" << gammaOffset << "," << logNormalOffset << "]");
distance = 0.0;
nl = CBasicStatistics::count(mvl);
nr = CBasicStatistics::count(mvr);
// Compute the BIC gain for splitting the mode.
double ll1n = 0.0;
double ll1l = 0.0;
double ll1g = 0.0;
double ll2nl = 0.0;
double ll2ll = 0.0;
double ll2gl = 0.0;
double ll2nr = 0.0;
double ll2lr = 0.0;
double ll2gr = 0.0;
// Normal
double n = CBasicStatistics::count(mv);
double m = mean(dataType, mv);
double v = variance(dataType, mv);
if (v <= MINIMUM_COEFFICIENT_OF_VARIATION * std::fabs(m)) {
return;
}
// Log-normal (method of moments)
double s = std::log(1.0 + v / CTools::pow2(m + logNormalOffset));
double l = std::log(m + logNormalOffset) - s / 2.0;
// Gamma (method of moments)
double a = CTools::pow2(m + gammaOffset) / v;
double b = (m + gammaOffset) / v;
double smin = std::max(logNormalOffset, gammaOffset);
double vmin = std::min(MIN_RELATIVE_VARIANCE * std::max(v, CTools::pow2(smin)),
MIN_ABSOLUTE_VARIANCE);
// Mixture of normals
double wl = CBasicStatistics::count(mvl) / n;
double ml = mean(dataType, mvl);
double vl = std::max(variance(dataType, mvl), vmin);
double wr = CBasicStatistics::count(mvr) / n;
double mr = mean(dataType, mvr);
double vr = std::max(variance(dataType, mvr), vmin);
bool haveGamma = distributions.haveGamma();
try {
// Mixture of log-normals (method of moments)
double sl = std::log(1.0 + vl / CTools::pow2(ml + logNormalOffset));
double ll = std::log(ml + logNormalOffset) - sl / 2.0;
double sr = std::log(1.0 + vr / CTools::pow2(mr + logNormalOffset));
double lr = std::log(mr + logNormalOffset) - sr / 2.0;
// Mixture of gammas (method of moments)
double al = CTools::pow2(ml + gammaOffset) / vl;
double bl = (ml + gammaOffset) / vl;
double ar = CTools::pow2(mr + gammaOffset) / vr;
double br = (mr + gammaOffset) / vr;
double log2piv = std::log(boost::math::double_constants::two_pi * v);
double log2pis = std::log(boost::math::double_constants::two_pi * s);
double loggn = 0.0;
double loggnl = 0.0;
double loggnr = 0.0;
if (haveGamma && CTools::lgamma(a, loggn, true) &&
CTools::lgamma(al, loggnl, true) && CTools::lgamma(ar, loggnr, true)) {
loggn -= a * std::log(b);
loggnl -= al * std::log(bl) + std::log(wl);
loggnr -= ar * std::log(br) + std::log(wr);
} else {
haveGamma = false;
}
double log2pivl =
std::log(boost::math::double_constants::two_pi * vl / CTools::pow2(wl));
double log2pivr =
std::log(boost::math::double_constants::two_pi * vr / CTools::pow2(wr));
double log2pisl =
std::log(boost::math::double_constants::two_pi * sl / CTools::pow2(wl));
double log2pisr =
std::log(boost::math::double_constants::two_pi * sr / CTools::pow2(wr));
for (std::size_t i = start; i < split; ++i) {
double ni = CBasicStatistics::count(categories[i]);
double mi = mean(dataType, categories[i]);
double vi = variance(dataType, categories[i]);
if (vi == 0.0) {
double li = std::log(mi + logNormalOffset);
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl);
ll2ll += ni * (CTools::pow2(li - ll) / sl + 2.0 * li + log2pisl);
ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl);
} else {
double si = std::log(1.0 + vi / CTools::pow2(mi + logNormalOffset));
double li = std::log(mi + logNormalOffset) - si / 2.0;
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl);
ll2ll += ni * ((si + CTools::pow2(li - ll)) / sl + 2.0 * li + log2pisl);
ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl);
}
}
for (std::size_t i = split; i < end; ++i) {
double ni = CBasicStatistics::count(categories[i]);
double mi = mean(dataType, categories[i]);
double vi = variance(dataType, categories[i]);
if (vi == 0.0) {
double li = std::log(mi + logNormalOffset);
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr);
ll2lr += ni * (CTools::pow2(li - lr) / sr + 2.0 * li + log2pisr);
ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr);
} else {
double si = std::log(1.0 + vi / CTools::pow2(mi + logNormalOffset));
double li = std::log(mi + logNormalOffset) - si / 2.0;
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr);
ll2lr += ni * ((si + CTools::pow2(li - lr)) / sr + 2.0 * li + log2pisr);
ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr);
}
}
} catch (const std::exception& e) {
LOG_ERROR(<< "Failed to compute BIC gain: " << e.what() << ", n = " << n
<< ", m = " << m << ", v = " << v << ", wl = " << wl
<< ", ml = " << ml << ", vl = " << vl << ", wr = " << wr
<< ", mr = " << mr << ", vr = " << vr);
return;
}
double logn = std::log(n);
double ll1 =
min(distributions.haveNormal() ? ll1n : std::numeric_limits<double>::max(),
distributions.haveLogNormal() ? ll1l : std::numeric_limits<double>::max(),
haveGamma ? ll1g : std::numeric_limits<double>::max()) +
distributions.parameters() * logn;
double ll2 =
min(distributions.haveNormal() ? ll2nl : std::numeric_limits<double>::max(),
distributions.haveLogNormal() ? ll2ll : std::numeric_limits<double>::max(),
haveGamma ? ll2gl : std::numeric_limits<double>::max()) +
min(distributions.haveNormal() ? ll2nr : std::numeric_limits<double>::max(),
distributions.haveLogNormal() ? ll2lr : std::numeric_limits<double>::max(),
haveGamma ? ll2gr : std::numeric_limits<double>::max()) +
(2.0 * distributions.parameters() + 1.0) * logn;
LOG_TRACE(<< "BIC(1) = " << ll1 << ", BIC(2) = " << ll2);
distance = std::max(ll1 - ll2, 0.0);
}
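// For illustration only: a minimal sketch, under the assumption of the
// normal-only case with no variance floor or data type corrections, of the
// BIC gain accumulated by BICGain above from per-cluster sufficient
// statistics. The names SStats and bicGainNormal are hypothetical and are
// not used elsewhere in this file.
//
//     #include <algorithm>
//     #include <cmath>
//
//     // Sufficient statistics of one candidate cluster: count, mean and
//     // maximum likelihood variance.
//     struct SStats { double n, m, v; };
//
//     // max(BIC(1) - BIC(2), 0) for one normal versus a two normal mixture,
//     // i.e. two parameters for the single normal and five (two means, two
//     // variances and one mixing weight) for the mixture.
//     double bicGainNormal(const SStats& l, const SStats& r) {
//         double n = l.n + r.n;
//         double m = (l.n * l.m + r.n * r.m) / n;
//         // Pooled maximum likelihood variance of the one cluster model.
//         double v = (l.n * (l.v + (l.m - m) * (l.m - m)) +
//                     r.n * (r.v + (r.m - m) * (r.m - m))) / n;
//         double wl = l.n / n;
//         double wr = r.n / n;
//         double twoPi = 6.283185307179586;
//         double ll1 = n * (std::log(twoPi * v) + 1.0) + 2.0 * std::log(n);
//         double ll2 = l.n * (std::log(twoPi * l.v / (wl * wl)) + 1.0) +
//                      r.n * (std::log(twoPi * r.v / (wr * wr)) + 1.0) +
//                      5.0 * std::log(n);
//         return std::max(ll1 - ll2, 0.0);
//     }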
//! Update the mean and variance of \p category to represent
//! truncating the values to the interval \p interval. This
//! is done by Winsorization, i.e. rather than discarding values
//! outside the interval we restrict them to the closest interval
//! end point. To approximate the effect on the category mean and
//! variance we assume that the underlying distribution is normal
//! and calculate the restricted mean as:
//! <pre class="fragment">
//! \f$m_{a,b} = a F(a) + \int_a^b{x f(x)}dx + b (1 - F(b))\f$
//! </pre>
//! and the variance as:
//! <pre class="fragment">
//! \f$v_{a,b} = (a-m_{a,b})^2 F(a) + \int_a^b{(x-m_{a,b})^2 f(x)}dx + (b-m_{a,b})^2 (1 - F(b))\f$
//! </pre>
//!
//! \param[in] interval The Winsorization interval.
//! \param[in,out] category The category to Winsorise.
void winsorise(const TDoubleDoublePr& interval, TTuple& category) {
if (CBasicStatistics::maximumLikelihoodVariance(category) < 0.0) {
CBasicStatistics::moment<1>(category) = 0.0;
}
double a = interval.first;
double b = interval.second;
double m = CBasicStatistics::mean(category);
double sigma = std::sqrt(CBasicStatistics::maximumLikelihoodVariance(category));
double t = 3.0 * sigma;
double xa = m - a;
double xb = b - m;
if (sigma == 0.0 || (xa > t && xb > t)) {
return;
}
try {
boost::math::normal normal(m, sigma);
double pa = xa > t ? 0.0 : CTools::safeCdf(normal, a);
double pb = xb > t ? 0.0 : CTools::safeCdfComplement(normal, b);
xa /= sigma;
xb /= sigma;
double ea = xa > t ? 0.0 : std::exp(-xa * xa / 2.0);
double eb = xb > t ? 0.0 : std::exp(-xb * xb / 2.0);
double km = sigma / boost::math::double_constants::root_two_pi * (ea - eb);
double kv = -sigma * sigma / boost::math::double_constants::root_two_pi *
(xa * ea + xb * eb);
double wm = pa * a + pb * b + m * (1.0 - pb - pa) + km;
xa = a - wm;
xb = b - wm;
double xm = wm - m;
double wv = xa * xa * pa + xb * xb * pb +
(sigma * sigma + xm * xm) * (1.0 - pb - pa) + 2.0 * xm * km + kv;
double n = CBasicStatistics::count(category);
CBasicStatistics::moment<0>(category) = wm;
CBasicStatistics::moment<1>(category) = std::max((n - 1.0) / n * wv, 0.0);
} catch (const std::exception& e) {
LOG_ERROR(<< "Bad category = " << category << ": " << e.what());
}
}
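// For illustration only: a standalone sketch, assuming an underlying normal
// distribution, of the Winsorised mean formula quoted in the doc comment
// above. The name winsorisedMean is hypothetical and is not used elsewhere
// in this file.
//
//     #include <cmath>
//     #include <boost/math/constants/constants.hpp>
//     #include <boost/math/distributions/normal.hpp>
//
//     double winsorisedMean(double m, double sigma, double a, double b) {
//         boost::math::normal normal(m, sigma);
//         double pa = boost::math::cdf(normal, a);                          // F(a)
//         double pb = boost::math::cdf(boost::math::complement(normal, b)); // 1 - F(b)
//         double za = (a - m) / sigma;
//         double zb = (b - m) / sigma;
//         // int_a^b x f(x) dx = m (F(b) - F(a)) + sigma (phi(za) - phi(zb))
//         // where phi(z) = exp(-z^2 / 2) / sqrt(2 pi).
//         double km = sigma / boost::math::double_constants::root_two_pi *
//                     (std::exp(-za * za / 2.0) - std::exp(-zb * zb / 2.0));
//         return a * pa + b * pb + m * (1.0 - pa - pb) + km;
//     }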
//! Search for a split of the data that satisfies the constraints
//! on both the BIC divergence and minimum count.
//!
//! In order to handle the constraint on minimum count, we do a
//! breadth first search of the binary tree of optimal 2-splits
//! of subsets of the data looking for splits which satisfy the
//! constraints on *both* BIC divergence and count. The search
//! terminates at any node which can't be split subject to BIC.
//!
//! The intention of this is to find "natural" splits of the data
//! which would be obscured when splitting into the optimal 2-split.
//! This can occur when a small number of points (less than minimum
//! count) are sufficiently far from the others that they split off
//! in preference to some other natural split of the data. Although
//! we can impose the count constraint when finding the optimal
// 2-split, this has associated problems. In particular, extreme
//! outliers then tend to rip sufficient points away from their
//! natural cluster in order to generate a new cluster.
bool splitSearch(double minimumCount,
double minimumDistance,
maths_t::EDataType dataType,
CAvailableModeDistributions distributions,
double smallest,
const TTupleVec& categories,
TSizeVec& result) {
using TSizeSizePr = std::pair<std::size_t, std::size_t>;
LOG_TRACE(<< "begin split search");
result.clear();
TSizeSizePr node(0, categories.size());
TTupleVec nodeCategories;
nodeCategories.reserve(categories.size());
TSizeVec candidate;
candidate.reserve(2);
// The search effectively visits a binary tree of possible
// 2-splits of the data, which is searched breadth first.
// If a suitable split is found on a level of the tree then
// the search terminates returning that split. Note that
// if a subset of the data can be split we also check that
// the corresponding full 2-split can be split subject to the
// same constraints (to avoid merging the split straight away).
for (;;) {
LOG_TRACE(<< "node = " << node);
LOG_TRACE(<< "categories = " << categories);
nodeCategories.assign(categories.begin() + node.first,
categories.begin() + node.second);
CNaturalBreaksClassifier::naturalBreaks(
nodeCategories, 2, 0, CNaturalBreaksClassifier::E_TargetDeviation, candidate);
LOG_TRACE(<< "candidate = " << candidate);
if (candidate.size() != 2) {
LOG_ERROR(<< "Expected 2-split: " << candidate);
break;
}
if (candidate[0] == 0 || candidate[0] == nodeCategories.size()) {
// This can happen if all the points are co-located,
// in which case we can't split this node anyway.
break;
}
candidate[0] += node.first;
candidate[1] += node.first;
double distance;
double nl;
double nr;
BICGain(dataType, distributions, smallest, categories, node.first,
candidate[0], node.second, distance, nl, nr);
// Check the count constraint.
bool satisfiesCount = (std::min(nl, nr) >= minimumCount);
LOG_TRACE(<< "count = " << std::min(nl, nr) << " (to split " << minimumCount << ")");
// Check the distance constraint.
bool satisfiesDistance = (distance > minimumDistance);
LOG_TRACE(<< "max(BIC(1) - BIC(2), 0) = " << distance << " (to split "
<< minimumDistance << ")");
if (satisfiesCount == false) {
// Recurse to the (one) node with sufficient count.
if (nl > minimumCount && candidate[0] - node.first > 1) {
node = {node.first, candidate[0]};
continue;
}
if (nr > minimumCount && node.second - candidate[0] > 1) {
node = {candidate[0], node.second};
continue;
}
} else if (satisfiesDistance) {
LOG_TRACE(<< "Checking full split");
BICGain(dataType, distributions, smallest, categories, 0,
candidate[0], categories.size(), distance, nl, nr);
LOG_TRACE(<< "max(BIC(1) - BIC(2), 0) = " << distance
<< " (to split " << minimumDistance << ")");
if (distance > minimumDistance) {
result.push_back(candidate[0]);
result.push_back(categories.size());
}
}
break;
}
LOG_TRACE(<< "end split search");
return !result.empty();
}
} // detail::
// The ratio of the smaller to the larger soft assignment weight below
// which we hard assign a point to the more likely cluster, i.e. one
// minus the smallest weight for which we hard assign.
const double HARD_ASSIGNMENT_THRESHOLD = 0.01;
// CXMeansOnline1d
const core::TPersistenceTag WEIGHT_CALC_TAG("a", "weight");
const core::TPersistenceTag MINIMUM_CLUSTER_FRACTION_TAG("b", "cluster_fraction");
const core::TPersistenceTag MINIMUM_CLUSTER_COUNT_TAG("c", "minimum_cluster_count");
const core::TPersistenceTag
WINSORIZATION_CONFIDENCE_INTERVAL_TAG("d", "winsorisation_confidence_interval");
const core::TPersistenceTag CLUSTER_INDEX_GENERATOR_TAG("e", "index_generator");
const core::TPersistenceTag CLUSTER_TAG("f", "cluster");
const core::TPersistenceTag AVAILABLE_DISTRIBUTIONS_TAG("g", "available_distributions");
const core::TPersistenceTag SMALLEST_TAG("h", "smallest");
const core::TPersistenceTag LARGEST_TAG("i", "largest");
const core::TPersistenceTag DECAY_RATE_TAG("j", "decay_rate");
const core::TPersistenceTag HISTORY_LENGTH_TAG("k", "history_length");
// CXMeansOnline1d::CCluster
const core::TPersistenceTag INDEX_TAG("a", "index");
const core::TPersistenceTag STRUCTURE_TAG("b", "structure");
const core::TPersistenceTag PRIOR_TAG("c", "prior");
const std::string EMPTY_STRING;
}
CAvailableModeDistributions::CAvailableModeDistributions(int value)
: m_Value(value) {
}
const CAvailableModeDistributions& CAvailableModeDistributions::
operator+(const CAvailableModeDistributions& rhs) {
m_Value = m_Value | rhs.m_Value;
return *this;
}
double CAvailableModeDistributions::parameters() const {
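// Each available distribution contributes two parameters (for example
// the mean and variance of the normal).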
return (this->haveNormal() ? 2.0 : 0.0) + (this->haveGamma() ? 2.0 : 0.0) +
(this->haveLogNormal() ? 2.0 : 0.0);
}
bool CAvailableModeDistributions::haveNormal() const {
return (m_Value & NORMAL) != 0;
}
bool CAvailableModeDistributions::haveGamma() const {
return (m_Value & GAMMA) != 0;
}
bool CAvailableModeDistributions::haveLogNormal() const {
return (m_Value & LOG_NORMAL) != 0;
}
std::string CAvailableModeDistributions::toString() const {
return core::CStringUtils::typeToString(m_Value);
}
bool CAvailableModeDistributions::fromString(const std::string& value) {
return core::CStringUtils::stringToType(value, m_Value);
}
CXMeansOnline1d::CXMeansOnline1d(maths_t::EDataType dataType,
CAvailableModeDistributions availableDistributions,
maths_t::EClusterWeightCalc weightCalc,
double decayRate,
double minimumClusterFraction,
double minimumClusterCount,
double minimumCategoryCount,
double winsorizationConfidenceInterval,
const TSplitFunc& splitFunc,
const TMergeFunc& mergeFunc)
: CClusterer1d(splitFunc, mergeFunc), m_DataType(dataType),
m_WeightCalc(weightCalc), m_AvailableDistributions(availableDistributions),
m_InitialDecayRate(decayRate), m_DecayRate(decayRate),
m_HistoryLength(0.0), m_MinimumClusterFraction(minimumClusterFraction),
m_MinimumClusterCount(minimumClusterCount),
m_MinimumCategoryCount(minimumCategoryCount),
m_WinsorizationConfidenceInterval(winsorizationConfidenceInterval),
m_Clusters(1, CCluster(*this)) {
}
CXMeansOnline1d::CXMeansOnline1d(const SDistributionRestoreParams& params,
core::CStateRestoreTraverser& traverser)
: CClusterer1d(CDoNothing(), CDoNothing()), m_DataType(params.s_DataType),
m_WeightCalc(maths_t::E_ClustersEqualWeight),
m_AvailableDistributions(CAvailableModeDistributions::ALL),
m_InitialDecayRate(params.s_DecayRate), m_DecayRate(params.s_DecayRate),
m_MinimumCategoryCount(params.s_MinimumCategoryCount) {
if (traverser.traverseSubLevel([&](auto& traverser_) {
return this->acceptRestoreTraverser(params, traverser_);
}) == false) {
traverser.setBadState();
}
}
CXMeansOnline1d::CXMeansOnline1d(const SDistributionRestoreParams& params,
const TSplitFunc& splitFunc,
const TMergeFunc& mergeFunc,
core::CStateRestoreTraverser& traverser)
: CClusterer1d(splitFunc, mergeFunc), m_DataType(params.s_DataType),
m_WeightCalc(maths_t::E_ClustersEqualWeight),
m_AvailableDistributions(CAvailableModeDistributions::ALL),
m_InitialDecayRate(params.s_DecayRate), m_DecayRate(params.s_DecayRate),
m_HistoryLength(), m_MinimumClusterFraction(), m_MinimumClusterCount(),
m_MinimumCategoryCount(params.s_MinimumCategoryCount) {
if (traverser.traverseSubLevel([&](auto& traverser_) {
return this->acceptRestoreTraverser(params, traverser_);
}) == false) {
traverser.setBadState();
}
}
CXMeansOnline1d::CXMeansOnline1d(const CXMeansOnline1d& other)
: CClusterer1d(other.splitFunc(), other.mergeFunc()),
m_DataType(other.m_DataType), m_WeightCalc(other.m_WeightCalc),
m_AvailableDistributions(other.m_AvailableDistributions),
m_InitialDecayRate(other.m_InitialDecayRate),
m_DecayRate(other.m_DecayRate), m_HistoryLength(other.m_HistoryLength),
m_MinimumClusterFraction(other.m_MinimumClusterFraction),
m_MinimumClusterCount(other.m_MinimumClusterCount),
m_MinimumCategoryCount(other.m_MinimumCategoryCount),
m_WinsorizationConfidenceInterval(other.m_WinsorizationConfidenceInterval),
m_ClusterIndexGenerator(other.m_ClusterIndexGenerator.deepCopy()),
m_Smallest(other.m_Smallest), m_Largest(other.m_Largest),
m_Clusters(other.m_Clusters) {
}
CXMeansOnline1d& CXMeansOnline1d::operator=(const CXMeansOnline1d& other) {
if (this != &other) {
CXMeansOnline1d tmp(other);
this->swap(tmp);
}
return *this;
}
void CXMeansOnline1d::swap(CXMeansOnline1d& other) {
this->CClusterer1d::swap(other);
std::swap(m_DataType, other.m_DataType);
std::swap(m_WeightCalc, other.m_WeightCalc);
std::swap(m_InitialDecayRate, other.m_InitialDecayRate);
std::swap(m_DecayRate, other.m_DecayRate);
std::swap(m_HistoryLength, other.m_HistoryLength);
std::swap(m_MinimumClusterFraction, other.m_MinimumClusterFraction);
std::swap(m_MinimumClusterCount, other.m_MinimumClusterCount);
std::swap(m_MinimumCategoryCount, other.m_MinimumCategoryCount);
std::swap(m_WinsorizationConfidenceInterval, other.m_WinsorizationConfidenceInterval);
std::swap(m_AvailableDistributions, other.m_AvailableDistributions);
std::swap(m_ClusterIndexGenerator, other.m_ClusterIndexGenerator);
std::swap(m_Smallest, other.m_Smallest);
std::swap(m_Largest, other.m_Largest);
m_Clusters.swap(other.m_Clusters);
}
const core::TPersistenceTag& CXMeansOnline1d::persistenceTag() const {
return X_MEANS_ONLINE_1D_TAG;
}
void CXMeansOnline1d::acceptPersistInserter(core::CStatePersistInserter& inserter) const {
for (const auto& cluster : m_Clusters) {
inserter.insertLevel(CLUSTER_TAG, [&cluster](auto& inserter_) {
cluster.acceptPersistInserter(inserter_);
});
}
inserter.insertValue(AVAILABLE_DISTRIBUTIONS_TAG, m_AvailableDistributions.toString());
inserter.insertValue(DECAY_RATE_TAG, m_DecayRate, core::CIEEE754::E_SinglePrecision);
inserter.insertValue(HISTORY_LENGTH_TAG, m_HistoryLength, core::CIEEE754::E_SinglePrecision);
inserter.insertValue(SMALLEST_TAG, m_Smallest.toDelimited());
inserter.insertValue(LARGEST_TAG, m_Largest.toDelimited());
inserter.insertValue(WEIGHT_CALC_TAG, static_cast<int>(m_WeightCalc));
inserter.insertValue(MINIMUM_CLUSTER_FRACTION_TAG, m_MinimumClusterFraction.toString());
inserter.insertValue(MINIMUM_CLUSTER_COUNT_TAG, m_MinimumClusterCount.toString());
inserter.insertValue(WINSORIZATION_CONFIDENCE_INTERVAL_TAG,
m_WinsorizationConfidenceInterval.toString());
inserter.insertLevel(CLUSTER_INDEX_GENERATOR_TAG, [this](auto& inserter_) {
m_ClusterIndexGenerator.acceptPersistInserter(inserter_);
});
}
CXMeansOnline1d* CXMeansOnline1d::clone() const {
return new CXMeansOnline1d(*this);
}
void CXMeansOnline1d::clear() {
*this = CXMeansOnline1d(
m_DataType, m_AvailableDistributions, m_WeightCalc, m_InitialDecayRate,
m_MinimumClusterFraction, m_MinimumClusterCount, m_MinimumCategoryCount,
m_WinsorizationConfidenceInterval, this->splitFunc(), this->mergeFunc());
}
std::size_t CXMeansOnline1d::numberClusters() const {
return m_Clusters.size();
}
void CXMeansOnline1d::dataType(maths_t::EDataType dataType) {
m_DataType = dataType;
for (auto& cluster : m_Clusters) {
cluster.dataType(dataType);
}
}
void CXMeansOnline1d::decayRate(double decayRate) {
m_DecayRate = decayRate;
for (auto& cluster : m_Clusters) {
cluster.decayRate(decayRate);
}
}
bool CXMeansOnline1d::hasCluster(std::size_t index) const {
return this->cluster(index) != nullptr;
}
bool CXMeansOnline1d::clusterCentre(std::size_t index, double& result) const {
const CCluster* cluster = this->cluster(index);
if (cluster == nullptr) {
LOG_ERROR(<< "Cluster " << index << " doesn't exist");
return false;
}
result = cluster->centre();
return true;
}
bool CXMeansOnline1d::clusterSpread(std::size_t index, double& result) const {
const CCluster* cluster = this->cluster(index);
if (cluster == nullptr) {
LOG_ERROR(<< "Cluster " << index << " doesn't exist");
return false;
}
result = cluster->spread();
return true;
}
void CXMeansOnline1d::cluster(const double& point, TSizeDoublePr2Vec& result, double count) const {
result.clear();
if (m_Clusters.empty()) {
LOG_ERROR(<< "No clusters");
return;
}
auto rightCluster = std::lower_bound(m_Clusters.begin(), m_Clusters.end(),
point, detail::SClusterCentreLess());
if (rightCluster == m_Clusters.end()) {
--rightCluster;
result.emplace_back(rightCluster->index(), count);
} else if (rightCluster == m_Clusters.begin()) {
result.emplace_back(rightCluster->index(), count);
} else {
// This does a soft assignment. Given we are finding a
// partitioning clustering (as a result of targeting
// the k-means objective) we only consider the case that
// the point comes from either the left or right cluster.
// A priori the probability that a randomly selected point
// comes from a cluster is proportional to its weight:
// P(i) = n(i) / Sum_j{ n(j) }
//
// Bayes theorem then immediately gives that the probability
// that a given point is from the i'th cluster
// P(i | x) = L(x | i) * P(i) / Z
//
// where Z is the normalization constant:
// Z = Sum_i{ L(x | i) * P(i) }
//
// Below we work with log likelihoods so the normalization
// constant cancels on either side of the inequality. Note
// also that we do not want to soft assign the point to a
// cluster if its probability is close to zero.
auto leftCluster = rightCluster;
--leftCluster;
double likelihoodLeft = leftCluster->logLikelihoodFromCluster(m_WeightCalc, point);
double likelihoodRight = rightCluster->logLikelihoodFromCluster(m_WeightCalc, point);
double renormalizer = std::max(likelihoodLeft, likelihoodRight);
double pLeft = std::exp(likelihoodLeft - renormalizer);
double pRight = std::exp(likelihoodRight - renormalizer);
double normalizer = pLeft + pRight;
pLeft /= normalizer;
pRight /= normalizer;
if (pLeft < HARD_ASSIGNMENT_THRESHOLD * pRight) {
result.emplace_back(rightCluster->index(), count);
} else if (pRight < HARD_ASSIGNMENT_THRESHOLD * pLeft) {
result.emplace_back(leftCluster->index(), count);
} else {
result.emplace_back(leftCluster->index(), pLeft * count);
result.emplace_back(rightCluster->index(), pRight * count);
}
}
}
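// For illustration only: the soft assignment arithmetic used above, in
// isolation. Given the two cluster log likelihoods the posterior weights
// are computed with the common renormalizer subtracted first so that
// neither exponential underflows. The name softAssign is hypothetical.
//
//     #include <algorithm>
//     #include <cmath>
//     #include <utility>
//
//     std::pair<double, double> softAssign(double logLikelihoodLeft,
//                                          double logLikelihoodRight) {
//         double renormalizer = std::max(logLikelihoodLeft, logLikelihoodRight);
//         double pLeft = std::exp(logLikelihoodLeft - renormalizer);
//         double pRight = std::exp(logLikelihoodRight - renormalizer);
//         double Z = pLeft + pRight;
//         return {pLeft / Z, pRight / Z};
//     }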
void CXMeansOnline1d::add(const double& point, TSizeDoublePr2Vec& clusters, double count) {
m_Smallest.add(point);
m_Largest.add(point);
clusters.clear();
auto rightCluster = std::lower_bound(m_Clusters.begin(), m_Clusters.end(),
point, detail::SClusterCentreLess());
if (rightCluster == m_Clusters.end()) {
--rightCluster;
LOG_TRACE(<< "Adding " << point << " to " << rightCluster->centre());
rightCluster->add(point, count);
clusters.emplace_back(rightCluster->index(), count);
if (this->maybeSplit(rightCluster)) {
this->cluster(point, clusters, count);
} else if (rightCluster != m_Clusters.begin()) {
auto leftCluster = rightCluster;
--leftCluster;
if (this->maybeMerge(leftCluster, rightCluster)) {
this->cluster(point, clusters, count);
}
}
} else if (rightCluster == m_Clusters.begin()) {
LOG_TRACE(<< "Adding " << point << " to " << rightCluster->centre());
rightCluster->add(point, count);
clusters.emplace_back(rightCluster->index(), count);
if (this->maybeSplit(rightCluster)) {
this->cluster(point, clusters, count);
} else {
auto leftCluster = rightCluster;
++rightCluster;
if (this->maybeMerge(leftCluster, rightCluster)) {
this->cluster(point, clusters, count);
}
}
} else {
// See the cluster member function for more details on
// soft assignment.
auto leftCluster = rightCluster;
--leftCluster;
double likelihoodLeft = leftCluster->logLikelihoodFromCluster(m_WeightCalc, point);
double likelihoodRight = rightCluster->logLikelihoodFromCluster(m_WeightCalc, point);
// Normalize the likelihood values.
double renormalizer = std::max(likelihoodLeft, likelihoodRight);
double pLeft = std::exp(likelihoodLeft - renormalizer);
double pRight = std::exp(likelihoodRight - renormalizer);
double pLeftPlusRight = pLeft + pRight;
pLeft /= pLeftPlusRight;
pRight /= pLeftPlusRight;
if (pLeft < HARD_ASSIGNMENT_THRESHOLD * pRight) {
LOG_TRACE(<< "Adding " << point << " to " << rightCluster->centre());
rightCluster->add(point, count);
clusters.emplace_back(rightCluster->index(), count);
if (this->maybeSplit(rightCluster) || this->maybeMerge(leftCluster, rightCluster)) {
this->cluster(point, clusters, count);
}
} else if (pRight < HARD_ASSIGNMENT_THRESHOLD * pLeft) {
LOG_TRACE(<< "Adding " << point << " to " << leftCluster->centre());
leftCluster->add(point, count);
clusters.emplace_back(leftCluster->index(), count);
if (this->maybeSplit(leftCluster) || this->maybeMerge(leftCluster, rightCluster)) {
this->cluster(point, clusters, count);
}
} else {
// Get the weighted counts.
double countLeft = count * pLeft;
double countRight = count * pRight;
LOG_TRACE(<< "Soft adding " << point << " " << countLeft << " to "
<< leftCluster->centre() << " and " << countRight
<< " to " << rightCluster->centre());
leftCluster->add(point, countLeft);
rightCluster->add(point, countRight);
clusters.emplace_back(leftCluster->index(), countLeft);
clusters.emplace_back(rightCluster->index(), countRight);
if (this->maybeSplit(leftCluster) || this->maybeSplit(rightCluster) ||
this->maybeMerge(leftCluster, rightCluster)) {
this->cluster(point, clusters, count);
}
}
}
if (this->prune()) {
this->cluster(point, clusters, count);
}
}
void CXMeansOnline1d::add(const TDoubleDoublePrVec& points) {
if (m_Clusters.empty()) {
m_Clusters.emplace_back(*this);
}
TSizeDoublePr2Vec dummy;
for (const auto& point : points) {
this->add(point.first, dummy, point.second);
}
}
void CXMeansOnline1d::propagateForwardsByTime(double time) {
if (time < 0.0) {
LOG_ERROR(<< "Can't propagate backwards in time");
return;
}
m_HistoryLength = (m_HistoryLength + time) * std::exp(-m_DecayRate * time);
for (auto& cluster : m_Clusters) {
cluster.propagateForwardsByTime(time);
}
}
bool CXMeansOnline1d::sample(std::size_t index, std::size_t numberSamples, TDoubleVec& samples) const {
const CCluster* cluster = this->cluster(index);
if (cluster == nullptr) {
LOG_ERROR(<< "Cluster " << index << " doesn't exist");
return false;
}
cluster->sample(numberSamples, std::min(m_Smallest[0], 0.0), m_Largest[0], samples);
return true;
}
double CXMeansOnline1d::probability(std::size_t index) const {
double weight = 0.0;
double Z = 0.0;
for (const auto& cluster : m_Clusters) {
if (cluster.index() == index) {
weight = cluster.weight(maths_t::E_ClustersFractionWeight);
}
Z += cluster.weight(maths_t::E_ClustersFractionWeight);
}
return Z == 0.0 ? 0.0 : weight / Z;
}
void CXMeansOnline1d::debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const {
mem->setName("CXMeansOnline1d");
core::memory_debug::dynamicSize("m_ClusterIndexGenerator", m_ClusterIndexGenerator, mem);
core::memory_debug::dynamicSize("m_Clusters", m_Clusters, mem);
}
std::size_t CXMeansOnline1d::memoryUsage() const {
std::size_t mem = core::memory::dynamicSize(m_ClusterIndexGenerator);
mem += core::memory::dynamicSize(m_Clusters);
return mem;
}
std::size_t CXMeansOnline1d::staticSize() const {
return sizeof(*this);
}
std::uint64_t CXMeansOnline1d::checksum(std::uint64_t seed) const {
seed = CChecksum::calculate(seed, m_DataType);
seed = CChecksum::calculate(seed, m_DecayRate);
seed = CChecksum::calculate(seed, m_HistoryLength);
seed = CChecksum::calculate(seed, m_WeightCalc);
return CChecksum::calculate(seed, m_Clusters);
}
double CXMeansOnline1d::count() const {
return std::accumulate(m_Clusters.begin(), m_Clusters.end(), 0.0,
[](double count, const CCluster& cluster) {
return count + cluster.count();
});
}
const CXMeansOnline1d::TClusterVec& CXMeansOnline1d::clusters() const {
return m_Clusters;
}
std::string CXMeansOnline1d::printClusters() const {
if (m_Clusters.empty()) {
return std::string();
}
std::ostringstream result;
// We'll plot the marginal likelihood function over a range
// where most of the mass is, i.e. the 99.9% confidence interval.
static const double RANGE = 99.9;
static const unsigned int POINTS = 201;
TDoubleDoublePr range(std::numeric_limits<double>::max(),
std::numeric_limits<double>::lowest());
for (const auto& cluster : m_Clusters) {
const CPrior& prior = cluster.prior();
TDoubleDoublePr clusterRange = prior.marginalLikelihoodConfidenceInterval(RANGE);
range.first = std::min(range.first, clusterRange.first);
range.second = std::max(range.second, clusterRange.second);
}
double Z = 0.0;
for (const auto& cluster : m_Clusters) {
Z += cluster.weight(m_WeightCalc);
}
TDouble1Vec x{range.first};
double increment = (range.second - range.first) / (POINTS - 1.0);
std::ostringstream coordinatesStr;
std::ostringstream likelihoodStr;
coordinatesStr << "x = [";
likelihoodStr << "likelihood = [";
for (unsigned int i = 0; i < POINTS; ++i, x[0] += increment) {
double likelihood = 0.0;
for (const auto& cluster : m_Clusters) {
double logLikelihood;
const CPrior& prior = cluster.prior();
if ((prior.jointLogMarginalLikelihood(x, maths_t::CUnitWeights::SINGLE_UNIT, logLikelihood) &
(maths_t::E_FpFailed | maths_t::E_FpOverflowed)) == 0) {
likelihood += cluster.weight(m_WeightCalc) / Z * std::exp(logLikelihood);
}
}
coordinatesStr << x[0] << " ";
likelihoodStr << likelihood << " ";
}
coordinatesStr << "];" << core_t::LINE_ENDING;
likelihoodStr << "];" << core_t::LINE_ENDING << "plot(x, likelihood);";
return coordinatesStr.str() + likelihoodStr.str();
}
CXMeansOnline1d::CIndexGenerator& CXMeansOnline1d::indexGenerator() {
return m_ClusterIndexGenerator;
}
bool CXMeansOnline1d::acceptRestoreTraverser(const SDistributionRestoreParams& params,
core::CStateRestoreTraverser& traverser) {
do {
const std::string& name = traverser.name();
RESTORE_SETUP_TEARDOWN(CLUSTER_TAG, CCluster cluster(*this),
traverser.traverseSubLevel([&](auto& traverser_) {
return cluster.acceptRestoreTraverser(params, traverser_);
}),
m_Clusters.push_back(cluster))
RESTORE(AVAILABLE_DISTRIBUTIONS_TAG,
m_AvailableDistributions.fromString(traverser.value()))
RESTORE_SETUP_TEARDOWN(DECAY_RATE_TAG, double decayRate,
core::CStringUtils::stringToType(traverser.value(), decayRate),
this->decayRate(decayRate))
RESTORE(HISTORY_LENGTH_TAG, m_HistoryLength.fromString(traverser.value()))
RESTORE(SMALLEST_TAG, m_Smallest.fromDelimited(traverser.value()))
RESTORE(LARGEST_TAG, m_Largest.fromDelimited(traverser.value()))
RESTORE(CLUSTER_INDEX_GENERATOR_TAG, traverser.traverseSubLevel([this](auto& traverser_) {
return m_ClusterIndexGenerator.acceptRestoreTraverser(traverser_);
}))
RESTORE_ENUM(WEIGHT_CALC_TAG, m_WeightCalc, maths_t::EClusterWeightCalc)
RESTORE(MINIMUM_CLUSTER_FRACTION_TAG,
m_MinimumClusterFraction.fromString(traverser.value()))
RESTORE(MINIMUM_CLUSTER_COUNT_TAG,
m_MinimumClusterCount.fromString(traverser.value()))
RESTORE(WINSORIZATION_CONFIDENCE_INTERVAL_TAG,
m_WinsorizationConfidenceInterval.fromString(traverser.value()))
} while (traverser.next());
return true;
}
const CXMeansOnline1d::CCluster* CXMeansOnline1d::cluster(std::size_t index) const {
for (const auto& cluster : m_Clusters) {
if (cluster.index() == index) {
return &cluster;
}
}
return nullptr;
}
double CXMeansOnline1d::minimumSplitCount() const {
double result = m_MinimumClusterCount;
if (m_MinimumClusterFraction > 0.0) {
double count = this->count();
double scale = m_HistoryLength * (1.0 - std::exp(-m_InitialDecayRate));
count *= m_MinimumClusterFraction / std::max(scale, 1.0);
result = std::max(result, count);
}
LOG_TRACE(<< "minimumSplitCount = " << result);
return result;
}
bool CXMeansOnline1d::maybeSplit(TClusterVecItr cluster) {
if (cluster == m_Clusters.end()) {
return false;
}
TDoubleDoublePr interval = this->winsorizationInterval();
if (TOptionalClusterClusterPr split =
cluster->split(m_AvailableDistributions, this->minimumSplitCount(),
m_Smallest[0], interval, m_ClusterIndexGenerator)) {
LOG_TRACE(<< "Splitting cluster " << cluster->index() << " at "
<< cluster->centre());
std::size_t index = cluster->index();
*cluster = split->second;
m_Clusters.insert(cluster, split->first);
(this->splitFunc())(index, split->first.index(), split->second.index());
return true;
}
return false;
}
bool CXMeansOnline1d::maybeMerge(TClusterVecItr cluster1, TClusterVecItr cluster2) {
if (cluster1 == m_Clusters.end() || cluster2 == m_Clusters.end()) {
return false;
}
TDoubleDoublePr interval = this->winsorizationInterval();
if (cluster1->shouldMerge(*cluster2, m_AvailableDistributions, m_Smallest[0], interval)) {
LOG_TRACE(<< "Merging cluster " << cluster1->index() << " at "
<< cluster1->centre() << " and cluster " << cluster2->index()
<< " at " << cluster2->centre());
std::size_t index1 = cluster1->index();
std::size_t index2 = cluster2->index();
CCluster merged = cluster1->merge(*cluster2, m_ClusterIndexGenerator);
*cluster1 = merged;
m_Clusters.erase(cluster2);
(this->mergeFunc())(index1, index2, merged.index());
return true;
}
return false;
}
bool CXMeansOnline1d::prune() {
if (m_Clusters.size() <= 1) {
return false;
}
bool result = false;
double minimumCount = this->minimumSplitCount() * CLUSTER_DELETE_FRACTION;
for (std::size_t i = 1; i < m_Clusters.size(); /**/) {
CCluster& left = m_Clusters[i - 1];
CCluster& right = m_Clusters[i];
if (left.count() < minimumCount || right.count() < minimumCount) {
std::size_t leftIndex = left.index();
std::size_t rightIndex = right.index();
LOG_TRACE(<< "Merging cluster " << leftIndex << " at " << left.centre()
<< " and cluster " << rightIndex << " at " << right.centre());
CCluster merge = left.merge(right, m_ClusterIndexGenerator);
left = merge;
m_Clusters.erase(m_Clusters.begin() + i);
(this->mergeFunc())(leftIndex, rightIndex, merge.index());
result = true;
} else {
++i;
}
}
return result;
}
bool CXMeansOnline1d::remove(std::size_t index) {
if (m_Clusters.size() <= 1) {
return false;
}
auto i = std::find_if(m_Clusters.begin(), m_Clusters.end(), [&](const auto& cluster) {
return cluster.index() == index;
});
if (i == m_Clusters.begin()) {
++i;
}
if (i != m_Clusters.end()) {
CCluster& left = *(i - 1);
CCluster& right = *i;
std::size_t leftIndex = left.index();
std::size_t rightIndex = right.index();
LOG_TRACE(<< "Merging cluster " << leftIndex << " at " << left.centre()
<< " and cluster " << rightIndex << " at " << right.centre());
CCluster merge = left.merge(right, m_ClusterIndexGenerator);
left = merge;
m_Clusters.erase(i);
(this->mergeFunc())(leftIndex, rightIndex, merge.index());
return true;
}
return false;
}
TDoubleDoublePr CXMeansOnline1d::winsorizationInterval() const {
double f = (1.0 - m_WinsorizationConfidenceInterval) / 2.0;
if (f * this->count() < 1.0) {
// Don't bother if we don't expect a sample outside the
// Winsorization interval.
return {std::numeric_limits<double>::lowest() / 2.0,
std::numeric_limits<double>::max() / 2.0};
}
// The Winsorization interval end points are the positions
// corresponding to the f and 1 - f percentile counts, where
// f = (1 - c) / 2 and c is the Winsorization confidence
// interval, i.e. we truncate the data to the central c
// confidence interval.
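// For example, with a 99% Winsorization confidence interval and a total
// count of 1000 we have f = 0.005, so the end points are read off at
// cumulative counts of 5 and 995.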
double totalCount = this->count();
double leftCount = f * totalCount;
double rightCount = (1.0 - f) * totalCount;
LOG_TRACE(<< "totalCount = " << totalCount << " interval = [" << leftCount
<< "," << rightCount << "]"
<< " # clusters = " << m_Clusters.size());
TDoubleDoublePr result;
double partialCount = 0.0;
for (const auto& cluster : m_Clusters) {
double count = cluster.count();
if (partialCount < leftCount && partialCount + count >= leftCount) {
double p = 100.0 * (leftCount - partialCount) / count;
result.first = cluster.percentile(p);
}
if (partialCount < rightCount && partialCount + count >= rightCount) {
double p = 100.0 * (rightCount - partialCount) / count;
result.second = cluster.percentile(p);
break;
}
partialCount += count;
}
LOG_TRACE(<< "Winsorization interval = [" << result.first << "," << result.second << "]");
return result;
}
//////////// CCluster Implementation ////////////
CXMeansOnline1d::CCluster::CCluster(const CXMeansOnline1d& clusterer)
: m_Index(clusterer.m_ClusterIndexGenerator.next()),
m_Prior(CNormalMeanPrecConjugate::nonInformativePrior(clusterer.m_DataType,
clusterer.m_DecayRate)),
m_Structure(STRUCTURE_SIZE, clusterer.m_DecayRate, clusterer.m_MinimumCategoryCount) {
}
CXMeansOnline1d::CCluster::CCluster(std::size_t index,
const CNormalMeanPrecConjugate& prior,
CNaturalBreaksClassifier structure)
: m_Index(index), m_Prior(prior), m_Structure(std::move(structure)) {
}
bool CXMeansOnline1d::CCluster::acceptRestoreTraverser(const SDistributionRestoreParams& params,
core::CStateRestoreTraverser& traverser) {
do {
const std::string& name = traverser.name();
RESTORE_BUILT_IN(INDEX_TAG, m_Index)
RESTORE_NO_ERROR(PRIOR_TAG, m_Prior = CNormalMeanPrecConjugate(params, traverser))
RESTORE(STRUCTURE_TAG, traverser.traverseSubLevel([&](auto& traverser_) {
return m_Structure.acceptRestoreTraverser(params, traverser_);
}))
} while (traverser.next());
return true;
}
void CXMeansOnline1d::CCluster::acceptPersistInserter(core::CStatePersistInserter& inserter) const {
inserter.insertValue(INDEX_TAG, m_Index);
inserter.insertLevel(PRIOR_TAG, [this](auto& inserter_) {
m_Prior.acceptPersistInserter(inserter_);
});
inserter.insertLevel(STRUCTURE_TAG, [this](auto& inserter_) {
m_Structure.acceptPersistInserter(inserter_);
});
}
void CXMeansOnline1d::CCluster::dataType(maths_t::EDataType dataType) {
m_Prior.dataType(dataType);
}
void CXMeansOnline1d::CCluster::add(double point, double count) {
m_Prior.addSamples({point}, {maths_t::countWeight(count)});
m_Structure.add(point, count);
}
void CXMeansOnline1d::CCluster::decayRate(double decayRate) {
m_Prior.decayRate(decayRate);
m_Structure.decayRate(decayRate);
}
void CXMeansOnline1d::CCluster::propagateForwardsByTime(double time) {
m_Prior.propagateForwardsByTime(time);
m_Structure.propagateForwardsByTime(time);
}
std::size_t CXMeansOnline1d::CCluster::index() const {
return m_Index;
}
double CXMeansOnline1d::CCluster::centre() const {
return m_Prior.marginalLikelihoodMean();
}
double CXMeansOnline1d::CCluster::spread() const {
return std::sqrt(m_Prior.marginalLikelihoodVariance());
}
double CXMeansOnline1d::CCluster::percentile(double p) const {
return m_Structure.percentile(p);
}
double CXMeansOnline1d::CCluster::count() const {
return m_Prior.numberSamples();
}
double CXMeansOnline1d::CCluster::weight(maths_t::EClusterWeightCalc calc) const {
switch (calc) {
case maths_t::E_ClustersEqualWeight:
return 1.0;
case maths_t::E_ClustersFractionWeight:
return m_Prior.numberSamples();
}
LOG_ABORT(<< "Unexpected calculation style " << calc);
}
double CXMeansOnline1d::CCluster::logLikelihoodFromCluster(maths_t::EClusterWeightCalc calc,
double point) const {
double result;
if ((detail::logLikelihoodFromCluster(point, m_Prior, this->weight(calc), result) &
maths_t::E_FpFailed) != 0) {
LOG_ERROR(<< "Unable to compute likelihood for: " << m_Index);
}
return result;
}
void CXMeansOnline1d::CCluster::sample(std::size_t numberSamples,
double smallest,
double largest,
TDoubleVec& samples) const {
m_Structure.sample(numberSamples, smallest, largest, samples);
}
CXMeansOnline1d::TOptionalClusterClusterPr
CXMeansOnline1d::CCluster::split(CAvailableModeDistributions distributions,
double minimumCount,
double smallest,
const TDoubleDoublePr& interval,
CIndexGenerator& indexGenerator) {
// We do our clustering top down to minimize space and avoid
// making splits before we are confident they exist. This is
// important for anomaly detection because we do *not* want
// to fit a cluster to an outlier and judge it to be not
// anomalous as a result.
//
// By analogy to x-means we choose a candidate split of the
// data by minimizing the total within class deviation of the
// two classes. In order to decide whether or not to split we
// 1) impose minimum count on the smaller cluster 2) use an
// information theoretic criterion. Specifically, we threshold
// the BIC gain of using the multi-mode distribution versus
// the single mode distribution.
LOG_TRACE(<< "split");
if (m_Structure.buffering()) {
return {};
}
maths_t::EDataType dataType = m_Prior.dataType();
double decayRate = m_Prior.decayRate();
std::size_t n = m_Structure.size();
if (n < 2) {
return {};
}
TSizeVec split;
{
TTupleVec categories;
m_Structure.categories(n, 0, categories);
for (auto& category : categories) {
detail::winsorise(interval, category);
}
if (detail::splitSearch(minimumCount, MINIMUM_SPLIT_DISTANCE, dataType,
distributions, smallest, categories, split) == false) {
return {};
}
}
TTupleVec categories;
m_Structure.categories(split, categories);
CNaturalBreaksClassifier::TClassifierVec classifiers;
m_Structure.split(split, classifiers);
LOG_TRACE(<< "Splitting cluster " << this->index() << " at "
<< this->centre() << " left = " << classifiers[0].print()
<< ", right = " << classifiers[1].print());
std::size_t index1 = indexGenerator.next();
std::size_t index2 = indexGenerator.next();
indexGenerator.recycle(m_Index);
CNormalMeanPrecConjugate leftNormal(dataType, categories[0], decayRate);
CNormalMeanPrecConjugate rightNormal(dataType, categories[1], decayRate);
return TOptionalClusterClusterPr{
std::in_place, //
CCluster{index1, leftNormal, std::move(classifiers[0])},
CCluster{index2, rightNormal, std::move(classifiers[1])}};
}
bool CXMeansOnline1d::CCluster::shouldMerge(CCluster& other,
CAvailableModeDistributions distributions,
double smallest,
const TDoubleDoublePr& interval) {
if (m_Structure.buffering() || m_Structure.size() == 0 ||
other.m_Structure.size() == 0) {
return false;
}
maths_t::EDataType dataType = m_Prior.dataType();
TTupleVec categories;
if (m_Structure.categories(m_Structure.size(), 0, categories) == false) {
return false;
}
std::size_t split = categories.size();
if (other.m_Structure.categories(other.m_Structure.size(), 0, categories, true) == false) {
return false;
}
for (auto& category : categories) {
detail::winsorise(interval, category);
}
double distance;
double nl;
double nr;
detail::BICGain(dataType, distributions, smallest, categories, 0, split,
categories.size(), distance, nl, nr);
LOG_TRACE(<< "max(BIC(1) - BIC(2), 0) = " << distance << " (to merge "
<< MAXIMUM_MERGE_DISTANCE << ")");
return distance <= MAXIMUM_MERGE_DISTANCE;
}
CXMeansOnline1d::CCluster
CXMeansOnline1d::CCluster::merge(CCluster& other, CIndexGenerator& indexGenerator) {
TTupleVec left;
TTupleVec right;
m_Structure.categories(1, 0, left);
other.m_Structure.categories(1, 0, right);
std::size_t index = indexGenerator.next();
CNormalMeanPrecConjugate::TMeanVarAccumulator mergedCategories;
if (left.empty() == false) {
LOG_TRACE(<< "left = " << left[0]);
mergedCategories += left[0];
}
if (right.empty() == false) {
LOG_TRACE(<< "right = " << right[0]);
mergedCategories += right[0];
}
CNormalMeanPrecConjugate prior(m_Prior.dataType(), mergedCategories,
m_Prior.decayRate());
CNaturalBreaksClassifier structure(m_Structure);
structure.merge(other.m_Structure);
CCluster result(index, prior, structure);
indexGenerator.recycle(m_Index);
indexGenerator.recycle(other.m_Index);
return result;
}
const CNormalMeanPrecConjugate& CXMeansOnline1d::CCluster::prior() const {
return m_Prior;
}
std::uint64_t CXMeansOnline1d::CCluster::checksum(std::uint64_t seed) const {
seed = CChecksum::calculate(seed, m_Index);
seed = CChecksum::calculate(seed, m_Prior);
return CChecksum::calculate(seed, m_Structure);
}
void CXMeansOnline1d::CCluster::debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const {
mem->setName("CXMeansOnline1d::CCluster");
core::memory_debug::dynamicSize("m_Prior", m_Prior, mem);
core::memory_debug::dynamicSize("m_Structure", m_Structure, mem);
}
std::size_t CXMeansOnline1d::CCluster::memoryUsage() const {
std::size_t mem = core::memory::dynamicSize(m_Prior);
mem += core::memory::dynamicSize(m_Structure);
return mem;
}
const double CXMeansOnline1d::WINSORIZATION_CONFIDENCE_INTERVAL(1.0);
const double CXMeansOnline1d::MINIMUM_SPLIT_DISTANCE(6.0);
const double CXMeansOnline1d::MAXIMUM_MERGE_DISTANCE(2.0);
const double CXMeansOnline1d::CLUSTER_DELETE_FRACTION(0.8);
const std::size_t CXMeansOnline1d::STRUCTURE_SIZE(12);
}
}
}