in lib/maths/common/CXMeansOnline1d.cc [176:402]
void BICGain(maths_t::EDataType dataType,
CAvailableModeDistributions distributions,
double smallest,
const TTupleVec& categories,
std::size_t start,
std::size_t split,
std::size_t end,
double& distance,
double& nl,
double& nr) {
// The basic idea is to compute the difference between the
// Bayes Information Content (BIC) for one and two clusters
// for the sketch defined by the categories passed to this
// function. The exact form of BIC for the mixture (i.e. the
// two cluster case) depends on what one assumes about the
// the model. For example, if one assumes that the categories
// are labeled with their correct cluster then their likelihood
// is their likelihood of coming from the corresponding cluster
// multiplied by the cluster weight. Alternatively, if one
// assumes that they have equal prior probability of coming
// from either cluster, their likelihood is the weighted sum
// of their likelihoods for both clusters. In this second case
// the exact BIC (even modeling the modes as normals) can't be
// computed in terms of the statistics we maintain in the sketch.
// We could for example compute the expected BIC over a mixture
// model of the sketch. However, we will assume the former case
// (as they do in x-means). In this case, the BIC for the single
// cluster and normal distribution is
//
// \sum_i{ ni * (log(1/2/pi/v) + (vi + (mi - m)^2) / v) } + 2.0 * log(n)
//
// where ni is the count, mi is the mean and vi is the variance
// of the i'th category, and n, m and v are the overall count
// mean and variance. For the two cluster normal distribution
// case it is
//
// \sum_{i in l}{ ni * (log(1/2/pi/vl/wl/wl) + (vi + (mi - ml)^2) / vl)
// + \sum_{i in r}{ ni * (log(1/2/pi/vl/wr/wr) + (vi + (mi - mr)^2) / vr)
// + 5.0 * log(n)
//
// where wl, ml and vl are the weight, mean and variance of one
// cluster and wr, mr and vr are the weight, mean and variance
// of the other cluster.
//
// We also consider log-normal and mixture of log-normal and
// gamma and mixture of gamma cases. We don't maintain the
// sufficient statistics we need to estimate the exact BIC
// for these distributions instead we compute the best
// approximation with the statistics available. Specifically,
// we estimate the distribution parameters using method of
// moments rather than maximum likelihood and compute the
// BIC using Taylor expansions of log(xi) and log(xi)^2.
static const double MIN_RELATIVE_VARIANCE = 1e-10;
static const double MIN_ABSOLUTE_VARIANCE = 1.0;
TMeanVarAccumulator mv;
TMeanVarAccumulator mvl;
TMeanVarAccumulator mvr;
candidates(categories, start, split, end, mv, mvl, mvr);
double logNormalOffset = std::max(0.0, GAMMA_OFFSET_MARGIN - smallest);
double gammaOffset = std::max(0.0, LOG_NORMAL_OFFSET_MARGIN - smallest);
for (std::size_t i = start; i < end; ++i) {
double x = mean(dataType, categories[i]);
logNormalOffset = std::max(logNormalOffset, LOG_NORMAL_OFFSET_MARGIN - x);
gammaOffset = std::max(gammaOffset, GAMMA_OFFSET_MARGIN - x);
}
LOG_TRACE(<< "offsets = [" << gammaOffset << "," << logNormalOffset << "]");
distance = 0.0;
nl = CBasicStatistics::count(mvl);
nr = CBasicStatistics::count(mvr);
// Compute the BIC gain for splitting the mode.
double ll1n = 0.0;
double ll1l = 0.0;
double ll1g = 0.0;
double ll2nl = 0.0;
double ll2ll = 0.0;
double ll2gl = 0.0;
double ll2nr = 0.0;
double ll2lr = 0.0;
double ll2gr = 0.0;
// Normal
double n = CBasicStatistics::count(mv);
double m = mean(dataType, mv);
double v = variance(dataType, mv);
if (v <= MINIMUM_COEFFICIENT_OF_VARIATION * std::fabs(m)) {
return;
}
// Log-normal (method of moments)
double s = std::log(1.0 + v / CTools::pow2(m + logNormalOffset));
double l = std::log(m + logNormalOffset) - s / 2.0;
// Gamma (method of moments)
double a = CTools::pow2(m + gammaOffset) / v;
double b = (m + gammaOffset) / v;
double smin = std::max(logNormalOffset, gammaOffset);
double vmin = std::min(MIN_RELATIVE_VARIANCE * std::max(v, CTools::pow2(smin)),
MIN_ABSOLUTE_VARIANCE);
// Mixture of normals
double wl = CBasicStatistics::count(mvl) / n;
double ml = mean(dataType, mvl);
double vl = std::max(variance(dataType, mvl), vmin);
double wr = CBasicStatistics::count(mvr) / n;
double mr = mean(dataType, mvr);
double vr = std::max(variance(dataType, mvr), vmin);
bool haveGamma = distributions.haveGamma();
try {
// Mixture of log-normals (method of moments)
double sl = std::log(1.0 + vl / CTools::pow2(ml + logNormalOffset));
double ll = std::log(ml + logNormalOffset) - sl / 2.0;
double sr = std::log(1.0 + vr / CTools::pow2(mr + logNormalOffset));
double lr = std::log(mr + logNormalOffset) - sr / 2.0;
// Mixture of gammas (method of moments)
double al = CTools::pow2(ml + gammaOffset) / vl;
double bl = (ml + gammaOffset) / vl;
double ar = CTools::pow2(mr + gammaOffset) / vr;
double br = (mr + gammaOffset) / vr;
double log2piv = std::log(boost::math::double_constants::two_pi * v);
double log2pis = std::log(boost::math::double_constants::two_pi * s);
double loggn = 0.0;
double loggnl = 0.0;
double loggnr = 0.0;
if (haveGamma && CTools::lgamma(a, loggn, true) &&
CTools::lgamma(al, loggnl, true) && CTools::lgamma(ar, loggnr, true)) {
loggn -= a * std::log(b);
loggnl -= al * std::log(bl) + std::log(wl);
loggnr -= ar * std::log(br) + std::log(wr);
} else {
haveGamma = false;
}
double log2pivl =
std::log(boost::math::double_constants::two_pi * vl / CTools::pow2(wl));
double log2pivr =
std::log(boost::math::double_constants::two_pi * vr / CTools::pow2(wr));
double log2pisl =
std::log(boost::math::double_constants::two_pi * sl / CTools::pow2(wl));
double log2pisr =
std::log(boost::math::double_constants::two_pi * sr / CTools::pow2(wr));
for (std::size_t i = start; i < split; ++i) {
double ni = CBasicStatistics::count(categories[i]);
double mi = mean(dataType, categories[i]);
double vi = variance(dataType, categories[i]);
if (vi == 0.0) {
double li = std::log(mi + logNormalOffset);
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl);
ll2ll += ni * (CTools::pow2(li - ll) / sl + 2.0 * li + log2pisl);
ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl);
} else {
double si = std::log(1.0 + vi / CTools::pow2(mi + logNormalOffset));
double li = std::log(mi + logNormalOffset) - si / 2.0;
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl);
ll2ll += ni * ((si + CTools::pow2(li - ll)) / sl + 2.0 * li + log2pisl);
ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl);
}
}
for (std::size_t i = split; i < end; ++i) {
double ni = CBasicStatistics::count(categories[i]);
double mi = mean(dataType, categories[i]);
double vi = variance(dataType, categories[i]);
if (vi == 0.0) {
double li = std::log(mi + logNormalOffset);
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr);
ll2lr += ni * (CTools::pow2(li - lr) / sr + 2.0 * li + log2pisr);
ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr);
} else {
double si = std::log(1.0 + vi / CTools::pow2(mi + logNormalOffset));
double li = std::log(mi + logNormalOffset) - si / 2.0;
ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis);
ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr);
ll2lr += ni * ((si + CTools::pow2(li - lr)) / sr + 2.0 * li + log2pisr);
ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr);
}
}
} catch (const std::exception& e) {
LOG_ERROR(<< "Failed to compute BIC gain: " << e.what() << ", n = " << n
<< ", m = " << m << ", v = " << v << ", wl = " << wl
<< ", ml = " << ml << ", vl = " << vl << ", wr = " << wr
<< ", mr = " << mr << ", vr = " << vr);
return;
}
double logn = std::log(n);
double ll1 =
min(distributions.haveNormal() ? ll1n : std::numeric_limits<double>::max(),
distributions.haveLogNormal() ? ll1l : std::numeric_limits<double>::max(),
haveGamma ? ll1g : std::numeric_limits<double>::max()) +
distributions.parameters() * logn;
double ll2 =
min(distributions.haveNormal() ? ll2nl : std::numeric_limits<double>::max(),
distributions.haveLogNormal() ? ll2ll : std::numeric_limits<double>::max(),
haveGamma ? ll2gl : std::numeric_limits<double>::max()) +
min(distributions.haveNormal() ? ll2nr : std::numeric_limits<double>::max(),
distributions.haveLogNormal() ? ll2lr : std::numeric_limits<double>::max(),
haveGamma ? ll2gr : std::numeric_limits<double>::max()) +
(2.0 * distributions.parameters() + 1.0) * logn;
LOG_TRACE(<< "BIC(1) = " << ll1 << ", BIC(2) = " << ll2);
distance = std::max(ll1 - ll2, 0.0);
}