void BICGain()

in lib/maths/common/CXMeansOnline1d.cc [176:402]


void BICGain(maths_t::EDataType dataType,
             CAvailableModeDistributions distributions,
             double smallest,
             const TTupleVec& categories,
             std::size_t start,
             std::size_t split,
             std::size_t end,
             double& distance,
             double& nl,
             double& nr) {

    // The basic idea is to compute the difference between the
    // Bayes Information Content (BIC) for one and two clusters
    // for the sketch defined by the categories passed to this
    // function. The exact form of BIC for the mixture (i.e. the
    // two cluster case) depends on what one assumes about the
    // the model. For example, if one assumes that the categories
    // are labeled with their correct cluster then their likelihood
    // is their likelihood of coming from the corresponding cluster
    // multiplied by the cluster weight. Alternatively, if one
    // assumes that they have equal prior probability of coming
    // from either cluster, their likelihood is the weighted sum
    // of their likelihoods for both clusters. In this second case
    // the exact BIC (even modeling the modes as normals) can't be
    // computed in terms of the statistics we maintain in the sketch.
    // We could for example compute the expected BIC over a mixture
    // model of the sketch. However, we will assume the former case
    // (as they do in x-means). In this case, the BIC for the single
    // cluster and normal distribution is
    //
    //   \sum_i{ ni * (log(1/2/pi/v) + (vi + (mi - m)^2) / v) } + 2.0 * log(n)
    //
    // where ni is the count, mi is the mean and vi is the variance
    // of the i'th category, and n, m and v are the overall count
    // mean and variance. For the two cluster normal distribution
    // case it is
    //
    //   \sum_{i in l}{ ni * (log(1/2/pi/vl/wl/wl) + (vi + (mi - ml)^2) / vl)
    // + \sum_{i in r}{ ni * (log(1/2/pi/vl/wr/wr) + (vi + (mi - mr)^2) / vr)
    // + 5.0 * log(n)
    //
    // where wl, ml and vl are the weight, mean and variance of one
    // cluster and wr, mr and vr are the weight, mean and variance
    // of the other cluster.
    //
    // We also consider log-normal and mixture of log-normal and
    // gamma and mixture of gamma cases. We don't maintain the
    // sufficient statistics we need to estimate the exact BIC
    // for these distributions instead we compute the best
    // approximation with the statistics available. Specifically,
    // we estimate the distribution parameters using method of
    // moments rather than maximum likelihood and compute the
    // BIC using Taylor expansions of log(xi) and log(xi)^2.

    static const double MIN_RELATIVE_VARIANCE = 1e-10;
    static const double MIN_ABSOLUTE_VARIANCE = 1.0;

    TMeanVarAccumulator mv;
    TMeanVarAccumulator mvl;
    TMeanVarAccumulator mvr;
    candidates(categories, start, split, end, mv, mvl, mvr);
    double logNormalOffset = std::max(0.0, GAMMA_OFFSET_MARGIN - smallest);
    double gammaOffset = std::max(0.0, LOG_NORMAL_OFFSET_MARGIN - smallest);
    for (std::size_t i = start; i < end; ++i) {
        double x = mean(dataType, categories[i]);
        logNormalOffset = std::max(logNormalOffset, LOG_NORMAL_OFFSET_MARGIN - x);
        gammaOffset = std::max(gammaOffset, GAMMA_OFFSET_MARGIN - x);
    }
    LOG_TRACE(<< "offsets = [" << gammaOffset << "," << logNormalOffset << "]");

    distance = 0.0;
    nl = CBasicStatistics::count(mvl);
    nr = CBasicStatistics::count(mvr);

    // Compute the BIC gain for splitting the mode.

    double ll1n = 0.0;
    double ll1l = 0.0;
    double ll1g = 0.0;
    double ll2nl = 0.0;
    double ll2ll = 0.0;
    double ll2gl = 0.0;
    double ll2nr = 0.0;
    double ll2lr = 0.0;
    double ll2gr = 0.0;

    // Normal
    double n = CBasicStatistics::count(mv);
    double m = mean(dataType, mv);
    double v = variance(dataType, mv);
    if (v <= MINIMUM_COEFFICIENT_OF_VARIATION * std::fabs(m)) {
        return;
    }

    // Log-normal (method of moments)
    double s = std::log(1.0 + v / CTools::pow2(m + logNormalOffset));
    double l = std::log(m + logNormalOffset) - s / 2.0;
    // Gamma (method of moments)
    double a = CTools::pow2(m + gammaOffset) / v;
    double b = (m + gammaOffset) / v;

    double smin = std::max(logNormalOffset, gammaOffset);
    double vmin = std::min(MIN_RELATIVE_VARIANCE * std::max(v, CTools::pow2(smin)),
                           MIN_ABSOLUTE_VARIANCE);

    // Mixture of normals
    double wl = CBasicStatistics::count(mvl) / n;
    double ml = mean(dataType, mvl);
    double vl = std::max(variance(dataType, mvl), vmin);
    double wr = CBasicStatistics::count(mvr) / n;
    double mr = mean(dataType, mvr);
    double vr = std::max(variance(dataType, mvr), vmin);

    bool haveGamma = distributions.haveGamma();
    try {
        // Mixture of log-normals (method of moments)
        double sl = std::log(1.0 + vl / CTools::pow2(ml + logNormalOffset));
        double ll = std::log(ml + logNormalOffset) - sl / 2.0;
        double sr = std::log(1.0 + vr / CTools::pow2(mr + logNormalOffset));
        double lr = std::log(mr + logNormalOffset) - sr / 2.0;
        // Mixture of gammas (method of moments)
        double al = CTools::pow2(ml + gammaOffset) / vl;
        double bl = (ml + gammaOffset) / vl;
        double ar = CTools::pow2(mr + gammaOffset) / vr;
        double br = (mr + gammaOffset) / vr;

        double log2piv = std::log(boost::math::double_constants::two_pi * v);
        double log2pis = std::log(boost::math::double_constants::two_pi * s);

        double loggn = 0.0;
        double loggnl = 0.0;
        double loggnr = 0.0;

        if (haveGamma && CTools::lgamma(a, loggn, true) &&
            CTools::lgamma(al, loggnl, true) && CTools::lgamma(ar, loggnr, true)) {
            loggn -= a * std::log(b);
            loggnl -= al * std::log(bl) + std::log(wl);
            loggnr -= ar * std::log(br) + std::log(wr);
        } else {
            haveGamma = false;
        }

        double log2pivl =
            std::log(boost::math::double_constants::two_pi * vl / CTools::pow2(wl));
        double log2pivr =
            std::log(boost::math::double_constants::two_pi * vr / CTools::pow2(wr));
        double log2pisl =
            std::log(boost::math::double_constants::two_pi * sl / CTools::pow2(wl));
        double log2pisr =
            std::log(boost::math::double_constants::two_pi * sr / CTools::pow2(wr));

        for (std::size_t i = start; i < split; ++i) {
            double ni = CBasicStatistics::count(categories[i]);
            double mi = mean(dataType, categories[i]);
            double vi = variance(dataType, categories[i]);

            if (vi == 0.0) {
                double li = std::log(mi + logNormalOffset);
                ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
                ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis);
                ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
                ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl);
                ll2ll += ni * (CTools::pow2(li - ll) / sl + 2.0 * li + log2pisl);
                ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl);
            } else {
                double si = std::log(1.0 + vi / CTools::pow2(mi + logNormalOffset));
                double li = std::log(mi + logNormalOffset) - si / 2.0;
                ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
                ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis);
                ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
                ll2nl += ni * ((vi + CTools::pow2(mi - ml)) / vl + log2pivl);
                ll2ll += ni * ((si + CTools::pow2(li - ll)) / sl + 2.0 * li + log2pisl);
                ll2gl += ni * 2.0 * (bl * (mi + gammaOffset) - (al - 1.0) * li + loggnl);
            }
        }

        for (std::size_t i = split; i < end; ++i) {
            double ni = CBasicStatistics::count(categories[i]);
            double mi = mean(dataType, categories[i]);
            double vi = variance(dataType, categories[i]);

            if (vi == 0.0) {
                double li = std::log(mi + logNormalOffset);
                ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
                ll1l += ni * (CTools::pow2(li - l) / s + 2.0 * li + log2pis);
                ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
                ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr);
                ll2lr += ni * (CTools::pow2(li - lr) / sr + 2.0 * li + log2pisr);
                ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr);
            } else {
                double si = std::log(1.0 + vi / CTools::pow2(mi + logNormalOffset));
                double li = std::log(mi + logNormalOffset) - si / 2.0;
                ll1n += ni * ((vi + CTools::pow2(mi - m)) / v + log2piv);
                ll1l += ni * ((si + CTools::pow2(li - l)) / s + 2.0 * li + log2pis);
                ll1g += ni * 2.0 * (b * (mi + gammaOffset) - (a - 1.0) * li + loggn);
                ll2nr += ni * ((vi + CTools::pow2(mi - mr)) / vr + log2pivr);
                ll2lr += ni * ((si + CTools::pow2(li - lr)) / sr + 2.0 * li + log2pisr);
                ll2gr += ni * 2.0 * (br * (mi + gammaOffset) - (ar - 1.0) * li + loggnr);
            }
        }
    } catch (const std::exception& e) {
        LOG_ERROR(<< "Failed to compute BIC gain: " << e.what() << ", n = " << n
                  << ", m = " << m << ", v = " << v << ", wl = " << wl
                  << ", ml = " << ml << ", vl = " << vl << ", wr = " << wr
                  << ", mr = " << mr << ", vr = " << vr);
        return;
    }

    double logn = std::log(n);
    double ll1 =
        min(distributions.haveNormal() ? ll1n : std::numeric_limits<double>::max(),
            distributions.haveLogNormal() ? ll1l : std::numeric_limits<double>::max(),
            haveGamma ? ll1g : std::numeric_limits<double>::max()) +
        distributions.parameters() * logn;
    double ll2 =
        min(distributions.haveNormal() ? ll2nl : std::numeric_limits<double>::max(),
            distributions.haveLogNormal() ? ll2ll : std::numeric_limits<double>::max(),
            haveGamma ? ll2gl : std::numeric_limits<double>::max()) +
        min(distributions.haveNormal() ? ll2nr : std::numeric_limits<double>::max(),
            distributions.haveLogNormal() ? ll2lr : std::numeric_limits<double>::max(),
            haveGamma ? ll2gr : std::numeric_limits<double>::max()) +
        (2.0 * distributions.parameters() + 1.0) * logn;

    LOG_TRACE(<< "BIC(1) = " << ll1 << ", BIC(2) = " << ll2);

    distance = std::max(ll1 - ll2, 0.0);
}