in lib/maths/common/CMultinomialConjugate.cc [833:1183]
bool CMultinomialConjugate::probabilityOfLessLikelySamples(maths_t::EProbabilityCalculation calculation,
const TDouble1Vec& samples,
const TDoubleWeightsAry1Vec& weights,
double& lowerBound,
double& upperBound,
maths_t::ETail& tail) const {
lowerBound = 0.0;
upperBound = 1.0;
tail = maths_t::E_UndeterminedTail;
if (samples.empty()) {
LOG_ERROR(<< "Can't compute distribution for empty sample set");
return false;
}
// To compute the overall joint probability we use our approximate
// aggregation scheme which interprets the probabilities as arising
// from jointly independent Gaussians. Note that trying to compute
// the exact aggregation requires us to calculate:
// P(R) = P({ y | f(y | p) <= f(x | p) })
//
// where,
// f(. | p) = n! * Product_i{ p(i)^n(i) / n(i)! }, i.e. the
// multinomial distribution function.
// n(i) is the count of the i'th category in the vector argument.
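// For example, with three categories, p = (0.5, 0.3, 0.2) and counts
// n = (2, 1, 0), f(n | p) = 3!/(2! 1! 0!) * 0.5^2 * 0.3^1 * 0.2^0 = 0.225,
// and P(R) sums f over every count vector with the same total count
// whose density is no larger than 0.225.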
//
// This is computationally hard. (We might be able to make headway
// on a relaxation of the problem for the case that the {n(i)} are
// no longer integer but must still sum to one, in which case the
// values live on a simplex. We would be interested in contours
// of constant probability, which might be computable by the method
// of Lagrange multipliers.)
switch (calculation) {
case maths_t::E_OneSidedBelow: {
// See minusLogJointCdf for a discussion of the calculation
// of the marginal c.d.f. for a single sample.
CJointProbabilityOfLessLikelySamples jointLowerBound;
CJointProbabilityOfLessLikelySamples jointUpperBound;
detail::CCdf cdf(m_Categories, m_Concentrations, m_TotalConcentration);
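// Each sample contributes lower and upper bounds on its marginal c.d.f.
// which the joint probability calculators aggregate as if they were
// probabilities of independent Gaussian samples (see the note above).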
for (std::size_t i = 0; i < samples.size(); ++i) {
if (CMathsFuncs::isNan(samples[i]) || CMathsFuncs::isNan(weights[i])) {
continue;
}
double x = samples[i];
double n = maths_t::count(weights[i]);
double sampleLowerBound;
double sampleUpperBound;
cdf(x, sampleLowerBound, sampleUpperBound);
jointLowerBound.add(sampleLowerBound, n);
jointUpperBound.add(sampleUpperBound, n);
}
if (!jointLowerBound.calculate(lowerBound) || !jointUpperBound.calculate(upperBound)) {
LOG_ERROR(<< "Unable to compute probability for " << samples << ": "
<< jointLowerBound << ", " << jointUpperBound);
return false;
}
tail = maths_t::E_LeftTail;
} break;
case maths_t::E_TwoSided: {
// The probability of a less likely category is given by:
// E[ Sum_{p(j) <= p(i)}{ p(j) } ]
//
// where the expectation is taken over the prior for the distribution
// probabilities. This is hard to compute because the terms in the sum
// vary as the probabilities vary. We approximate this by taking the
// expectation over the marginal for each p(i). In particular, we compute:
// P(i) = Sum_{E[p(j)] <= E[p(i)]}{ E[p(j)] }
//
// Here, P(i) is the probability of a less likely category and the
// expected probability of the i'th category is:
// E[p(i)] = a(i) / Sum_j{ a(j) } (1)
//
// where the a(i) are the concentration parameters of the prior. So,
// using (1), P(i) reduces to:
// Sum_{j:a(j)<=a(i)}{ E[p(j)] }
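// For example, if the concentrations are a = (1, 2, 7), so the total
// concentration is 10 and E[p] = (0.1, 0.2, 0.7), then P(1) = 0.1,
// P(2) = 0.1 + 0.2 = 0.3 and P(3) = 0.1 + 0.2 + 0.7 = 1.0.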
//
// We can think of P(.) as a function of probability, i.e. if the
// probability of a category were p then its corresponding P(.) would
// be:
// P(argmin_{i : E[p(i)] >= p}{ E[p(i)] })
//
// Given this definition we can compute:
// E[ P(p) ] (2)
//
// where the expectation is taken over the marginal for each probability.
// This can be computed exactly by noting that marginal for p(i) is
// beta distributed with alpha = a(i) and beta = Sum_j{ a(j) } - a(i).
// However, this requires us to compute quantiles at every E[p(i)] which
// would be expensive for a large number of categories. Instead we
// approximate the probability by using a fixed number of samples from
// the marginal and computing the probability by taking the mean of these.
// Finally, note that if E[p(i)] and E[p(j)] are very close we want P(i)
// and P(j) to be close, but they can in fact be very different if there
// are many probabilities E[p(k)] which satisfy:
// E[p(i)] <= E[p(k)] <= E[p(j)].
//
// To avoid this problem we scale all probabilities by (1 + eps), for
// small eps > 0, when computing (2).
//
// In the case that the number of categories has overflowed we derive a
// sharp lower bound by considering the case that all the additional
// mass is in one category which is not in the sample set. In this case
// we have three sets of categories:
// U = "Uncounted categories"
// L = {i : i is counted and P(i) < P(U)}
// M = {i : i is counted and P(i) >= P(U)}
//
// where, clearly P(U) is the extra mass. If X denotes the sample set
// and N the total concentration then for this case:
// P(i in X ∩ U) >= 1 / N
// P(i in X ∩ L) >= P(i)
// P(i in X ∩ M) >= P(i) + P(U)
//
// If |X| = 1 then a sharp upper bound is easy to compute:
// P(i in X ∩ U) <= P(U) + P(L)
// P(i in X\U) <= P(i) + P(U) (3)
//
// For the case that X contains a mixture of values that are and aren't
// in U then a sharp upper bound is difficult to compute: it depends on
// the number of different categories in X ∩ U, P(U) and the probabilities
// P(i in L). In this case we just fall back to using (3), which isn't
// sharp.
using TSizeVec = std::vector<std::size_t>;
tail = maths_t::E_MixedOrNeitherTail;
TDoubleDoubleSizeTrVec pCategories;
pCategories.reserve(m_Categories.size());
double pU = 0.0;
double pmin = 1.0 / m_TotalConcentration;
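// The sharp lower bound 1 / N for a category which isn't counted
// (see the discussion of U above).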
{
double r = 1.0 / static_cast<double>(m_Concentrations.size());
for (std::size_t i = 0; i < m_Concentrations.size(); ++i) {
double p = m_Concentrations[i] / m_TotalConcentration;
pCategories.emplace_back(p, p, i);
pU += r - p;
}
}
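// Note that pU = Sum_i{ 1/k - p(i) } = 1 - Sum_i{ p(i) }, where k is the
// number of counted categories, i.e. any probability mass which isn't
// accounted for by the stored categories (the extra mass P(U) above).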
std::sort(pCategories.begin(), pCategories.end());
LOG_TRACE(<< "p = " << pCategories);
double pl = 0.0;
{
// Get the index of the first probability greater than or equal to P(U).
std::size_t l = pCategories.size();
if (pU > 0.0) {
l = std::lower_bound(pCategories.begin(), pCategories.end(),
TDoubleDoubleSizeTr(pU, pU, 0)) -
pCategories.begin();
}
// Compute probabilities of less likely categories.
double pCumulative = 0.0;
for (std::size_t i = 0, j = 0; i < pCategories.size(); /**/) {
// Find the probability equal range [i, j).
double p = std::get<1>(pCategories[i]);
pCumulative += p;
while (++j < pCategories.size() && std::get<1>(pCategories[j]) == p) {
pCumulative += p;
}
// Update the equal range probabilities [i, j).
for (/**/; i < j; ++i) {
std::get<1>(pCategories[i]) = pCumulative;
}
}
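// For example, if the sorted expected probabilities are
// (0.1, 0.2, 0.2, 0.5) the cumulative values become
// (0.1, 0.5, 0.5, 1.0): equal probabilities share the same
// "probability of a less likely category".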
if (l < pCategories.size()) {
pl = std::get<1>(pCategories[l]);
}
}
std::size_t nSamples = detail::numberPriorSamples(m_TotalConcentration);
LOG_TRACE(<< "n = " << nSamples);
if (nSamples > 1) {
// Extract the indices of the categories we want.
TSizeVec categoryIndices;
categoryIndices.reserve(samples.size());
for (std::size_t i = 0; i < samples.size(); ++i) {
std::size_t index = std::lower_bound(m_Categories.begin(),
m_Categories.end(), samples[i]) -
m_Categories.begin();
if (index < m_Categories.size() && m_Categories[index] == samples[i]) {
categoryIndices.push_back(index);
}
}
std::sort(categoryIndices.begin(), categoryIndices.end());
for (std::size_t i = 0; i < pCategories.size(); ++i) {
// For each category which is actually in the sample set compute
// the average probability over a set of independent samples from
// the marginal prior for that category, which by the law of large
// numbers converges to E[ P(p) ] w.r.t. the marginal for p. The
// constants a and b are a(i) and Sum_j{ a(j) } - a(i), respectively.
std::size_t j = std::get<2>(pCategories[i]);
if (std::binary_search(categoryIndices.begin(), categoryIndices.end(), j)) {
TDouble7Vec marginalSamples;
double a = m_Concentrations[j];
double b = m_TotalConcentration - m_Concentrations[j];
detail::generateBetaSamples(a, b, nSamples, marginalSamples);
LOG_TRACE(<< "E[p] = " << std::get<0>(pCategories[i])
<< ", mean = " << CBasicStatistics::mean(marginalSamples)
<< ", samples = " << marginalSamples);
TMeanAccumulator pAcc;
for (std::size_t k = 0; k < marginalSamples.size(); ++k) {
TDoubleDoubleSizeTr x(1.05 * marginalSamples[k], 0.0, 0);
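// Bracket the scaled sample between adjacent expected category
// probabilities [fl, fr] and linearly interpolate the corresponding
// cumulative probabilities [pl_, pr_] to get this sample's P(.).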
std::ptrdiff_t r = std::min(
std::upper_bound(pCategories.begin(), pCategories.end(), x) -
pCategories.begin(),
static_cast<std::ptrdiff_t>(pCategories.size()) - 1);
double fl = r > 0 ? std::get<0>(pCategories[r - 1]) : 0.0;
double fr = std::get<0>(pCategories[r]);
double pl_ = r > 0 ? std::get<1>(pCategories[r - 1]) : 0.0;
double pr_ = std::get<1>(pCategories[r]);
double alpha = std::min(
(fr - fl == 0.0) ? 0.0 : (std::get<0>(x) - fl) / (fr - fl), 1.0);
double px = (1.0 - alpha) * pl_ + alpha * pr_;
LOG_TRACE(<< "E[p(l)] = " << fl << ", P(l) = " << pl_
<< ", E[p(r)] = " << fr << ", P(r) = " << pr_
<< ", alpha = " << alpha << ", p = " << px);
pAcc.add(px);
}
std::get<1>(pCategories[i]) = CBasicStatistics::mean(pAcc);
}
}
}
LOG_TRACE(<< "pCategories = " << pCategories);
LOG_TRACE(<< "P(U) = " << pU << ", P(l) = " << pl);
// Restore the original category order in O(n) by moving each tuple
// to the position given by its stored index. Every swap places at
// least one probability in its correct position, so the loop
// terminates after at most n swaps.
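// For example, if the stored indices at positions (0, 1, 2) are
// (2, 0, 1) the loop swaps positions 0 and 2 giving (1, 0, 2), then
// positions 0 and 1 giving (0, 1, 2), at which point every tuple is
// back at its original index.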
for (std::size_t i = 0; i < pCategories.size(); /**/) {
if (i == std::get<2>(pCategories[i])) {
++i;
} else {
std::swap(pCategories[i], pCategories[std::get<2>(pCategories[i])]);
}
}
LOG_TRACE(<< "pCategories = " << pCategories);
if (samples.size() == 1) {
// No special aggregation is required if there is a single sample.
std::size_t index = std::lower_bound(m_Categories.begin(),
m_Categories.end(), samples[0]) -
m_Categories.begin();
if (index < m_Categories.size() && m_Categories[index] == samples[0]) {
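// A counted category: these are the lower bounds for X ∩ L and X ∩ M
// and the upper bound (3) from the discussion above.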
double p = std::get<1>(pCategories[index]);
lowerBound = p + (p >= pU ? pU : 0.0);
upperBound = p + pU;
} else {
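// An uncounted category: fall back to the lower bound 1 / N and the
// upper bound max(P(U) + P(L), 1 / N).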
lowerBound = pmin;
upperBound = std::max(pU + pl, pmin);
}
return true;
}
TDoubleDoubleMap categoryCounts;
// Count the occurrences of each category in the sample set.
for (std::size_t i = 0; i < samples.size(); ++i) {
if (CMathsFuncs::isNan(samples[i]) || CMathsFuncs::isNan(weights[i])) {
continue;
}
double x = samples[i];
double n = maths_t::count(weights[i]);
categoryCounts[x] += n;
}
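// Each distinct category contributes one term to the joint probability,
// weighted by its total count in the sample set.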
LOG_TRACE(<< "categoryCounts = " << categoryCounts);
CJointProbabilityOfLessLikelySamples jointLowerBound;
CJointProbabilityOfLessLikelySamples jointUpperBound;
for (auto& categoryCount : categoryCounts) {
double category = categoryCount.first;
double count = categoryCount.second;
LOG_TRACE(<< "category = " << category << ", count = " << count);
std::size_t index = std::lower_bound(m_Categories.begin(),
m_Categories.end(), category) -
m_Categories.begin();
if (index < m_Categories.size() && m_Categories[index] == category) {
double p = std::get<1>(pCategories[index]);
jointLowerBound.add(p + (p >= pU ? pU : 0.0), count);
jointUpperBound.add(p + pU, count);
} else {
jointLowerBound.add(pmin, count);
jointUpperBound.add(std::max(pU + pl, pmin), count);
}
}
if (!jointLowerBound.calculate(lowerBound) || !jointUpperBound.calculate(upperBound)) {
LOG_ERROR(<< "Unable to compute probability for " << samples << ": "
<< jointLowerBound << ", " << jointUpperBound);
return false;
}
LOG_TRACE(<< "probability = [" << lowerBound << ", " << upperBound << "]");
} break;
case maths_t::E_OneSidedAbove: {
// See minusLogJointCdf for a discussion of the calculation
// of the marginal c.d.f. for a single sample.
CJointProbabilityOfLessLikelySamples jointLowerBound;
CJointProbabilityOfLessLikelySamples jointUpperBound;
detail::CCdfComplement cdfComplement(m_Categories, m_Concentrations, m_TotalConcentration);
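// This mirrors the one-sided below case with the c.d.f. replaced by
// its complement, i.e. one minus the marginal c.d.f.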
for (std::size_t i = 0; i < samples.size(); ++i) {
if (CMathsFuncs::isNan(samples[i]) || CMathsFuncs::isNan(weights[i])) {
continue;
}
double x = samples[i];
double n = maths_t::count(weights[i]);
double sampleLowerBound;
double sampleUpperBound;
cdfComplement(x, sampleLowerBound, sampleUpperBound);
jointLowerBound.add(sampleLowerBound, n);
jointUpperBound.add(sampleUpperBound, n);
}
if (!jointLowerBound.calculate(lowerBound) || !jointUpperBound.calculate(upperBound)) {
LOG_ERROR(<< "Unable to compute probability for " << samples << ": "
<< jointLowerBound << ", " << jointUpperBound);
return false;
}
tail = maths_t::E_RightTail;
} break;
}
return true;
}