in lib/maths/common/ProbabilityAggregators.cc [723:945]
bool CLogProbabilityOfMFromNExtremeSamples::calculate(double& result) {
    result = 0.0;

    if (m_NumberSamples == 0) {
        return true;
    }

    // We can express the probability as the sum of two terms:
    //   1 - (1 - pM)^N
    //     + Sum_i( 2^M * N! * c(i, M) / (i+1) / (N-M)! * (1 - (1 - pM/2)^(i+1)) )
    //
    // It is possible to derive recurrence relations for the coefficients
    // c(i, M) as follows:
    //   c(m+1) = c(m) / 2 / (N - M + m)
    //   c(0, m+1) = -1 * ( c(m+1) * (1 - p(m))^(N - M + m)
    //                      + Sum_i( c(i, m+1) * (1 - p(m)/2)^(i+1) ) )
    //   c(i, m+1) = c(i-1, m) / i
    //
    // where i > 0 in the last relation, p(m) denotes the probability folded
    // in at step m, c(0) = 1 and c(0, 0) = 0.
    //
    // The strategy in computing this quantity is to maintain the coefficients
    // {c(i, .)} in normalized form so that the largest is equal to 1 and to
    // keep track of a normalization factor, which is the log of the largest
    // coefficient, to avoid underflow. Note that since:
    //   c(m+1) = (N - M)! / (N - M + m + 1)! / 2^m
    //
    // it can become very small! This means we always work to full precision.
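    //
    // As a concrete special case, if M = 1 the sum above is empty and the
    // probability reduces to 1 - (1 - pM)^N, i.e. the chance that at least
    // one of N independent samples is at least as unlikely as the single
    // smallest probability; the code below handles this case separately.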

    std::size_t M = m_MinValues.count();
    std::size_t N = m_NumberSamples;
    LOG_TRACE(<< "M = " << M << ", N = " << N);

    double logc = 0.0;
    double logLargestCoeff = 0.0;
    TDoubleVec coeffs;
    if (M > 1) {
        coeffs.reserve(M - 1);
    }
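
    // Sort the minimum probabilities into increasing order and clamp them to
    // [CTools::smallestProbability(), 1] so the logs and powers below are
    // well defined.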
    m_MinValues.sort();
    for (std::size_t i = 0; i < M; ++i) {
        m_MinValues[i] =
            CTools::truncate(m_MinValues[i], CTools::smallestProbability(), 1.0);
    }
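
    // Each iteration of this loop folds in the next probability p(m), working
    // from the largest of the minimum values down to the second smallest, and
    // applies the recurrence relations above to update the coefficients.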
    for (std::size_t m = 1; m < M; ++m) {
        double p = m_MinValues[M - m];
        LOG_TRACE(<< "p(" << m << ") = " << p);
        logc -= std::log(2.0 * static_cast<double>(N - M + m));

        // Update the coefficients (they are stored in reverse order).
        double sum = 0.0;
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            double index = static_cast<double>(coeffs.size() - i);
            coeffs[i] /= index;
            sum += coeffs[i] * CTools::powOneMinusX(p / 2.0, index);
        }
        LOG_TRACE(<< "sum = " << sum);

        // Re-normalize the "c" term before adding. Note that, somewhat
        // surprisingly, 1 / logLargestCoeff can be infinity, because small
        // numbers lose precision before underflowing. That means the following
        // calculation can't use the re-normalized "c" directly because it
        // might be infinite. Instead, we make use of the fact that
        // c * (1 - p)^(N - M + m) won't overflow.
        double q = CTools::truncate(
            CTools::powOneMinusX(p, static_cast<double>(N - M + m)), 0.0, 1.0);
        coeffs.push_back(-sum - q * std::exp(logc - logLargestCoeff));
        LOG_TRACE(<< "c(0) = " << coeffs.back());

        // Re-normalize the coefficients if they aren't all identically zero.
        double cmax = 0.0;
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            if (std::fabs(coeffs[i]) > 1.0 / std::numeric_limits<double>::max()) {
                cmax = std::max(cmax, std::fabs(coeffs[i]));
            }
        }
        if (cmax > 0.0) {
            LOG_TRACE(<< "cmax = " << cmax);
            for (std::size_t i = 0; i < coeffs.size(); ++i) {
                coeffs[i] /= cmax;
            }
            logLargestCoeff += std::log(cmax);
            LOG_TRACE(<< "logLargestCoeff = " << logLargestCoeff);
        }
    }

    // Re-normalize in the case that we haven't been able to in the loop
    // because of overflow.
    double cmax = 0.0;
    for (std::size_t i = 0; i < coeffs.size(); ++i) {
        cmax = std::max(cmax, std::fabs(coeffs[i]));
    }
    if (cmax > 0.0 && cmax < 1.0 / std::numeric_limits<double>::max()) {
        logLargestCoeff = std::log(cmax);
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            coeffs[i] /= cmax;
        }
    }
    LOG_TRACE(<< "coeffs = " << coeffs);
    double pM = m_MinValues[0];
    LOG_TRACE(<< "p(" << M << ") = " << pM);
    double pMin = CTools::oneMinusPowOneMinusX(pM, static_cast<double>(N));
    LOG_TRACE(<< "1 - (1 - p(" << M << "))^" << N << " = " << pMin);

    if (M > 1) {
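        // The scale is the common factor 2^M * N! / (N - M)! from the series
        // above together with the normalization removed from the coefficients,
        // all carried in log space.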
        double logScale = static_cast<double>(M) * std::log(2.0) +
                          std::lgamma(static_cast<double>(N + 1)) -
                          std::lgamma(static_cast<double>(N - M + 1)) + logLargestCoeff;
        LOG_TRACE(<< "log(scale) = " << logScale);
        double sum = 0.0;
        double positive = 0.0;
        double negative = 0.0;
        TDoubleVec terms;
        terms.reserve(coeffs.size());
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            double index = static_cast<double>(coeffs.size() - i);
            double c = coeffs[i] / index;
            double p = CTools::oneMinusPowOneMinusX(pM / 2.0, index);
            LOG_TRACE(<< "term(" << index << ") = " << (c * p) << " (c("
                      << index << ") = " << c << ", 1 - (1 - p(M)/2)^" << index
                      << " = " << p << ")");
            terms.push_back(c * p);
            sum += std::fabs(c * p);
            (c * p < 0.0 ? negative : positive) += std::fabs(c * p);
        }
        LOG_TRACE(<< "negative = " << negative << ", positive = " << positive);
        if (sum == 0.0) {
            result = std::log(pMin);
        } else {
            // To minimize cancellation errors we add pMin inside the loop
            // and compute weights s.t. Sum_i( w(i) ) = 1.0 and w(i) * pMin
            // is roughly the same size as the i'th coefficient.
            static const double PRECISION = 1e6;
            result = 0.0;
            double condition = 0.0;
            double logPMin = std::log(pMin);
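
            // Handle the possibility that pMin / scale overflows or
            // underflows: if it would overflow pMin dominates and the
            // correction terms are negligible; if it would underflow pMin is
            // dropped from the sum; otherwise pMin is rescaled and distributed
            // across the terms using the weights described above.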
            if (logPMin - logScale > core::constants::LOG_MAX_DOUBLE) {
                for (std::size_t i = 0; i < terms.size(); ++i) {
                    LOG_TRACE(<< "remainder(" << i << ") = " << std::fabs(terms[i]));
                    result += std::fabs(terms[i]);
                }
                result = std::log(result * pMin / sum);
            } else {
                if (logPMin - logScale < core::constants::LOG_MIN_DOUBLE) {
                    pMin = 0.0;
                    for (std::size_t i = 0; i < terms.size(); ++i) {
                        result += terms[i];
                        condition = std::max(condition, std::fabs(terms[i]));
                    }
                } else {
                    pMin /= std::exp(logScale);
                    LOG_TRACE(<< "pMin = " << pMin);
                    for (std::size_t i = 0; i < terms.size(); ++i) {
                        double remainder = std::fabs(terms[i]) * pMin / sum + terms[i];
                        result += remainder;
                        double absTerms[] = {std::fabs(terms[i]),
                                             std::fabs(terms[i] * pMin / sum),
                                             std::fabs(remainder)};
                        condition = std::max(
                            condition, *std::max_element(absTerms, absTerms + 3));
                    }
                }

                LOG_TRACE(<< "result = " << result << ", condition = " << condition);

                if (result <= 0.0 || condition > PRECISION * result) {
                    // Whoops we've lost all our precision. Fall back to numerical
                    // integration (note this caps M <= 10 so the runtime doesn't
                    // blow up).
                    LOG_TRACE(<< "Falling back to numerical integration");
                    CNumericalLogProbabilityOfMFromNExtremeSamples numerical(m_MinValues, N);
                    result = numerical.calculate();
                } else {
                    result = logScale + std::log(result);
                }
            }
        }
    } else {
        result = std::log(pMin);
    }

    // Numerical error means the probability can be slightly greater than one
    // on occasion. We use a tolerance which should be much larger than
    // necessary, but we are only interested in values well outside the range
    // as indicative of a genuine problem.
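    // We therefore make at most two attempts: if the analytic result fails the
    // check we log the inputs and recompute by numerical integration; the
    // second pass then validates the numerical result.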
    for (std::size_t i = 0; i < 2; ++i) {
        if (!(result < 0.001)) {
            std::ostringstream minValues;
            minValues << std::setprecision(16) << "[" << m_MinValues[0];
            for (std::size_t j = 1; j < m_MinValues.count(); ++j) {
                minValues << " " << m_MinValues[j];
            }
            minValues << "]";
            LOG_ERROR(<< "Invalid log(extreme probability) = " << result
                      << ", m_NumberSamples = " << m_NumberSamples
                      << ", m_MinValues = " << minValues.str() << ", coeffs = " << coeffs
                      << ", log(max{coeffs}) = " << logLargestCoeff
                      << ", pM = " << pM << ", pMin = " << pMin);
            result = 0.0;
        } else {
            break;
        }
        LOG_TRACE(<< "Falling back to numerical integration");
        CNumericalLogProbabilityOfMFromNExtremeSamples numerical(m_MinValues, N);
        result = numerical.calculate();
    }
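
    // Clamp the result so it is a valid log-probability.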
    result = std::min(result, 0.0);
    LOG_TRACE(<< "result = " << result);

    return true;
}