bool CLogProbabilityOfMFromNExtremeSamples::calculate(double& result)

in lib/maths/common/ProbabilityAggregators.cc [723:945]


bool CLogProbabilityOfMFromNExtremeSamples::calculate(double& result) {
    result = 0.0;

    if (m_NumberSamples == 0) {
        return true;
    }

    // We can express the probability as the sum of two terms:
    //   1 - (1 - pM)^N +
    //   Sum_i( 2^M * N! * c(i, M) / (i+1) / (N-M)! * (1 - (1 - pM/2)^(i+1)) )
    //
    // It is possible to derive recurrence relations for the coefficients
    // c(i, M) as follows:
    //
    //   c(m+1)    = c(m) / 2 / (N - M + m)
    //   c(0, m+1) = -1 * ( c(m+1) * (1 - pm)^(N-M+m)
    //                     + Sum_i( c(i, m+1) * (1 - pm/2)^(i+1) ) )
    //   c(i, m+1) = c(i-1, m) / i
    //
    // where i > 0 in the last relation and c(0) = 1 and c(0, 0) = 0.
    //
    // The strategy in computing this quantity is to maintain the coefficients
    // {c(i, .)} in normalized form so that the largest is equal to 1 and to
    // keep track of a normalization factor, which is the log of the largest
    // coefficient, to avoid underflow. Note that since:
    //   c(m+1) = (N - M)!/(N - M + m + 1)!/2^m
    //
    // it can become very small! Tracking the scale separately in this way
    // means we always work at full precision.
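    //
    // For example, with N = 100 and M = 5 the loop below multiplies c by
    // 1 / (2 * 96), 1 / (2 * 97), 1 / (2 * 98) and 1 / (2 * 99) in turn,
    // so c is already ~7e-10 after four steps and shrinks factorially in M.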

    std::size_t M = m_MinValues.count();
    std::size_t N = m_NumberSamples;

    LOG_TRACE(<< "M = " << M << ", N = " << N);

    double logc = 0.0;

    double logLargestCoeff = 0.0;
    TDoubleVec coeffs;
    if (M > 1) {
        coeffs.reserve(M - 1);
    }

    m_MinValues.sort();
    for (std::size_t i = 0; i < M; ++i) {
        m_MinValues[i] =
            CTools::truncate(m_MinValues[i], CTools::smallestProbability(), 1.0);
    }

    for (std::size_t m = 1; m < M; ++m) {
        double p = m_MinValues[M - m];
        LOG_TRACE(<< "p(" << m << ") = " << p);

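        // Accumulate the normalization constant in log space, i.e. the
        // recurrence c(m+1) = c(m) / 2 / (N - M + m) from the comment above.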
        logc -= std::log(2.0 * static_cast<double>(N - M + m));

        // Update the coefficients (they are stored in reverse order).
        double sum = 0.0;
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            double index = static_cast<double>(coeffs.size() - i);
            coeffs[i] /= index;
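            // CTools::powOneMinusX(x, n) evaluates (1 - x)^n accurately even
            // for tiny x, where naive std::pow(1.0 - x, n) would lose all of
            // the precision in x; oneMinusPowOneMinusX is its complement.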
            sum += coeffs[i] * CTools::powOneMinusX(p / 2.0, index);
        }
        LOG_TRACE(<< "sum = " << sum);

        // Re-normalize the "c" term before adding. Note that, somewhat
        // surprisingly, the re-normalized "c", i.e. exp(logc - logLargestCoeff),
        // can be infinite, because small numbers lose precision before
        // underflowing. That means the following calculation can't use the
        // re-normalized "c" directly. Instead, we make use of the fact
        // that c * (1 - p)^(N - M + m) won't overflow.
        double q = CTools::truncate(
            CTools::powOneMinusX(p, static_cast<double>(N - M + m)), 0.0, 1.0);
        coeffs.push_back(-sum - q * std::exp(logc - logLargestCoeff));
        LOG_TRACE(<< "c(0) = " << coeffs.back());

        // Re-normalize the coefficients if they aren't all identically zero.
        double cmax = 0.0;
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            if (std::fabs(coeffs[i]) > 1.0 / std::numeric_limits<double>::max()) {
                cmax = std::max(cmax, std::fabs(coeffs[i]));
            }
        }
        if (cmax > 0.0) {
            LOG_TRACE(<< "cmax = " << cmax);
            for (std::size_t i = 0; i < coeffs.size(); ++i) {
                coeffs[i] /= cmax;
            }
            logLargestCoeff += std::log(cmax);
            LOG_TRACE(<< "logLargestCoeff = " << logLargestCoeff);
        }
    }

    // Re-normalize in the case that we haven't been able to in the loop
    // because of overflow.
    double cmax = 0.0;
    for (std::size_t i = 0; i < coeffs.size(); ++i) {
        cmax = std::max(cmax, std::fabs(coeffs[i]));
    }
    if (cmax > 0.0 && cmax < 1.0 / std::numeric_limits<double>::max()) {
        logLargestCoeff = std::log(cmax);
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            coeffs[i] /= cmax;
        }
    }
    LOG_TRACE(<< "coeffs = " << coeffs);

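    // The first term of the expansion at the top: pMin = 1 - (1 - pM)^N,
    // where pM is the smallest of the M extreme probabilities.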
    double pM = m_MinValues[0];
    LOG_TRACE(<< "p(" << M << ") = " << pM);

    double pMin = CTools::oneMinusPowOneMinusX(pM, static_cast<double>(N));
    LOG_TRACE(<< "1 - (1 - p(" << M << "))^" << N << " = " << pMin);

    if (M > 1) {
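        // The log of the common prefactor 2^M * N! / (N - M)! from the sum
        // in the expansion at the top, plus the coefficient normalization.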
        double logScale = static_cast<double>(M) * std::log(2.0) +
                          std::lgamma(static_cast<double>(N + 1)) -
                          std::lgamma(static_cast<double>(N - M + 1)) + logLargestCoeff;
        LOG_TRACE(<< "log(scale) = " << logScale);

        double sum = 0.0;
        double positive = 0.0;
        double negative = 0.0;
        TDoubleVec terms;
        terms.reserve(coeffs.size());
        for (std::size_t i = 0; i < coeffs.size(); ++i) {
            double index = static_cast<double>(coeffs.size() - i);
            double c = coeffs[i] / index;
            double p = CTools::oneMinusPowOneMinusX(pM / 2.0, index);
            LOG_TRACE(<< "term(" << index << ") = " << (c * p) << " (c("
                      << index << ") = " << c << ", 1 - (1 - p(M)/2)^" << index
                      << " = " << p << ")");
            terms.push_back(c * p);
            sum += std::fabs(c * p);
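            // The conditional expression yields an lvalue, so this adds the
            // magnitude of each term to the matching sign bucket.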
            (c * p < 0.0 ? negative : positive) += std::fabs(c * p);
        }
        LOG_TRACE(<< "negative = " << negative << ", positive = " << positive);

        if (sum == 0.0) {
            result = std::log(pMin);
        } else {
            // To minimize cancellation errors we add pMin inside the loop
            // and compute weights w(i) such that Sum_i( w(i) ) = 1.0 and
            // w(i) * pMin is roughly the same size as the i'th term.

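            // The sum is rejected if the largest intermediate term exceeds
            // it by more than this factor, i.e. if cancellation could have
            // inflated the relative error to around PRECISION * epsilon.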
            static const double PRECISION = 1e6;

            result = 0.0;
            double condition = 0.0;
            double logPMin = std::log(pMin);
            if (logPMin - logScale > core::constants::LOG_MAX_DOUBLE) {
                for (std::size_t i = 0; i < terms.size(); ++i) {
                    LOG_TRACE(<< "remainder(" << i << ") = " << std::fabs(terms[i]));
                    result += std::fabs(terms[i]);
                }
                result = std::log(result * pMin / sum);
            } else {
                if (logPMin - logScale < core::constants::LOG_MIN_DOUBLE) {
                    pMin = 0.0;
                    for (std::size_t i = 0; i < terms.size(); ++i) {
                        result += terms[i];
                        condition = std::max(condition, std::fabs(terms[i]));
                    }
                } else {
                    pMin /= std::exp(logScale);
                    LOG_TRACE(<< "pMin = " << pMin);
                    for (std::size_t i = 0; i < terms.size(); ++i) {
                        double remainder = std::fabs(terms[i]) * pMin / sum + terms[i];
                        result += remainder;
                        double absTerms[] = {std::fabs(terms[i]),
                                             std::fabs(terms[i] * pMin / sum),
                                             std::fabs(remainder)};
                        condition = std::max(
                            condition, *std::max_element(absTerms, absTerms + 3));
                    }
                }

                LOG_TRACE(<< "result = " << result << ", condition = " << condition);

                if (result <= 0.0 || condition > PRECISION * result) {
                    // Whoops, we've lost all our precision. Fall back to
                    // numerical integration (note this caps M <= 10 so the
                    // runtime doesn't blow up).
                    LOG_TRACE(<< "Falling back to numerical integration");
                    CNumericalLogProbabilityOfMFromNExtremeSamples numerical(m_MinValues, N);
                    result = numerical.calculate();
                } else {
                    result = logScale + std::log(result);
                }
            }
        }
    } else {
        result = std::log(pMin);
    }

    // Numerical error means the probability can occasionally be slightly
    // greater than one. We use a tolerance which should be much larger
    // than necessary, since we are only interested in values well outside
    // the range as indicative of a genuine problem.
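    // Validate the result; on failure fall back to numerical integration
    // and validate that in turn.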
    for (std::size_t i = 0; i < 2; ++i) {
        if (!(result < 0.001)) {
            std::ostringstream minValues;
            minValues << std::setprecision(16) << "[" << m_MinValues[0];
            for (std::size_t j = 1; j < m_MinValues.count(); ++j) {
                minValues << " " << m_MinValues[j];
            }
            minValues << "]";
            LOG_ERROR(<< "Invalid log(extreme probability) = " << result
                      << ", m_NumberSamples = " << m_NumberSamples
                      << ", m_MinValues = " << minValues.str() << ", coeffs = " << coeffs
                      << ", log(max{coeffs}) = " << logLargestCoeff
                      << ", pM = " << pM << ", pMin = " << pMin);
            result = 0.0;
        } else {
            break;
        }
        LOG_TRACE(<< "Falling back to numerical integration");
        CNumericalLogProbabilityOfMFromNExtremeSamples numerical(m_MinValues, N);
        result = numerical.calculate();
    }

    result = std::min(result, 0.0);
    LOG_TRACE(<< "result = " << result);

    return true;
}
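
A minimal usage sketch for context, assuming the interface declared in
ProbabilityAggregators.h (a constructor taking the number of extreme samples
M to track, add() to feed per-sample probabilities and calculate() for the
result) and the ml::maths::common namespace implied by the file path:

#include <maths/common/ProbabilityAggregators.h>

#include <cmath>
#include <iostream>

int main() {
    // Track the M = 3 most extreme (smallest) of the probabilities added.
    ml::maths::common::CLogProbabilityOfMFromNExtremeSamples probability{3};

    // N = 6 per-sample probabilities; the three smallest drive the result.
    for (double p : {0.5, 0.001, 0.3, 0.02, 0.9, 0.0004}) {
        probability.add(p);
    }

    double logP{0.0};
    if (probability.calculate(logP)) {
        // logP <= 0.0 by construction; exponentiate for the probability.
        std::cout << "log(P) = " << logP << ", P = " << std::exp(logP) << "\n";
    }
}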
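
The renormalization step above is a reusable trick: keep the working
coefficients scaled so that the largest magnitude is one, and carry the
scale itself in log space so that factorially small constants never
underflow. A self-contained sketch of just that trick (the renormalize
helper is hypothetical, not part of CTools):

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Rescale coeffs so that max |coeffs[i]| == 1 and fold the scale into
// logScale. Values which would underflow as raw doubles stay representable
// as a (scaled coefficient, log scale) pair.
void renormalize(std::vector<double>& coeffs, double& logScale) {
    double cmax = 0.0;
    for (double c : coeffs) {
        // Ignore coefficients whose reciprocal would overflow; compare the
        // guard on 1 / std::numeric_limits<double>::max() in calculate().
        if (std::fabs(c) > 1.0 / std::numeric_limits<double>::max()) {
            cmax = std::max(cmax, std::fabs(c));
        }
    }
    if (cmax > 0.0) {
        for (double& c : coeffs) {
            c /= cmax;
        }
        logScale += std::log(cmax);
    }
}

Any sum formed from the scaled coefficients is shifted back by logScale at
the end, which is exactly how calculate() uses logLargestCoeff and logScale.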