bool CMultinomialConjugate::probabilityOfLessLikelySamples()

in lib/maths/common/CMultinomialConjugate.cc [833:1183]


bool CMultinomialConjugate::probabilityOfLessLikelySamples(maths_t::EProbabilityCalculation calculation,
                                                           const TDouble1Vec& samples,
                                                           const TDoubleWeightsAry1Vec& weights,
                                                           double& lowerBound,
                                                           double& upperBound,
                                                           maths_t::ETail& tail) const {
    lowerBound = 0.0;
    upperBound = 1.0;
    tail = maths_t::E_UndeterminedTail;

    if (samples.empty()) {
        LOG_ERROR(<< "Can't compute distribution for empty sample set");
        return false;
    }

    // To compute the overall joint probability we use our approximate
    // aggregation scheme which interprets the probabilities as arising
    // from jointly independent Gaussians. Note that trying to compute
    // the exact aggregation requires us to calculate:
    //   P(R) = P({ y | f(y | p) <= f(x | p) })
    //
    // where,
    //   f(. | p) = n! * Product_i{ p(i)^n(i) / n(i)! }, i.e. the
    //   multinomial probability mass function, and
    //   n(i) is the count of the i'th category in the vector argument.
    //
    // This is computationally hard. (We might be able to make headway
    // on the relaxation of the problem for the case that the {n(i)} are
    // no longer integers, but must sum to one. In that case the values
    // live on a k-dimensional simplex. We would be interested in contours
    // of constant probability, which might be computable by the method
    // of Lagrange multipliers.)
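    //
    // For example (illustrative numbers only): with p = (0.2, 0.3, 0.5) and
    // counts n = (1, 0, 2),
    //   f(n | p) = 3! * (0.2^1 / 1!) * (0.3^0 / 0!) * (0.5^2 / 2!) = 0.15.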

    switch (calculation) {
    case maths_t::E_OneSidedBelow: {
        // See minusLogJointCdf for a discussion of the calculation
        // of the marginal c.d.f. for a single sample.
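        // detail::CCdf evaluates lower and upper bounds on the marginal c.d.f.
        // F(x) for each sample; these are aggregated, weighted by the sample
        // count, into joint lower and upper bounds on the one-sided probability.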

        CJointProbabilityOfLessLikelySamples jointLowerBound;
        CJointProbabilityOfLessLikelySamples jointUpperBound;

        detail::CCdf cdf(m_Categories, m_Concentrations, m_TotalConcentration);
        for (std::size_t i = 0; i < samples.size(); ++i) {
            if (CMathsFuncs::isNan(samples[i]) || CMathsFuncs::isNan(weights[i])) {
                continue;
            }
            double x = samples[i];
            double n = maths_t::count(weights[i]);
            double sampleLowerBound;
            double sampleUpperBound;
            cdf(x, sampleLowerBound, sampleUpperBound);
            jointLowerBound.add(sampleLowerBound, n);
            jointUpperBound.add(sampleUpperBound, n);
        }

        if (!jointLowerBound.calculate(lowerBound) || !jointUpperBound.calculate(upperBound)) {
            LOG_ERROR(<< "Unable to compute probability for " << samples << ": "
                      << jointLowerBound << ", " << jointUpperBound);
            return false;
        }
        tail = maths_t::E_LeftTail;
    } break;

    case maths_t::E_TwoSided: {
        // The probability of a less likely category is given by:
        //   E[ Sum_{p(j) <= p(i)}{ p(j) } ]
        //
        // where the expectation is taken over the prior for the distribution
        // probabilities. This is hard to compute because the terms in the sum
        // vary as the probabilities vary. We approximate this by taking the
        // expectation over the marginal for each p(i). In particular, we compute:
        //   P(i) = Sum_{E[p(j)] <= E[p(i)]}{ E[p(j)] }
        //
        // Here, P(i) is the probability of a less likely category and the
        // expected probability of the i'th category is:
        //   E[p(i)] = a(i) / Sum_j{ a(j) }                                (1)
        //
        // where the a(i) are the concentration parameters of the prior. So, using
        // (1), P(i) reduces to:
        //   P(i) = Sum_{j : a(j) <= a(i)}{ E[p(j)] }
        // (see the worked example at the end of this comment block).
        //
        // We can think of P(.) as a function of probability, i.e. if the
        // probability of a category were p then its corresponding P(.) would
        // be:
        //   P(argmin_{i : E[p(i)] >= p}{ E[p(i)] })
        //
        // Given this definition we can compute:
        //   E[ P(p) ]                                                     (2)
        //
        // where the expectation is taken over the marginal for each probability.
        // This can be computed exactly by noting that the marginal for p(i) is
        // beta distributed with alpha = a(i) and beta = Sum_j{ a(j) } - a(i).
        // However, this requires us to compute quantiles at every E[p(i)] which
        // would be expensive for a large number of categories. Instead we
        // approximate the probability by using a fixed number of samples from
        // the marginal and computing the probability by taking the mean of these.
        // Finally, note that if E[p(i)] and E[p(j)] are very close we want P(i)
        // and P(j) to be close, but they can in fact be very different if there
        // are many probabilities E[p(k)] which satisfy:
        //   E[p(i)] <= E[p(k)] <= E[p(j)].
        //
        // To avoid this problem we scale all probabilities by (1 + eps), for
        // small eps > 0, when computing (2).
        //
        // In the case that the number of categories has overflowed the number we
        // track, we derive a sharp lower bound by considering the case that all the additional
        // mass is in one category which is not in the sample set. In this case
        // we have three sets of categories:
        //   U = "Uncounted categories"
        //   L = {i : i is counted and P(i) < P(U)}
        //   M = {i : i is counted and P(i) >= P(U)}
        //
        // where, clearly P(U) is the extra mass. If X denotes the sample set
        // and N the total concentration then for this case:
        //   P(i in XU) >= 1 / N
        //   P(i in XL) >= P(i)
        //   P(i in XM) >= P(i) + P(U)
        //
        // If |X| = 1 then a sharp upper bound is easy to compute:
        //   P(i in XU) <= P(U) + P(L)
        //   P(i in X\U) <= P(i) + P(U)                                    (3)
        //
        // For the case that X contains a mixture of values that are and aren't
        // in U, a sharp upper bound is difficult to compute: it depends on
        // the number of different categories in XU, P(U) and the probabilities
        // P(i in L). In this case we just fall back to using (3) which isn't
        // sharp.
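        //
        // For example (illustrative numbers, assuming no overflowed categories):
        // if the concentrations are a = (2, 3, 5) then N = 10,
        // E[p] = (0.2, 0.3, 0.5) and the two-sided probabilities are
        // P = (0.2, 0.5, 1.0), i.e. P(i) accumulates the expected probabilities
        // of all categories which are no more likely than category i.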

        using TSizeVec = std::vector<std::size_t>;

        tail = maths_t::E_MixedOrNeitherTail;

        TDoubleDoubleSizeTrVec pCategories;
        pCategories.reserve(m_Categories.size());
        double pU = 0.0;
        double pmin = 1.0 / m_TotalConcentration;

        {
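            // Compute the expected probability E[p(i)] of each tracked category
            // and the residual mass P(U) = 1 - Sum_i{ E[p(i)] } belonging to any
            // categories which are not tracked.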
            double r = 1.0 / static_cast<double>(m_Concentrations.size());
            for (std::size_t i = 0; i < m_Concentrations.size(); ++i) {
                double p = m_Concentrations[i] / m_TotalConcentration;
                pCategories.emplace_back(p, p, i);
                pU += r - p;
            }
        }
        std::sort(pCategories.begin(), pCategories.end());
        LOG_TRACE(<< "p = " << pCategories);

        double pl = 0.0;
        {
            // Get the index of the first expected probability greater than
            // or equal to P(U).
            std::size_t l = pCategories.size();
            if (pU > 0.0) {
                l = std::lower_bound(pCategories.begin(), pCategories.end(),
                                     TDoubleDoubleSizeTr(pU, pU, 0)) -
                    pCategories.begin();
            }

            // Compute probabilities of less likely categories.
            double pCumulative = 0.0;
            for (std::size_t i = 0, j = 0; i < pCategories.size(); /**/) {
                // Find the probability equal range [i, j).
                double p = std::get<1>(pCategories[i]);
                pCumulative += p;
                while (++j < pCategories.size() && std::get<1>(pCategories[j]) == p) {
                    pCumulative += p;
                }

                // Update the equal range probabilities [i, j).
                for (/**/; i < j; ++i) {
                    std::get<1>(pCategories[i]) = pCumulative;
                }
            }

            if (l < pCategories.size()) {
                pl = std::get<1>(pCategories[l]);
            }
        }
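
        // For example (illustrative, with P(U) = 0): if E[p] = (0.2, 0.3, 0.5)
        // then pCategories now holds {(0.2, 0.2, .), (0.3, 0.5, .), (0.5, 1.0, .)},
        // i.e. the second element of each triple has become the cumulative
        // probability of no more likely categories, and pl remains 0.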

        std::size_t nSamples = detail::numberPriorSamples(m_TotalConcentration);
        LOG_TRACE(<< "n = " << nSamples);
        if (nSamples > 1) {
            // Extract the indices of the categories we want.
            TSizeVec categoryIndices;
            categoryIndices.reserve(samples.size());
            for (std::size_t i = 0; i < samples.size(); ++i) {
                std::size_t index = std::lower_bound(m_Categories.begin(),
                                                     m_Categories.end(), samples[i]) -
                                    m_Categories.begin();
                if (index < m_Categories.size() && m_Categories[index] == samples[i]) {
                    categoryIndices.push_back(index);
                }
            }
            std::sort(categoryIndices.begin(), categoryIndices.end());

            for (std::size_t i = 0; i < pCategories.size(); ++i) {
                // For all categories that we actually want, compute the
                // average probability over a set of independent samples
                // from the marginal prior for this category, which by the
                // law of large numbers converges to E[ P(p) ] w.r.t. the
                // marginal for p. The constants a and b are a(i) and
                // Sum_j( a(j) ) - a(i), respectively.

                std::size_t j = std::get<2>(pCategories[i]);
                if (std::binary_search(categoryIndices.begin(), categoryIndices.end(), j)) {
                    TDouble7Vec marginalSamples;
                    double a = m_Concentrations[j];
                    double b = m_TotalConcentration - m_Concentrations[j];
                    detail::generateBetaSamples(a, b, nSamples, marginalSamples);
                    LOG_TRACE(<< "E[p] = " << std::get<0>(pCategories[i])
                              << ", mean = " << CBasicStatistics::mean(marginalSamples)
                              << ", samples = " << marginalSamples);

                    TMeanAccumulator pAcc;
                    for (std::size_t k = 0; k < marginalSamples.size(); ++k) {
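                        // Scale the sampled probability by (1 + eps), eps = 0.05,
                        // as discussed above, then locate the bracketing expected
                        // probabilities and linearly interpolate P(.) between them.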
                        TDoubleDoubleSizeTr x(1.05 * marginalSamples[k], 0.0, 0);
                        std::ptrdiff_t r = std::min(
                            std::upper_bound(pCategories.begin(), pCategories.end(), x) -
                                pCategories.begin(),
                            static_cast<std::ptrdiff_t>(pCategories.size()) - 1);

                        double fl = r > 0 ? std::get<0>(pCategories[r - 1]) : 0.0;
                        double fr = std::get<0>(pCategories[r]);
                        double pl_ = r > 0 ? std::get<1>(pCategories[r - 1]) : 0.0;
                        double pr_ = std::get<1>(pCategories[r]);
                        double alpha = std::min(
                            (fr - fl == 0.0) ? 0.0 : (std::get<0>(x) - fl) / (fr - fl), 1.0);
                        double px = (1.0 - alpha) * pl_ + alpha * pr_;
                        LOG_TRACE(<< "E[p(l)] = " << fl << ", P(l) = " << pl_
                                  << ", E[p(r)] = " << fr << ", P(r) = " << pr_
                                  << ", alpha = " << alpha << ", p = " << px);

                        pAcc.add(px);
                    }
                    std::get<1>(pCategories[i]) = CBasicStatistics::mean(pAcc);
                }
            }
        }

        LOG_TRACE(<< "pCategories = " << pCategories);
        LOG_TRACE(<< "P(U) = " << pU << ", P(l) = " << pl);

        // Reorder the probabilities back into category index order in O(n)
        // by swapping each element into the slot given by its stored index
        // (an in-place cycle sort). Each iteration of the following loop
        // either advances i or moves at least one extra probability into
        // its correct position, so it necessarily terminates after O(n)
        // iterations.
        for (std::size_t i = 0; i < pCategories.size(); /**/) {
            if (i == std::get<2>(pCategories[i])) {
                ++i;
            } else {
                std::swap(pCategories[i], pCategories[std::get<2>(pCategories[i])]);
            }
        }
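
        // pCategories[i] now corresponds to m_Categories[i] again, so P(.) can
        // be looked up below by a category's position in m_Categories.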

        LOG_TRACE(<< "pCategories = " << pCategories);

        if (samples.size() == 1) {
            // No special aggregation is required if there is a single sample.
            std::size_t index = std::lower_bound(m_Categories.begin(),
                                                 m_Categories.end(), samples[0]) -
                                m_Categories.begin();

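            // Apply the bounds discussed above: for a tracked category the lower
            // bound is P(i), plus P(U) when P(i) >= P(U), and the upper bound is
            // P(i) + P(U); otherwise fall back to the lower bound 1 / N and the
            // upper bound max(P(U) + P(l), 1 / N).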
            if (index < m_Categories.size() && m_Categories[index] == samples[0]) {
                double p = std::get<1>(pCategories[index]);
                lowerBound = p + (p >= pU ? pU : 0.0);
                upperBound = p + pU;
            } else {
                lowerBound = pmin;
                upperBound = std::max(pU + pl, pmin);
            }

            return true;
        }

        TDoubleDoubleMap categoryCounts;

        // Count the occurrences of each category in the sample set.
        for (std::size_t i = 0; i < samples.size(); ++i) {
            if (CMathsFuncs::isNan(samples[i]) || CMathsFuncs::isNan(weights[i])) {
                continue;
            }
            double x = samples[i];
            double n = maths_t::count(weights[i]);
            categoryCounts[x] += n;
        }

        LOG_TRACE(<< "categoryCounts = " << categoryCounts);

        CJointProbabilityOfLessLikelySamples jointLowerBound;
        CJointProbabilityOfLessLikelySamples jointUpperBound;

        for (auto& categoryCount : categoryCounts) {
            double category = categoryCount.first;
            double count = categoryCount.second;
            LOG_TRACE(<< "category = " << category << ", count = " << count);

            std::size_t index = std::lower_bound(m_Categories.begin(),
                                                 m_Categories.end(), category) -
                                m_Categories.begin();

            if (index < m_Categories.size() && m_Categories[index] == category) {
                // Only read pCategories[index] once the index is known to be in range.
                double p = std::get<1>(pCategories[index]);
                jointLowerBound.add(p + (p >= pU ? pU : 0.0), count);
                jointUpperBound.add(p + pU, count);
            } else {
                jointLowerBound.add(pmin, count);
                jointUpperBound.add(std::max(pU + pl, pmin), count);
            }
        }

        if (!jointLowerBound.calculate(lowerBound) || !jointUpperBound.calculate(upperBound)) {
            LOG_ERROR(<< "Unable to compute probability for " << samples << ": "
                      << jointLowerBound << ", " << jointUpperBound);
            return false;
        }

        LOG_TRACE(<< "probability = [" << lowerBound << ", " << upperBound << "]");
    } break;

    case maths_t::E_OneSidedAbove: {
        // See minusLogJointCdf for a discussion of the calculation
        // of the marginal c.d.f. for a single sample.
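        // detail::CCdfComplement evaluates lower and upper bounds on the marginal
        // c.d.f. complement 1 - F(x) for each sample, which are aggregated into
        // joint bounds in the same way as for the one-sided below calculation.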

        CJointProbabilityOfLessLikelySamples jointLowerBound;
        CJointProbabilityOfLessLikelySamples jointUpperBound;

        detail::CCdfComplement cdfComplement(m_Categories, m_Concentrations, m_TotalConcentration);
        for (std::size_t i = 0; i < samples.size(); ++i) {
            if (CMathsFuncs::isNan(samples[i]) || CMathsFuncs::isNan(weights[i])) {
                continue;
            }
            double x = samples[i];
            double n = maths_t::count(weights[i]);
            double sampleLowerBound;
            double sampleUpperBound;
            cdfComplement(x, sampleLowerBound, sampleUpperBound);
            jointLowerBound.add(sampleLowerBound, n);
            jointUpperBound.add(sampleUpperBound, n);
        }

        if (!jointLowerBound.calculate(lowerBound) || !jointUpperBound.calculate(upperBound)) {
            LOG_ERROR(<< "Unable to compute probability for " << samples << ": "
                      << jointLowerBound << ", " << jointUpperBound);
            return false;
        }
        tail = maths_t::E_RightTail;
    } break;
    }

    return true;
}
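

For reference, here is a minimal, self-contained sketch of the core two-sided
quantity P(i) = Sum_{E[p(j)] <= E[p(i)]}{ E[p(j)] } for the simple case with no
overflowed categories and no marginal sampling. It is not library code: the
function and variable names are illustrative only, and it uses nothing beyond
the C++ standard library.

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Probability of a less likely category: sums the expected probabilities
// E[p(j)] = a(j) / Sum_k{ a(k) } of every category j which is no more
// likely than the queried category (ties included).
double probabilityOfLessLikelyCategories(const std::vector<double>& concentrations,
                                         std::size_t category) {
    double total = std::accumulate(concentrations.begin(), concentrations.end(), 0.0);
    double pi = concentrations[category] / total;
    double result = 0.0;
    for (double a : concentrations) {
        double pj = a / total;
        if (pj <= pi) {
            result += pj;
        }
    }
    return result;
}

int main() {
    // Concentrations (2, 3, 5): E[p] = (0.2, 0.3, 0.5), so the two-sided
    // probabilities are P = (0.2, 0.5, 1.0).
    std::vector<double> concentrations{2.0, 3.0, 5.0};
    for (std::size_t i = 0; i < concentrations.size(); ++i) {
        std::cout << "P(" << i << ") = "
                  << probabilityOfLessLikelyCategories(concentrations, i) << "\n";
    }
    return 0;
}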