lib/maths/common/CSampling.cc (703 lines of code) (raw):

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#include <maths/common/CSampling.h>

#include <core/CLogger.h>
#include <core/CScopedFastLock.h>
#include <core/CStatePersistInserter.h>
#include <core/CStateRestoreTraverser.h>

#include <maths/common/CLinearAlgebraEigen.h>
#include <maths/common/COrderings.h>
#include <maths/common/COrderingsSimultaneousSort.h>
#include <maths/common/CTools.h>

#include <boost/iterator/counting_iterator.hpp>
#include <boost/math/distributions/gamma.hpp>
#include <boost/math/distributions/normal.hpp>
#include <boost/random/binomial_distribution.hpp>
#include <boost/random/chi_squared_distribution.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/normal_distribution.hpp>
#include <boost/random/poisson_distribution.hpp>
#include <boost/random/sobol.hpp>
#include <boost/random/uniform_01.hpp>
#include <boost/random/uniform_real_distribution.hpp>
#include <boost/random/variate_generator.hpp>

#include <algorithm>
#include <cmath>
#include <numeric>
#include <sstream>
#include <utility>
#include <vector>

// NOTE(review): std::optional and boost::random::uniform_int_distribution are
// used below without a direct #include here — presumably pulled in transitively
// (e.g. via CSampling.h); confirm before reorganising includes.

namespace ml {
namespace maths {
namespace common {
namespace {

using TDoubleVec = std::vector<double>;
using TDoubleVecVec = std::vector<TDoubleVec>;
using TSizeVec = std::vector<std::size_t>;

//! \brief A mockable random number generator which uses boost::random::mt11213b.
//!
//! Satisfies the UniformRandomBitGenerator concept so it can be plugged into
//! the boost distribution objects used throughout this file. When mocked it
//! returns a fixed mid-range value on every call, which makes the sampling
//! functions deterministic for tests.
class CRandomNumberGenerator {
public:
    using result_type = boost::random::mt11213b::result_type;

public:
    //! Mock the random number generator to produce a constant
    //! (the midpoint of the generator's range).
    void mock() { m_Mock.emplace((min() + max()) / 2); }

    //! Unmock the random number generator, i.e. resume producing
    //! values from the underlying mersenne twister.
    void unmock() { m_Mock.reset(); }

    //! Seed the random number generator (with the engine's default seed).
    void seed() { m_Rng.seed(); }

    //! Returns the smallest value that the generator can produce.
    static result_type min() { return boost::random::mt11213b::min(); }

    //! Returns the largest value that the generator can produce.
    static result_type max() { return boost::random::mt11213b::max(); }

    //! Produces the next value of the generator: the mocked constant
    //! if mocking is active, otherwise the next engine output.
    result_type operator()() {
        if (m_Mock) {
            return *m_Mock;
        }
        return m_Rng.operator()();
    }

    //! Writes the mersenne_twister_engine to a std::ostream.
    //! Note: only the engine state is persisted, not the mock value.
    template<class CHAR, class TRAITS>
    friend std::basic_ostream<CHAR, TRAITS>&
    operator<<(std::basic_ostream<CHAR, TRAITS>& o, const CRandomNumberGenerator& g) {
        return o << g.m_Rng;
    }

    //! Reads a mersenne_twister_engine from a std::istream.
    template<class CHAR, class TRAITS>
    friend std::basic_istream<CHAR, TRAITS>&
    operator>>(std::basic_istream<CHAR, TRAITS>& i, CRandomNumberGenerator& g) {
        return i >> g.m_Rng;
    }

private:
    using TOptionalResultType = std::optional<result_type>;

private:
    //! When set, every call to operator() returns this value.
    TOptionalResultType m_Mock;
    //! The underlying pseudo-random engine.
    boost::random::mt11213b m_Rng;
};

//! The mutex for protecting access to the default random number generator.
core::CFastMutex defaultRngMutex;

//! The default uniform random number generator. Shared process-wide; all
//! accesses must hold defaultRngMutex.
CRandomNumberGenerator defaultRng;

//! Defines the appropriate integer random number generator.
//! boost's uniform_int_distribution samples the closed range [a, b], so the
//! max is b - 1 to give the half-open range [a, b) expected by callers.
template<typename INTEGER>
struct SRng {
    using Type = boost::random::uniform_int_distribution<INTEGER>;
    static INTEGER min(INTEGER a) { return a; }
    static INTEGER max(INTEGER b) { return b - 1; }
};

//! Specialization for a real uniform random number generator.
//! uniform_real_distribution already samples [a, b) so no adjustment is needed.
template<>
struct SRng<double> {
    using Type = boost::random::uniform_real_distribution<double>;
    static double min(double a) { return a; }
    static double max(double b) { return b; }
};

//! Implementation of uniform sampling.
//! Draw one sample uniformly from [\p a, \p b). Degenerate ranges
//! (a >= b) return \p a.
template<typename RNG, typename TYPE>
TYPE doUniformSample(RNG& rng, TYPE a, TYPE b) {
    if (a >= b) {
        return a;
    }
    typename SRng<TYPE>::Type uniform(SRng<TYPE>::min(a), SRng<TYPE>::max(b));
    return uniform(rng);
}

//! Implementation of uniform sampling.
//!
//! Fills \p result with \p n independent samples uniform on [\p a, \p b);
//! for a degenerate range all samples are \p a.
template<typename RNG, typename TYPE>
void doUniformSample(RNG& rng, TYPE a, TYPE b, std::size_t n, std::vector<TYPE>& result) {
    result.clear();
    if (a >= b) {
        result.resize(n, a);
        return;
    }
    result.reserve(n);
    typename SRng<TYPE>::Type uniform(SRng<TYPE>::min(a), SRng<TYPE>::max(b));
    for (std::size_t i = 0; i < n; ++i) {
        result.push_back(uniform(rng));
    }
}

//! Implementation of normal sampling.
//!
//! Returns one N(\p mean, \p variance) sample; a negative variance is
//! logged as an error and \p mean is returned.
//! NOTE(review): unlike the vector overload below, variance == 0.0 is not
//! special-cased here and is passed straight to the boost distribution —
//! presumably benign, but confirm the two overloads are meant to differ.
template<typename RNG>
double doNormalSample(RNG& rng, double mean, double variance) {
    if (variance < 0.0) {
        LOG_ERROR(<< "Invalid variance " << variance);
        return mean;
    }
    boost::random::normal_distribution<double> normal(mean, std::sqrt(variance));
    return normal(rng);
}

//! Implementation of normal sampling.
//!
//! Fills \p result with \p n independent N(\p mean, \p variance) samples.
//! A negative variance is an error (result left empty); zero variance
//! yields n copies of the mean.
template<typename RNG>
void doNormalSample(RNG& rng, double mean, double variance, std::size_t n, TDoubleVec& result) {
    result.clear();
    if (variance < 0.0) {
        LOG_ERROR(<< "Invalid variance " << variance);
        return;
    }
    if (variance == 0.0) {
        result.resize(n, mean);
        return;
    }
    result.reserve(n);
    boost::random::normal_distribution<double> normal(mean, std::sqrt(variance));
    for (std::size_t i = 0; i < n; ++i) {
        result.push_back(normal(rng));
    }
}

//! Implementation of Poisson sampling.
//!
//! Fills \p result with \p n independent Poisson(\p rate) samples. A
//! non-positive rate is an error and leaves \p result empty.
template<typename RNG>
void doPoissonSample(RNG& rng, double rate, std::size_t n, TSizeVec& result) {
    result.clear();
    if (rate <= 0.0) {
        LOG_ERROR(<< "Invalid rate " << rate);
        return;
    }
    result.reserve(n);
    boost::random::poisson_distribution<std::size_t> poisson{rate};
    for (std::size_t i = 0; i < n; ++i) {
        result.push_back(poisson(rng));
    }
}

//! Implementation of chi^2 sampling.
//! Fills \p result with \p n independent chi-squared samples with \p f
//! degrees of freedom. Non-positive \p f is an error (result left empty).
template<typename RNG>
void doChiSquaredSample(RNG& rng, double f, std::size_t n, TDoubleVec& result) {
    result.clear();
    if (f <= 0.0) {
        LOG_ERROR(<< "Invalid degrees freedom " << f);
        return;
    }
    result.reserve(n);
    boost::random::chi_squared_distribution<double> chi2(f);
    for (std::size_t i = 0; i < n; ++i) {
        result.push_back(chi2(rng));
    }
}

//! Implementation of categorical sampling.
//!
//! Returns the index of one category sampled with probability proportional
//! to its weight. WARNING: \p weights is overwritten with its cumulative
//! sums (the callers' documented contract).
//! NOTE(review): weights[m - 1] is read unconditionally, so this presumes
//! \p weights is non-empty — verify against callers.
template<typename RNG>
std::size_t doCategoricalSample(RNG& rng, TDoubleVec& weights) {
    // We use inverse transform sampling to generate the categorical
    // samples from a random samples on [0,w] with w the sum of weights.

    std::size_t m{weights.size()};

    // Construct the transform function.
    std::partial_sum(weights.begin(), weights.end(), weights.begin());

    if (weights[m - 1] == 0.0) {
        // All weights are zero: fall back to uniform over the categories.
        return doUniformSample(rng, static_cast<std::size_t>(0), m);
    }
    return std::min(static_cast<std::size_t>(
                        std::lower_bound(weights.begin(), weights.end(),
                                         doUniformSample(rng, 0.0, weights[m - 1])) -
                        weights.begin()),
                    m - 1);
}

//! Implementation of categorical sampling with replacement.
//!
//! Fills \p result with \p n category indices drawn independently with
//! probability proportional to the corresponding weight. As above,
//! \p weights is overwritten with its cumulative sums.
template<typename RNG>
void doCategoricalSampleWithReplacement(RNG& rng, TDoubleVec& weights, std::size_t n, TSizeVec& result) {
    // We use inverse transform sampling to generate the categorical
    // samples from random samples on [0,w] with w the sum of weights.

    result.clear();
    if (n == 0) {
        return;
    }

    std::size_t m{weights.size()};

    // Construct the cumulative density function.
    std::partial_sum(weights.begin(), weights.end(), weights.begin());

    if (weights[m - 1] == 0.0) {
        // All weights are zero: sample the categories uniformly.
        doUniformSample(rng, static_cast<std::size_t>(0), m, n, result);
    } else {
        result.reserve(n);
        boost::random::uniform_real_distribution<> uniform(0.0, weights[m - 1]);
        for (std::size_t i = 0; i < n; ++i) {
            result.push_back(std::min(
                static_cast<std::size_t>(
                    std::lower_bound(weights.begin(), weights.end(), uniform(rng)) -
                    weights.begin()),
                m - 1));
        }
    }
}

//! Implementation of categorical sampling without replacement.
//! Fills \p result with \p n distinct category indices sampled with
//! probability proportional to their weights. WARNING: \p weights is used
//! as scratch space and is modified.
template<typename RNG>
void doCategoricalSampleWithoutReplacement(RNG& rng, TDoubleVec& weights, std::size_t n, TSizeVec& result) {

    // We switch between two strategies:
    //   1. For n < eps |weights| we perform categorical sampling and discard
    //      duplicates. This approach has better constants but complexity
    //      O(|weights| + n log(|weights|))
    //   2. Otherwise, we notionally reservoir sample n from a stream of size
    //      |weights| and use the Efraimidis and Spirakis trick. This approach
    //      has complexity O(|weights|).

    using TBoolVec = std::vector<bool>;

    result.clear();
    if (n == 0) {
        return;
    }

    if (5 * n < weights.size()) {
        // Strategy 1: rejection sample duplicates, allowing at most 1.5 n
        // draws before abandoning this strategy.
        result.resize(n);
        std::partial_sum(weights.begin(), weights.end(), weights.begin());
        TBoolVec mask(weights.size(), false);
        for (std::size_t i = 0, m = 0; 2 * i < 3 * n; ++i) {
            // Note that weights contains cumulative sums up to the last item,
            // which is proportional to the cumulative density function, so to
            // perform inverse transform sampling we generate s ~ U[0,w] with
            // w the sum of all weights and read off the first item x with
            // s <= F(x).
            auto j = std::min(std::lower_bound(weights.begin(), weights.end(),
                                               doUniformSample(rng, 0.0, weights.back())) -
                                  weights.begin(),
                              static_cast<std::ptrdiff_t>(weights.size()) - 1);
            if (mask[j] == false) {
                mask[j] = true;
                result[m++] = j;
                if (m == n) {
                    return;
                }
            }
        }

        // This should happen rarely so the amortised cost is negligible.
        // Undo the cumulative sums so the weights can be reused below.
        std::adjacent_difference(weights.begin(), weights.end(), weights.begin());
    }

    // Strategy 2: Efraimidis and Spirakis. Any partial results from
    // strategy 1 are discarded here.
    result.resize(weights.size());
    std::iota(result.begin(), result.end(), 0);

    // We won't sample any values assigned zero weight.
    result.erase(std::remove_if(result.begin(), result.end(),
                                [&](auto i) { return weights[i] == 0.0; }),
                 result.end());
    if (n >= result.size()) {
        return;
    }

    // Use the inverse transformation method to generate exponential samples.
    // The n items with the smallest keys -log(u) / w are a weighted sample
    // without replacement.
    for (auto i : result) {
        weights[i] = -CTools::fastLog(doUniformSample(rng, 0.0, 1.0)) / weights[i];
    }
    std::nth_element(
        result.begin(), result.begin() + n, result.end(),
        [&](auto lhs, auto rhs) { return weights[lhs] < weights[rhs]; });
    result.resize(n);
}

//! Implementation of multivariate normal sampling.
//!
//! Fills \p samples with \p n samples of the multivariate normal with the
//! given \p mean and \p covariance. Returns false if the inputs are
//! inconsistent.
template<typename RNG>
bool doMultivariateNormalSample(RNG& rng,
                                const TDoubleVec& mean,
                                const TDoubleVecVec& covariance,
                                std::size_t n,
                                TDoubleVecVec& samples) {
    if (mean.size() != covariance.size()) {
        LOG_ERROR(<< "Incompatible mean and covariance: " << mean << ", " << covariance);
        return false;
    }

    samples.clear();
    if (n == 0) {
        return true;
    }

    // This computes the singular value decomposition of the covariance
    // matrix (note since the covariance matrix is symmetric only its
    // upper triangle is passed), i.e. C = U * S * V'.
    //
    // The singular values S should be rank full and U should equal V
    // for positive definite matrix. Therefore, using the fact that
    // linear combinations of normal random variables are normal we can
    // construct each sample using independent samples
    //   X = { X_i = N(0, [S]_ii) }
    //
    // and transforming as Y = m + U * X. Note that E[Y] is clearly m
    // and its covariance matrix is
    //   E[(Y - E[Y]) * (Y - E[Y])'] = U * E[X * X'] * U' = U * S * U'
    //
    // as required.

    std::size_t d = mean.size();
    LOG_TRACE(<< "Dimension = " << d);
    LOG_TRACE(<< "mean = " << mean);

    // NOTE(review): the comment above says only the upper triangle is
    // passed, yet the loop below reads covariance[i][j] for j < i (lower
    // triangle) while the size check expects rows of length >= d - i —
    // confirm the expected storage layout against the callers.
    CDenseMatrix<double> C(d, d);
    for (std::size_t i = 0; i < d; ++i) {
        C(i, i) = covariance[i][i];
        if (covariance[i].size() < d - i) {
            LOG_ERROR(<< "Bad covariance matrix: " << covariance);
            return false;
        }
        for (std::size_t j = 0; j < i; ++j) {
            C(i, j) = covariance[i][j];
            C(j, i) = covariance[i][j];
        }
    }
    LOG_TRACE(<< "C =" << core_t::LINE_ENDING << ' ' << C);

    auto svd = C.jacobiSvd(Eigen::ComputeThinU | Eigen::ComputeThinV);

    // Get the singular values, these are the variances of the normals
    // to sample.
    const auto& S = svd.singularValues();
    const auto& U = svd.matrixU();
    TDoubleVec stddevs;
    stddevs.reserve(d);
    for (std::size_t i = 0; i < d; ++i) {
        // Clamp tiny negative values arising from numerical error.
        stddevs.push_back(std::sqrt(std::max(S(i), 0.0)));
    }

    LOG_TRACE(<< "Singular values of C = " << S.transpose());
    LOG_TRACE(<< "stddevs = " << stddevs);
    LOG_TRACE(<< "U =" << core_t::LINE_ENDING << ' ' << U);
    LOG_TRACE(<< "V =" << core_t::LINE_ENDING << ' ' << svd.matrixV());

    {
        samples.resize(n, mean);
        CDenseVector<double> sample(d);
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = 0; j < d; ++j) {
                if (stddevs[j] == 0.0) {
                    sample(j) = 0.0;
                } else {
                    boost::random::normal_distribution<> normal(0.0, stddevs[j]);
                    sample(j) = normal(rng);
                }
            }
            // Y = m + U * X.
            sample = U * sample;
            for (std::size_t j = 0; j < d; ++j) {
                samples[i][j] += sample(j);
            }
        }
    }

    return true;
}

//! Implementation of multivariate normal sampling.
//!
//! Fixed-dimension overload of the above for stack vector/matrix types.
template<typename RNG, typename T, std::size_t N>
void doMultivariateNormalSample(RNG& rng,
                                const CVectorNx1<T, N>& mean,
                                const CSymmetricMatrixNxN<T, N>& covariance,
                                std::size_t n,
                                std::vector<CVectorNx1<T, N>>& samples) {
    using TDenseVector = typename SDenseVector<CVectorNx1<T, N>>::Type;

    samples.clear();
    if (n == 0) {
        return;
    }

    // See the other implementation for an explanation.

    auto svd = toDenseMatrix(covariance).jacobiSvd(Eigen::ComputeFullU | Eigen::ComputeFullV);

    // Get the singular values, these are the variances of the normals
    // to sample.
    const auto& S = svd.singularValues();
    const auto& U = svd.matrixU();
    T stddevs[N] = {};
    for (std::size_t i = 0; i < N; ++i) {
        // Clamp tiny negative values arising from numerical error.
        stddevs[i] = std::sqrt(std::max(S(i), 0.0));
    }

    samples.resize(n, mean);
    TDenseVector sample(N);
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = 0; j < N; ++j) {
            if (stddevs[j] == 0.0) {
                sample(j) = 0.0;
            } else {
                boost::random::normal_distribution<> normal(0.0, stddevs[j]);
                sample(j) = normal(rng);
            }
        }
        // Y = m + U * X.
        sample = U * sample;
        samples[i] += fromDenseVector(sample);
    }
}

//! Implementation of distribution quantile sampling.
template<typename DISTRIBUTION> void sampleQuantiles(const DISTRIBUTION& distribution, std::size_t n, TDoubleVec& result) { CTools::SIntervalExpectation expectation; double dq = 1.0 / static_cast<double>(n); double a = std::numeric_limits<double>::lowest(); for (std::size_t i = 1; i < n; ++i) { double q = static_cast<double>(i) * dq; double b = boost::math::quantile(distribution, q); result.push_back(expectation(distribution, a, b)); a = b; } double b = std::numeric_limits<double>::max(); result.push_back(expectation(distribution, a, b)); } const std::string RNG_TAG("a"); } bool CSampling::staticsAcceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { // Note we require that we only ever do one persistence per process. do { const std::string& name = traverser.name(); if (name == RNG_TAG) { std::string value(traverser.value()); // See acceptPersistInserter std::replace(value.begin(), value.end(), '_', ' '); std::istringstream ss(value); core::CScopedFastLock scopedLock{defaultRngMutex}; ss >> defaultRng; } } while (traverser.next()); return true; } void CSampling::staticsAcceptPersistInserter(core::CStatePersistInserter& inserter) { // Note we require that we only ever do one persistence per process. std::ostringstream ss; { core::CScopedFastLock scopedLock{defaultRngMutex}; ss << defaultRng; } std::string rng(ss.str()); // These are space separated integers. We replace spaces or else // the JSON parser gets confused. 
std::replace(rng.begin(), rng.end(), ' ', '_'); inserter.insertValue(RNG_TAG, rng); } void CSampling::seed() { core::CScopedFastLock scopedLock{defaultRngMutex}; defaultRng.seed(); } #define UNIFORM_SAMPLE(TYPE) \ TYPE CSampling::uniformSample(TYPE a, TYPE b) { \ core::CScopedFastLock scopedLock{defaultRngMutex}; \ return doUniformSample(defaultRng, a, b); \ } \ TYPE CSampling::uniformSample(CPRNG::CXorOShiro128Plus& rng, TYPE a, TYPE b) { \ return doUniformSample(rng, a, b); \ } \ TYPE CSampling::uniformSample(CPRNG::CXorShift1024Mult& rng, TYPE a, TYPE b) { \ return doUniformSample(rng, a, b); \ } \ void CSampling::uniformSample(TYPE a, TYPE b, std::size_t n, std::vector<TYPE>& result) { \ core::CScopedFastLock scopedLock{defaultRngMutex}; \ doUniformSample(defaultRng, a, b, n, result); \ } \ void CSampling::uniformSample(CPRNG::CXorOShiro128Plus& rng, TYPE a, TYPE b, \ std::size_t n, std::vector<TYPE>& result) { \ doUniformSample(rng, a, b, n, result); \ } \ void CSampling::uniformSample(CPRNG::CXorShift1024Mult& rng, TYPE a, TYPE b, \ std::size_t n, std::vector<TYPE>& result) { \ doUniformSample(rng, a, b, n, result); \ } UNIFORM_SAMPLE(std::size_t) UNIFORM_SAMPLE(std::ptrdiff_t) UNIFORM_SAMPLE(double) #undef UNIFORM_SAMPLE double CSampling::normalSample(double mean, double variance) { core::CScopedFastLock scopedLock{defaultRngMutex}; return doNormalSample(defaultRng, mean, variance); } double CSampling::normalSample(CPRNG::CXorOShiro128Plus& rng, double mean, double variance) { return doNormalSample(rng, mean, variance); } double CSampling::normalSample(CPRNG::CXorShift1024Mult& rng, double mean, double variance) { return doNormalSample(rng, mean, variance); } void CSampling::normalSample(double mean, double variance, std::size_t n, TDoubleVec& result) { core::CScopedFastLock scopedLock{defaultRngMutex}; doNormalSample(defaultRng, mean, variance, n, result); } void CSampling::normalSample(CPRNG::CXorOShiro128Plus& rng, double mean, double variance, 
std::size_t n, TDoubleVec& result) { doNormalSample(rng, mean, variance, n, result); } void CSampling::normalSample(CPRNG::CXorShift1024Mult& rng, double mean, double variance, std::size_t n, TDoubleVec& result) { doNormalSample(rng, mean, variance, n, result); } void CSampling::poissonSample(double rate, std::size_t n, TSizeVec& result) { core::CScopedFastLock scopedLock{defaultRngMutex}; doPoissonSample(defaultRng, rate, n, result); } void CSampling::poissonSample(CPRNG::CXorOShiro128Plus& rng, double rate, std::size_t n, TSizeVec& result) { doPoissonSample(rng, rate, n, result); } void CSampling::poissonSample(CPRNG::CXorShift1024Mult& rng, double rate, std::size_t n, TSizeVec& result) { doPoissonSample(rng, rate, n, result); } void CSampling::chiSquaredSample(double f, std::size_t n, TDoubleVec& result) { core::CScopedFastLock scopedLock{defaultRngMutex}; doChiSquaredSample(defaultRng, f, n, result); } void CSampling::chiSquaredSample(CPRNG::CXorOShiro128Plus& rng, double f, std::size_t n, TDoubleVec& result) { doChiSquaredSample(rng, f, n, result); } void CSampling::chiSquaredSample(CPRNG::CXorShift1024Mult& rng, double f, std::size_t n, TDoubleVec& result) { doChiSquaredSample(rng, f, n, result); } bool CSampling::multivariateNormalSample(const TDoubleVec& mean, const TDoubleVecVec& covariance, std::size_t n, TDoubleVecVec& samples) { core::CScopedFastLock scopedLock{defaultRngMutex}; return doMultivariateNormalSample(defaultRng, mean, covariance, n, samples); } bool CSampling::multivariateNormalSample(CPRNG::CXorOShiro128Plus& rng, const TDoubleVec& mean, const TDoubleVecVec& covariance, std::size_t n, TDoubleVecVec& samples) { return doMultivariateNormalSample(rng, mean, covariance, n, samples); } bool CSampling::multivariateNormalSample(CPRNG::CXorShift1024Mult& rng, const TDoubleVec& mean, const TDoubleVecVec& covariance, std::size_t n, TDoubleVecVec& samples) { return doMultivariateNormalSample(rng, mean, covariance, n, samples); } #define 
MULTIVARIATE_NORMAL_SAMPLE(N) \ void CSampling::multivariateNormalSample( \ const CVectorNx1<double, N>& mean, const CSymmetricMatrixNxN<double, N>& covariance, \ std::size_t n, std::vector<CVectorNx1<double, N>>& samples) { \ core::CScopedFastLock scopedLock{defaultRngMutex}; \ doMultivariateNormalSample(defaultRng, mean, covariance, n, samples); \ } \ void CSampling::multivariateNormalSample( \ CPRNG::CXorOShiro128Plus& rng, const CVectorNx1<double, N>& mean, \ const CSymmetricMatrixNxN<double, N>& covariance, std::size_t n, \ std::vector<CVectorNx1<double, N>>& samples) { \ doMultivariateNormalSample(rng, mean, covariance, n, samples); \ } \ void CSampling::multivariateNormalSample( \ CPRNG::CXorShift1024Mult& rng, const CVectorNx1<double, N>& mean, \ const CSymmetricMatrixNxN<double, N>& covariance, std::size_t n, \ std::vector<CVectorNx1<double, N>>& samples) { \ doMultivariateNormalSample(rng, mean, covariance, n, samples); \ } MULTIVARIATE_NORMAL_SAMPLE(2) MULTIVARIATE_NORMAL_SAMPLE(3) MULTIVARIATE_NORMAL_SAMPLE(4) MULTIVARIATE_NORMAL_SAMPLE(5) #undef MULTIVARIATE_NORMAL_SAMPLE std::size_t CSampling::categoricalSample(TDoubleVec& probabilities) { core::CScopedFastLock scopedLock{defaultRngMutex}; return doCategoricalSample(defaultRng, probabilities); } std::size_t CSampling::categoricalSample(CPRNG::CXorOShiro128Plus& rng, TDoubleVec& probabilities) { return doCategoricalSample(rng, probabilities); } std::size_t CSampling::categoricalSample(CPRNG::CXorShift1024Mult& rng, TDoubleVec& probabilities) { return doCategoricalSample(rng, probabilities); } void CSampling::categoricalSampleWithReplacement(TDoubleVec& probabilities, std::size_t n, TSizeVec& result) { core::CScopedFastLock scopedLock{defaultRngMutex}; doCategoricalSampleWithReplacement(defaultRng, probabilities, n, result); } void CSampling::categoricalSampleWithReplacement(CPRNG::CXorOShiro128Plus& rng, TDoubleVec& probabilities, std::size_t n, TSizeVec& result) { 
doCategoricalSampleWithReplacement(rng, probabilities, n, result); } void CSampling::categoricalSampleWithReplacement(CPRNG::CXorShift1024Mult& rng, TDoubleVec& probabilities, std::size_t n, TSizeVec& result) { doCategoricalSampleWithReplacement(rng, probabilities, n, result); } void CSampling::categoricalSampleWithoutReplacement(TDoubleVec& probabilities, std::size_t n, TSizeVec& result) { core::CScopedFastLock scopedLock{defaultRngMutex}; doCategoricalSampleWithoutReplacement(defaultRng, probabilities, n, result); } void CSampling::categoricalSampleWithoutReplacement(CPRNG::CXorOShiro128Plus& rng, TDoubleVec& probabilities, std::size_t n, TSizeVec& result) { doCategoricalSampleWithoutReplacement(rng, probabilities, n, result); } void CSampling::categoricalSampleWithoutReplacement(CPRNG::CXorShift1024Mult& rng, TDoubleVec& probabilities, std::size_t n, TSizeVec& result) { doCategoricalSampleWithoutReplacement(rng, probabilities, n, result); } void CSampling::multinomialSampleFast(TDoubleVec& probabilities, std::size_t n, TSizeVec& sample, bool sorted) { sample.clear(); if (n == 0 || probabilities.empty()) { return; } // This uses the fact that we can decompose the probability mass // function for the multinomial as follows: // f({n_i}) = Prod_i{ f(n_i | { n_j : j < i }) } // // The conditional mass function of category i given categories // j < i is found by marginalizing over the categories j > i of // the full distribution. The marginal distribution is multinomial // with number of trials n - Sum_{j < i}{ nj } and probabilities // p_j -> p_j / (1 - Sum_k{k < i}{ pk }). The marginal distribution // is just binomial with the same number of trials and probability // p_i (adjusted as above). // // In order to sample from the full distribution we sample each // marginal and then condition the next marginal on the values // we've sampled so far. 
It is straightforward to verify that // the probability distribution is then: // P({n_i}) = Int{ Prod_i{ f(n_i | { n_j : j < i }) } }dn_i // = Int{ f(n_i) }dn_i // // So the samples are distributed according to the full distribution // function as required. // // We sort the probabilities in descending order to make sampling // as efficient as possible: this means that the loop will terminate // as early as possible on average. if (!sorted) { std::sort(probabilities.begin(), probabilities.end(), std::greater<>()); } { std::size_t r = n; double p = 1.0; std::size_t m = probabilities.size() - 1; core::CScopedFastLock scopedLock{defaultRngMutex}; for (std::size_t i = 0; r > 0 && i < m; ++i) { boost::random::binomial_distribution<> binomial(static_cast<int>(r), probabilities[i] / p); auto ni = static_cast<std::size_t>(binomial(defaultRng)); sample.push_back(ni); r -= ni; p -= probabilities[i]; } if (r > 0) { sample.push_back(r); } } } void CSampling::multinomialSampleStable(TDoubleVec probabilities, std::size_t n, TSizeVec& sample) { TSizeVec indices; indices.reserve(probabilities.size()); for (std::size_t i = 0; i < probabilities.size(); ++i) { indices.push_back(i); } COrderings::simultaneousSortWith(std::greater<>(), probabilities, indices); sample.reserve(probabilities.size()); multinomialSampleFast(probabilities, n, sample); sample.resize(probabilities.size(), 0); for (std::size_t i = 0; i < sample.size(); /**/) { std::size_t j = indices[i]; if (i != j) { std::swap(sample[i], sample[j]); std::swap(indices[i], indices[j]); } else { ++i; } } } void CSampling::weightedSample(std::size_t n, const TDoubleVec& weights, TSizeVec& sampling) { // We sample each category according to its weight. // // Formally, we have a set of weights w(i) s.t. Sum_i{ w(i) } = 1.0. // We can only sample a model an integer number of times and we'd // like to sample each model as near as possible to w(i) * n times, // where n is the total number of samples. 
In addition we have the // constraint that the total number of samples must equal n. Defining // r(j, i) as follows: // r(0, i) = w(i) * n - floor(w(i) * n) // r(1, i) = w(i) * n - ceil(w(i) * n) // // The problem is equivalent to finding the rounding function c(i) // such that: // Sum_i( r(c(i), i) ) = 0 // Sum_i( |r(c(i), i)| ) is minimized // // and then sampling the i'th model w(i) * n - r(i, c(i)) times. // This problem is solved optimally by a greedy algorithm. Note // that Sum_i{ w(i) } != 1.0 we round the number of samples to // the nearest integer to n * Sum_i{ p(i) }. using TUIntVec = std::vector<unsigned int>; using TDoubleSizePr = std::pair<double, std::size_t>; using TDoubleSizePrVec = std::vector<TDoubleSizePr>; LOG_TRACE(<< "Number samples = " << n); sampling.clear(); double totalWeight = std::accumulate(weights.begin(), weights.end(), 0.0); n = std::max(static_cast<std::size_t>(totalWeight * static_cast<double>(n) + 0.5), static_cast<std::size_t>(1u)); LOG_TRACE(<< "totalWeight = " << totalWeight << ", n = " << n); TUIntVec choices; TDoubleVec remainders[] = {TDoubleVec(), TDoubleVec()}; choices.reserve(weights.size()); remainders[0].reserve(weights.size()); remainders[1].reserve(weights.size()); double totalRemainder = 0.0; for (std::size_t i = 0; i < weights.size(); ++i) { // We need to re-normalize so that the probabilities sum to one. double number = weights[i] * static_cast<double>(n) / totalWeight; choices.push_back((number - std::floor(number) < 0.5) ? 0 : 1); remainders[0].push_back(number - std::floor(number)); remainders[1].push_back(number - std::ceil(number)); totalRemainder += remainders[choices.back()].back(); } // The remainder will be integral so checking against 0.5 avoids // floating point problems. 
if (std::fabs(totalRemainder) > 0.5) { LOG_TRACE(<< "ideal choice function = " << choices); TDoubleSizePrVec candidates; for (std::size_t i = 0; i < choices.size(); ++i) { if ((totalRemainder > 0.0 && choices[i] == 0) || (totalRemainder < 0.0 && choices[i] == 1)) { candidates.emplace_back(-std::fabs(remainders[choices[i]][i]), i); } } std::sort(candidates.begin(), candidates.end()); LOG_TRACE(<< "candidates = " << candidates); for (std::size_t i = 0; i < candidates.size() && std::fabs(totalRemainder) > 0.5; ++i) { std::size_t j = candidates[i].second; unsigned int choice = choices[j]; choices[j] = (choice + 1u) % 2; totalRemainder += remainders[choices[j]][j] - remainders[choice][j]; } } LOG_TRACE(<< "choice function = " << choices); sampling.reserve(weights.size()); for (std::size_t i = 0; i < weights.size(); ++i) { double number = weights[i] * static_cast<double>(n) / totalWeight; sampling.push_back(static_cast<std::size_t>( choices[i] == 0 ? std::floor(number) : std::ceil(number))); } } void CSampling::normalSampleQuantiles(double mean, double variance, std::size_t n, TDoubleVec& result) { result.clear(); if (n == 0) { return; } if (variance <= 0.0) { result.resize(n, mean); return; } try { boost::math::normal normal(mean, std::sqrt(variance)); sampleQuantiles(normal, n, result); } catch (const std::exception& e) { LOG_ERROR(<< "Failed to sample normal quantiles: " << e.what() << ", mean = " << mean << ", variance = " << variance); result.clear(); } } void CSampling::gammaSampleQuantiles(double shape, double rate, std::size_t n, TDoubleVec& result) { result.clear(); if (n == 0) { return; } try { boost::math::gamma_distribution<> gamma(shape, 1.0 / rate); sampleQuantiles(gamma, n, result); } catch (const std::exception& e) { LOG_ERROR(<< "Failed to sample normal quantiles: " << e.what() << ", shape = " << shape << ", rate = " << rate); result.clear(); } } void CSampling::sobolSequenceSample(std::size_t dim, std::size_t n, TDoubleVecVec& samples) { 
samples.clear(); if (n == 0 || dim == 0) { return; } try { using TSobolGenerator = boost::variate_generator<boost::random::sobol&, boost::uniform_01<double>>; boost::random::sobol engine(dim); TSobolGenerator gen(engine, boost::uniform_01<double>()); samples.resize(n, TDoubleVec(dim)); for (std::size_t i = 0; i < n; ++i) { for (std::size_t j = 0; j < dim; ++j) { samples[i][j] = gen(); } } } catch (const std::exception& e) { LOG_ERROR(<< "Failed to sample Sobol sequence: " << e.what() << ", dim = " << dim << ", n = " << n); samples.clear(); } } CSampling::CScopeMockRandomNumberGenerator::CScopeMockRandomNumberGenerator() { defaultRng.mock(); } CSampling::CScopeMockRandomNumberGenerator::~CScopeMockRandomNumberGenerator() { defaultRng.unmock(); } } } }