include/maths/common/CBasicStatistics.h

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #ifndef INCLUDED_ml_maths_common_CBasicStatistics_h #define INCLUDED_ml_maths_common_CBasicStatistics_h #include <core/CLoggerTrace.h> #include <core/CMemoryFwd.h> #include <core/CSmallVectorFwd.h> #include <core/WindowsSafe.h> #include <maths/common/CLinearAlgebraShims.h> #include <maths/common/CTypeTraits.h> #include <maths/common/ImportExport.h> #include <boost/operators.hpp> #include <algorithm> #include <array> #include <cmath> #include <cstdint> #include <functional> #include <string> #include <vector> namespace ml { namespace maths { namespace common { //! \brief Some basic stats utilities. //! //! DESCRIPTION:\n //! Some utilities for computing basic sample statistics such //! as central moments, covariance matrices and so on. class MATHS_COMMON_EXPORT CBasicStatistics { public: using TDoubleDoublePr = std::pair<double, double>; using TDoubleVec = std::vector<double>; public: //! Compute the mean of a pair. static double mean(const TDoubleDoublePr& data); //! Compute the vector mean of a pair. template<typename VECTOR> static VECTOR mean(const std::pair<VECTOR, VECTOR>& data) { std::size_t n = std::min(data.first.size(), data.second.size()); VECTOR result; result.reserve(n); for (std::size_t i = 0; i < n; ++i) { result.push_back(0.5 * (data.first[i] + data.second[i])); } return result; } //! Compute the sample mean. static double mean(const TDoubleVec& data); //! Compute the sample median. static double median(TDoubleVec data); //! Compute the median absolute deviation. static double mad(TDoubleVec data); //! Compute the maximum of \p first, \p second and \p third. template<typename T> static T max(T first, T second, T third) { return first >= second ? (third >= first ? third : first) : (third >= second ? third : second); } //! Compute the minimum of \p first, \p second and \p third. template<typename T> static T min(T first, T second, T third) { return first <= second ? (third <= first ? third : first) : (third <= second ? third : second); } //! Compute the \p percentage percentile variance assuming it's calculated //! from \p degreesFreedom normal random variables. //! //! \param[in] percentage The required percentile in the range (0, 100). //! \param[in] variance The variance statistic value. //! \param[in] degreesFreedom The number of values used to compute \p variance. static double varianceAtPercentile(double percentage, double variance, double degreesFreedom); /////////////////////////// ACCUMULATORS /////////////////////////// ////////////////////////// Central Moments ///////////////////////// //! Delimiter used to persist basic types to a string. static const char INTERNAL_DELIMITER; //! Delimiter used to persist central moments to a string buffer. static const char EXTERNAL_DELIMITER; //! \brief An accumulator class for sample central moments. //! //! DESCRIPTION:\n //! This function object accumulates sample central moments for a set //! of samples passed to its function operator. //! //! It is capable of calculating the mean and the 2nd and 3rd central //! moments. These can be used to calculate the sample mean, variance //! and skewness. Free functions are defined to compute these. //! //! IMPLEMENTATION DECISIONS:\n //! This is templatized to support, for example, float when space is at //! a premium or something with higher precision than double when accuracy //! is at a premium. The type T must technically be an infinite field //! (in the mathematical sense). In particular, any floating point would //! do, also custom rational or complex types would work provided they //! overload "*" and "/" and define the necessary constructors; however, //! integral types won't work. //! //! The number of moments computed is supplied as a template parameter //! so exactly enough memory is allocated to store the results of the //! calculations. It is up to the user to not pass bad values for this //! parameter. Typedefs are provided to get hold of appropriate objects //! for computing the mean, mean and variance and mean, variance and //! skewness. Free functions are used to get hold of these quantities. //! They are overloaded for explicit values of the number of moments to //! provide compile time checking that the moment is available. //! //! We use recurrence relations for the higher order moments which //! minimize the cancellation errors. These can be derived by considering,\n //! <pre class="fragment"> //! \f$M(n, N) = \sum_{i=1}^{N}{ (x_i - M(1, N))^n } = \sum_{i=1}^{N}{ (x_i - M(1, N-1) + (M(1, N-1) - M(1, N)))^n }\f$ //! </pre> //! //! where,\n //! \f$M(1, N)\f$ is defined to be the sample mean of \f$N\f$ samples,\n //! \f$n > 1\f$ for the higher order central moments, and\n //! //! Using these relations means that the lower order moments are used to //! calculate the higher order moments, so we only allow the user to select //! the highest order moment to compute. //! //! Note, that this is loosely modeled on the boost accumulator statistics //! library. This makes use of boost::mpl which doesn't compile on all our //! supported platforms. Also, it isn't very good at managing cancellation //! errors. //! //! \tparam T The "floating point" type. //! \tparam ORDER The highest order moment to gather. template<typename T, unsigned int ORDER> struct SSampleCentralMoments { using TCoordinate = typename SCoordinate<T>::Type; using TValue = T; //! See core::CMemory. static constexpr bool dynamicSizeAlwaysZero() { return core::memory_detail::SDynamicSizeAlwaysZero<T>::value(); } explicit SSampleCentralMoments(const T& initial = T(0)) : s_Count(0) { std::fill_n(s_Moments, ORDER, initial); } //! Copy construction from implicitly convertible type. //! //! \note Because this is template it is *not* an copy constructor //! so this class has implicit move semantics. template<typename U> SSampleCentralMoments(const SSampleCentralMoments<U, ORDER>& other) : s_Count{other.s_Count} { std::copy(other.s_Moments, other.s_Moments + ORDER, s_Moments); } //! Assignment from implicitly convertible type. //! //! \note Because this is template it is *not* an copy assignment //! operator so this class has implicit move semantics. template<typename U> const SSampleCentralMoments& operator=(const SSampleCentralMoments<U, ORDER>& other) { s_Count = other.s_Count; std::copy(other.s_Moments, other.s_Moments + ORDER, s_Moments); return *this; } //! \name Persistence //@{ //! Initialize from a delimited string. bool fromDelimited(const std::string& str); //! Convert to a delimited string. std::string toDelimited() const; //@} //! Total order based on count then lexicographical less of moments. bool operator<(const SSampleCentralMoments& rhs) const { return s_Count < rhs.s_Count || (s_Count == rhs.s_Count && std::lexicographical_compare(s_Moments, s_Moments + ORDER, rhs.s_Moments, rhs.s_Moments + ORDER)); } //! \name Update //@{ //! Define a function operator for use with std:: algorithms. inline void operator()(const T& x) { this->add(x); } //! Update the moments with the collection \p x. template<typename U> void add(const std::vector<U>& x) { for (const auto& xi : x) { this->add(xi); } } //! Update the moments with the collection \p x. template<typename U, std::size_t N> void add(const core::CSmallVector<U, N>& x) { for (const auto& xi : x) { this->add(xi); } } //! Update the moments with the collection \p x. template<typename U> void add(const std::vector<SSampleCentralMoments<U, ORDER>>& x) { for (const auto& xi : x) { this->operator+=(xi); } } //! Update with a generic value \p x. //! //! \note This copies \p x if type U isn't type T. template<typename U> void add(const U& x, const TCoordinate& n = TCoordinate{1}); //! Update the moments with \p x. \p n is the optional number //! of times to add \p x. void add(const T& x, const TCoordinate& n, int) { if (n == TCoordinate{0}) { return; } s_Count += n; // Note we don't trap the case alpha is less than epsilon, // because then we'd have to compute epsilon and it is very // unlikely the count will get big enough. TCoordinate alpha{n / s_Count}; TCoordinate beta{TCoordinate{1} - alpha}; T mean{s_Moments[0]}; s_Moments[0] = beta * mean + alpha * x; if constexpr (ORDER > 1) { T r{x - s_Moments[0]}; T r2{las::componentwise(r) * las::componentwise(r)}; T dMean{mean - s_Moments[0]}; T dMean2{las::componentwise(dMean) * las::componentwise(dMean)}; T variance{s_Moments[1]}; s_Moments[1] = beta * (variance + dMean2) + alpha * r2; if constexpr (ORDER > 2) { T skew{s_Moments[2]}; T dSkew{TCoordinate(3) * variance + dMean2}; dSkew = las::componentwise(dSkew) * las::componentwise(dMean); s_Moments[2] = beta * (skew + dSkew) + alpha * r2 * r; } } } //! Combine two moments. This is equivalent to running //! a single accumulator on the entire collection. template<typename U> const SSampleCentralMoments& operator+=(const SSampleCentralMoments<U, ORDER>& rhs) { if (rhs.s_Count == TCoordinate{0}) { return *this; } s_Count = s_Count + rhs.s_Count; // Note we don't trap the case alpha is less than epsilon, // because then we'd have to compute epsilon and it is very // unlikely the count will get big enough. TCoordinate alpha{rhs.s_Count / s_Count}; TCoordinate beta{TCoordinate{1} - alpha}; T meanLhs{s_Moments[0]}; T meanRhs{rhs.s_Moments[0]}; s_Moments[0] = beta * meanLhs + alpha * meanRhs; if constexpr (ORDER > 1) { T dMeanLhs{meanLhs - s_Moments[0]}; T dMean2Lhs{las::componentwise(dMeanLhs) * las::componentwise(dMeanLhs)}; T varianceLhs{s_Moments[1]}; T dMeanRhs{meanRhs - s_Moments[0]}; T dMean2Rhs{las::componentwise(dMeanRhs) * las::componentwise(dMeanRhs)}; T varianceRhs{rhs.s_Moments[1]}; s_Moments[1] = beta * (varianceLhs + dMean2Lhs) + alpha * (varianceRhs + dMean2Rhs); if constexpr (ORDER > 2) { T skewLhs{s_Moments[2]}; T dSkewLhs{TCoordinate{3} * varianceLhs + dMean2Lhs}; dSkewLhs = las::componentwise(dSkewLhs) * las::componentwise(dMeanLhs); T skewRhs{rhs.s_Moments[2]}; T dSkewRhs{TCoordinate{3} * varianceRhs + dMean2Rhs}; dSkewRhs = las::componentwise(dSkewRhs) * las::componentwise(dMeanRhs); s_Moments[2] = beta * (skewLhs + dSkewLhs) + alpha * (skewRhs + dSkewRhs); } } return *this; } //! Combine two moments. This is equivalent to running //! a single accumulator on the entire collection. template<typename U> SSampleCentralMoments operator+(const SSampleCentralMoments<U, ORDER>& rhs) const { SSampleCentralMoments result{*this}; return result += rhs; } //! Subtract \p rhs from these. //! //! \note That this isn't always well defined. For example, //! the count and variance of these moments must be larger //! than \p rhs. The caller must ensure that these conditions //! are satisfied. template<typename U> const SSampleCentralMoments& operator-=(const SSampleCentralMoments<U, ORDER>& rhs) { if (rhs.s_Count == TCoordinate{0}) { return *this; } using std::max; s_Count = max(s_Count - rhs.s_Count, TCoordinate{0}); if (s_Count == TCoordinate{0}) { std::fill_n(s_Moments, ORDER, T{0}); return *this; } // Note we don't trap the case alpha is less than epsilon, // because then we'd have to compute epsilon and it is very // unlikely the count will get big enough. TCoordinate alpha{rhs.s_Count / s_Count}; TCoordinate beta{TCoordinate{1} + alpha}; T meanLhs{s_Moments[0]}; T meanRhs{rhs.s_Moments[0]}; s_Moments[0] = beta * meanLhs - alpha * meanRhs; if constexpr (ORDER > 1) { T dMeanLhs{s_Moments[0] - meanLhs}; T dMean2Lhs{las::componentwise(dMeanLhs) * las::componentwise(dMeanLhs)}; T dMeanRhs{meanRhs - meanLhs}; T dMean2Rhs{las::componentwise(dMeanRhs) * las::componentwise(dMeanRhs)}; T varianceRhs{rhs.s_Moments[1]}; s_Moments[1] = max(beta * (s_Moments[1] - dMean2Lhs) - alpha * (varianceRhs + dMean2Rhs - dMean2Lhs), T{0}); if constexpr (ORDER > 2) { T skewLhs{s_Moments[2]}; T dSkewLhs{TCoordinate{3} * s_Moments[1] + dMean2Lhs}; dSkewLhs = las::componentwise(dSkewLhs) * las::componentwise(dMeanLhs); T skewRhs{rhs.s_Moments[2]}; T dSkewRhs{TCoordinate{3} * varianceRhs + dMean2Rhs}; dSkewRhs = las::componentwise(dSkewRhs) * las::componentwise(dMeanRhs); s_Moments[2] = beta * (skewLhs - dSkewLhs) - alpha * (skewRhs + dSkewRhs - dSkewLhs); } } return *this; } //! Subtract \p rhs from these. //! //! \note That this isn't always well defined. For example, //! the count and variance of these moments must be larger //! than \p rhs. The caller must ensure that these conditions //! are satisfied. template<typename U> SSampleCentralMoments operator-(const SSampleCentralMoments<U, ORDER>& rhs) const { SSampleCentralMoments result{*this}; return result -= rhs; } //! Age the moments by reducing the count. //! \note \p factor should be in the range [0,1]. //! \note It must be possible to multiply T by double to use //! this method. void age(double factor) { s_Count = s_Count * TCoordinate{factor}; } //@} //! Get a checksum for this object. std::uint64_t checksum() const; TCoordinate s_Count; T s_Moments[ORDER]; }; //! \name Accumulator Typedefs //@{ //! Accumulator object to compute the sample mean. template<typename T> struct SSampleMean { using TAccumulator = SSampleCentralMoments<T, 1>; }; //! Accumulator object to compute the sample mean and variance. template<typename T> struct SSampleMeanVar { using TAccumulator = SSampleCentralMoments<T, 2>; }; //! Accumulator object to compute the sample mean, variance and skewness. template<typename T> struct SSampleMeanVarSkew { using TAccumulator = SSampleCentralMoments<T, 3>; }; //@} //! \name Factory Functions //@{ //! Make a mean accumulator. template<typename T, typename U> static SSampleCentralMoments<T, 1> momentsAccumulator(const U& count, const T& m1) { SSampleCentralMoments<T, 1> result; result.s_Count = count; result.s_Moments[0] = m1; return result; } //! Make a mean and variance accumulator. template<typename T, typename U> static SSampleCentralMoments<T, 2> momentsAccumulator(const U& count, const T& m1, const T& m2) { SSampleCentralMoments<T, 2> result; result.s_Count = count; result.s_Moments[0] = m1; result.s_Moments[1] = m2; return result; } //! Make a mean, variance and skew accumulator. template<typename T, typename U> static SSampleCentralMoments<T, 3> momentsAccumulator(const U& count, const T& m1, const T& m2, const T& m3) { SSampleCentralMoments<T, 3> result; result.s_Count = count; result.s_Moments[0] = m1; result.s_Moments[1] = m2; result.s_Moments[2] = m3; return result; } //@} //! Get the specified moment provided it exists template<unsigned int M, typename T, unsigned int N> static const T& moment(const SSampleCentralMoments<T, N>& accumulator) { static_assert(M <= N, "M cannot be greater than N"); return accumulator.s_Moments[M]; } //! Get the specified moment provided it exists template<unsigned int M, typename T, unsigned int N> static T& moment(SSampleCentralMoments<T, N>& accumulator) { static_assert(M <= N, "M cannot be greater than N"); return accumulator.s_Moments[M]; } //! Extract the count from an accumulator object. template<typename T, unsigned int N> static inline const typename SSampleCentralMoments<T, N>::TCoordinate& count(const SSampleCentralMoments<T, N>& accumulator) { return accumulator.s_Count; } //! Extract the count from an accumulator object. template<typename T, unsigned int N> static inline typename SSampleCentralMoments<T, N>::TCoordinate& count(SSampleCentralMoments<T, N>& accumulator) { return accumulator.s_Count; } //! Extract the counts from a vector of accumulators. template<typename T, unsigned int M, std::size_t N> static core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> count(const core::CSmallVector<SSampleCentralMoments<T, M>, N>& accumulators) { core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(count(accumulator)); } return result; } //! Extract the counts from a vector of accumulators. template<typename T, unsigned int N> static std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> count(const std::vector<SSampleCentralMoments<T, N>>& accumulators) { std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(count(accumulator)); } return result; } //! Extract the mean from an accumulator object. template<typename T, unsigned int N> static inline const T& mean(const SSampleCentralMoments<T, N>& accumulator) { return accumulator.s_Moments[0]; } //! Extract the means from a vector of accumulators. template<typename T, unsigned int M, std::size_t N> static core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> mean(const core::CSmallVector<SSampleCentralMoments<T, M>, N>& accumulators) { core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(mean(accumulator)); } return result; } //! Extract the means from a vector of accumulators. template<typename T, unsigned int N> static std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> mean(const std::vector<SSampleCentralMoments<T, N>>& accumulators) { std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(mean(accumulator)); } return result; } //! Extract the variance from an accumulator object. //! //! \note This is the unbiased form. template<typename T, unsigned int N> static inline T variance(const SSampleCentralMoments<T, N>& accumulator) { using TCoordinate = typename SSampleCentralMoments<T, N>::TCoordinate; static_assert(N >= 2, "N must be at least 2"); if (accumulator.s_Count <= TCoordinate{1}) { return T{0}; } TCoordinate bias{accumulator.s_Count / (accumulator.s_Count - TCoordinate{1})}; return bias * accumulator.s_Moments[1]; } //! Extract the variances from a vector of accumulators. template<typename T, unsigned int M, std::size_t N> static core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> variance(const core::CSmallVector<SSampleCentralMoments<T, M>, N>& accumulators) { core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(variance(accumulator)); } return result; } //! Extract the variances from a vector of accumulators. template<typename T, unsigned int N> static std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> variance(const std::vector<SSampleCentralMoments<T, N>>& accumulators) { std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(variance(accumulator)); } return result; } //! Extract the maximum likelihood variance from an accumulator object. //! //! \note This is the biased form. template<typename T, unsigned int N> static inline const T& maximumLikelihoodVariance(const SSampleCentralMoments<T, N>& accumulator) { static_assert(N >= 2, "N must be at least 2"); return accumulator.s_Moments[1]; } //! Extract the maximum likelihood variances from a vector of accumulators. template<typename T, unsigned int M, std::size_t N> static core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> maximumLikelihoodVariance(const core::CSmallVector<SSampleCentralMoments<T, M>, N>& accumulators) { core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(maximumLikelihoodVariance(accumulator)); } return result; } //! Extract the maximum likelihood variances from a vector of accumulators. template<typename T, unsigned int N> static std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> maximumLikelihoodVariance(const std::vector<SSampleCentralMoments<T, N>>& accumulators) { std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(maximumLikelihoodVariance(accumulator)); } return result; } //! Extract the skewness from an accumulator object. template<typename T, unsigned int N> static inline T skewness(const SSampleCentralMoments<T, N>& accumulator) { using TCoordinate = typename SSampleCentralMoments<T, N>::TCoordinate; static_assert(N >= 3, "N must be at least 3"); if (accumulator.s_Count <= TCoordinate{2}) { return T{0}; } T normalization{variance(accumulator)}; using std::sqrt; normalization = normalization * sqrt(normalization); return accumulator.s_Moments[2] == T{0} ? T{0} : accumulator.s_Moments[2] / normalization; } //! Extract the skewnesses from a vector of accumulators. template<typename T, unsigned int M, std::size_t N> static core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> skewness(const core::CSmallVector<SSampleCentralMoments<T, M>, N>& accumulators) { core::CSmallVector<typename SSampleCentralMoments<T, M>::TCoordinate, N> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(skewness(accumulator)); } return result; } //! Extract the skewnesses from a vector of accumulators. template<typename T, unsigned int N> static std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> skewness(const std::vector<SSampleCentralMoments<T, N>>& accumulators) { std::vector<typename SSampleCentralMoments<T, N>::TCoordinate> result; result.reserve(accumulators.size()); for (const auto& accumulator : accumulators) { result.push_back(skewness(accumulator)); } return result; } //! \name Print Functions //@{ //! Print a mean accumulator. template<typename T> static inline std::string print(const SSampleCentralMoments<T, 1>& accumulator); //! Print a mean and variance accumulator. template<typename T> static inline std::string print(const SSampleCentralMoments<T, 2>& accumulator); //! Print a mean, variance and skew accumulator. template<typename T> static inline std::string print(const SSampleCentralMoments<T, 3>& accumulator); //@} //! Get a copy of \p moments with count scaled by \p scale. template<typename T, unsigned int N, typename U> static SSampleCentralMoments<T, N> scaled(SSampleCentralMoments<T, N> accumulator, const U& scale) { accumulator.s_Count *= typename SSampleCentralMoments<T, N>::TCoordinate{scale}; return accumulator; } //! Get a copy of \p moments with count scaled by \p scale. template<typename T, unsigned int N, typename U> static void scale(const U& scale, SSampleCentralMoments<T, N>& accumulator) { accumulator.s_Count *= typename SSampleCentralMoments<T, N>::TCoordinate{scale}; } //////////////////////////// Covariances /////////////////////////// //! \brief An accumulator class for vector sample mean and covariances. //! //! DESCRIPTION:\n //! This function object accumulates sample mean and covariances for a //! set of vector samples passed to its function operator. //! //! Free functions are defined to retrieve the mean vector and covariances //! to match the behavior of SSampleCentralMoments. //! //! IMPLEMENTATION DECISIONS:\n //! This is templatized to support, for example, float when space is at //! a premium or long double when accuracy is at a premium. The type T //! must technically be an infinite field (in the mathematical sense). //! In particular, any floating point would do, also custom rational or //! complex types would work provided they overload "*" and "/" and define //! the necessary constructors; however, integral types won't work. //! //! This uses the same recurrence relations as SSampleCentralMoments so //! see that class for more information on these. //! //! \tparam POINT The "floating point" vector type. template<typename POINT> struct SSampleCovariances { //! See core::CMemory. static constexpr bool dynamicSizeAlwaysZero() { return core::memory_detail::SDynamicSizeAlwaysZero<POINT>::value(); } using TVector = POINT; using TMatrix = typename SConformableMatrix<POINT>::Type; explicit SSampleCovariances(std::size_t dimension); SSampleCovariances(const TVector& count, const TVector& mean, const TMatrix& covariances) : s_Count{count}, s_Mean{mean}, s_Covariances{covariances} {} //! Copy construction from implicitly convertible type. //! //! \note Because this is template it is *not* a copy constructor //! so this class has implicit move semantics. template<typename OTHER_POINT> SSampleCovariances(const SSampleCovariances<OTHER_POINT>& other) : s_Count{other.s_Count}, s_Mean{other.s_Mean}, s_Covariances{other.s_Covariances} { } //! Assignment from implicitly convertible type. //! //! \note Because this is template it is *not* an copy assignment //! operator so this class has implicit move semantics. template<typename OTHER_POINT> const SSampleCovariances& operator=(const SSampleCovariances<OTHER_POINT>& other) { s_Count = other.s_Count; s_Mean = other.s_Mean; s_Covariances = other.s_Covariances; return *this; } //! \name Persistence //@{ //! Initialize from a delimited string. bool fromDelimited(std::string str); //! Convert to a delimited string. std::string toDelimited() const; //@} //! \name Update //@{ //! Define a function operator for use with std:: algorithms. inline void operator()(const TVector& x) { this->add(x); } //! Update the moments with the collection \p x. template<typename OTHER_POINT> void add(const std::vector<OTHER_POINT>& x) { for (const auto& x_ : x) { this->add(x_); } } //! Update with a generic point \p x. template<typename OTHER_POINT> void add(const OTHER_POINT& x); //! Update with a generic point \p x with count \p n. template<typename OTHER_POINT> void add(const OTHER_POINT& x, const OTHER_POINT& n); //! Update the mean and covariances with \p x. void add(const TVector& x, const TVector& n, int); //! Combine two moments. This is equivalent to running //! a single accumulator on the entire collection. template<typename OTHER_POINT> const SSampleCovariances& operator+=(const SSampleCovariances<OTHER_POINT>& rhs); //! Combine two moments. This is equivalent to running //! a single accumulator on the entire collection. template<typename OTHER_POINT> SSampleCovariances operator+(const SSampleCovariances<OTHER_POINT>& rhs) const { SSampleCovariances result{*this}; return result += rhs; } //! Subtract \p rhs from these. //! //! \note That this isn't always well defined. For example, //! the count and variance of these covariances must be //! larger than \p rhs. The caller must ensure that these //! conditions are satisfied. template<typename OTHER_POINT> const SSampleCovariances& operator-=(const SSampleCovariances<OTHER_POINT>& rhs); //! Subtract \p rhs from these. //! //! \note That this isn't always well defined. For example, //! the count and variance of these covariances must be //! larger than \p rhs. The caller must ensure that these //! conditions are satisfied. template<typename OTHER_POINT> SSampleCovariances operator-(const SSampleCovariances<OTHER_POINT>& rhs) { SSampleCovariances result{*this}; return result -= rhs; } //! Age the mean and covariances by reducing the count. //! //! \note \p factor should be in the range [0,1]. //! \note It must be possible to cast double to T to use //! this method. void age(double factor) { s_Count = s_Count * typename SCoordinate<POINT>::Type(factor); } //@} //! Get a checksum for this object. std::uint64_t checksum() const; TVector s_Count; TVector s_Mean; TMatrix s_Covariances; }; //! Make a covariances accumulator. template<typename POINT> static inline SSampleCovariances<POINT> covariancesAccumulator(const POINT& count, const POINT& mean, const typename SConformableMatrix<POINT>::Type& covariances) { return SSampleCovariances<POINT>(count, mean, covariances); } //! Extract the count from an accumulator object. template<typename POINT> static inline typename SCoordinate<POINT>::Type count(const SSampleCovariances<POINT>& accumulator); //! Extract the mean vector from an accumulator object. template<typename POINT> static inline const POINT& mean(const SSampleCovariances<POINT>& accumulator) { return accumulator.s_Mean; } //! Extract the covariance matrix from an accumulator object. //! //! \note This is the unbiased form. template<typename POINT> static inline typename SConformableMatrix<POINT>::Type covariances(const SSampleCovariances<POINT>& accumulator); //! Extract the covariance matrix from an accumulator object. //! //! \note This is the unbiased form. template<typename POINT> static inline const typename SConformableMatrix<POINT>::Type& maximumLikelihoodCovariances(const SSampleCovariances<POINT>& accumulator) { return accumulator.s_Covariances; } //! Print a covariances accumulator. template<typename POINT> static inline std::string print(const SSampleCovariances<POINT>& accumulator); //! Interface for Ledoit Wolf shrinkage estimator of the sample //! covariance matrix. //! //! See http://perso.ens-lyon.fr/patrick.flandrin/LedoitWolf_JMA2004.pdf //! for the details. //! //! \param[in] points The points for which to estimate the covariance //! matrix. //! \param[out] result Filled in with the count, mean and "shrunk" //! covariance matrix estimate. template<typename POINT, typename OTHER_POINT> static void covariancesLedoitWolf(const std::vector<POINT>& points, SSampleCovariances<OTHER_POINT>& result); ///////////////////////// Order Statistics ///////////////////////// private: //! \brief Implementation of an accumulator class for order statistics. //! //! DESCRIPTION:\n //! This implements the underlying algorithm for determining the first //! n order statistics online. //! //! IMPLEMENTATION DECISIONS:\n //! This maintains the statistics in a heap for worst case complexity //! \f$O(N log(n))\f$ and typical complexity \f$O(N)\f$ (by checking //! against the maximum value) for \f$n << N\f$. //! //! The container is supplied as a template argument so that a fixed //! size array can be used for the case n is small. Similarly the less //! function is supplied so that T can be any type which supports a //! partial ordering. (T must also have a default constructor.) template<typename T, typename CONTAINER, typename LESS> class COrderStatisticsImpl { public: using iterator = typename CONTAINER::iterator; using const_iterator = typename CONTAINER::const_iterator; using reverse_iterator = typename CONTAINER::reverse_iterator; using const_reverse_iterator = typename CONTAINER::const_reverse_iterator; using TToString = std::function<std::string(const T&)>; using TFromString = std::function<bool(const std::string&, T&)>; public: COrderStatisticsImpl(const CONTAINER& statistics, const LESS& less) : m_Less(less), m_Statistics(statistics), m_UnusedCount(statistics.size()) {} //! \name Persistence //@{ //! Initialize from a delimited string. bool fromDelimited(const std::string& value); //! Initialize from a delimited string using \p fromString to initialize //! values of type T from a string. //! //! \warning This function must not use CBasicStatistics::EXTERNAL_DELIMITER. bool fromDelimited(const std::string& value, const TFromString& fromString); //! Convert to a delimited string. std::string toDelimited() const; //! Convert to a delimited string using \p toString to convert individual //! values of type T to a string. //! //! \warning This function must not use CBasicStatistics::EXTERNAL_DELIMITER. std::string toDelimited(const TToString& toString) const; //@} //! \name Update //@{ //! Define a function operator for use with std:: algorithms. inline bool operator()(const T& x) { return this->add(x); } //! Check if we would add \p x. bool wouldAdd(const T& x) const { return m_UnusedCount > 0 || m_Less(x, *this->begin()); } //! Update the statistics with the collection \p x. bool add(const std::vector<T>& x) { bool result = false; for (const auto& xi : x) { result |= this->add(xi); } return result; } //! Update the statistic with \p n copies of \p x. bool add(const T& x, std::size_t n) { bool result = false; for (std::size_t i = 0; i < std::min(n, m_Statistics.size()); ++i) { result |= this->add(x); } return result; } //! Update the statistics with \p x. bool add(const T& x) { if (m_UnusedCount > 0) { m_Statistics[--m_UnusedCount] = x; if (m_UnusedCount == 0) { // We need a heap for subsequent insertion. std::make_heap(this->begin(), this->end(), m_Less); } return true; } if (m_Less(x, *this->begin())) { // We need to drop the largest value and update the heap. std::pop_heap(this->begin(), this->end(), m_Less); m_Statistics.back() = x; std::push_heap(this->begin(), this->end(), m_Less); return true; } return false; } //! An efficient sort of the statistics (which are not stored //! in sorted order during accumulation for efficiency). void sort() { if (m_UnusedCount > 0) { std::sort(this->begin(), this->end(), m_Less); } else { std::sort_heap(this->begin(), this->end(), m_Less); } } //! Age the values by scaling them. //! \note \p factor should be in the range (0,1]. //! \note It must be possible to multiply T by double to use //! this method. void age(double factor) { if (this->count() == 0) { return; } // Check if we need to multiply or divide by the factor. T tmp{(*this)[0]}; if (m_Less(static_cast<T>(tmp * factor), tmp)) { factor = 1.0 / factor; } for (iterator i = this->begin(); i != this->end(); ++i) { *i = static_cast<T>((*i) * factor); } } //@} //! \name Access //@{ //! Get the "biggest" in the collection. This depends on the //! order predicate and is effectively the first value which //! will be removed if a new value displaces it. inline const T& biggest() const { return m_UnusedCount > 0 ? *std::max_element(this->begin(), this->end(), m_Less) : *this->begin(); } //! Get the maximum number of statistics. inline std::size_t capacity() const { return m_Statistics.size(); } //! Get the number of statistics. inline std::size_t count() const { return m_Statistics.size() - m_UnusedCount; } //! Get the i'th statistic. inline T& operator[](std::size_t i) { return m_Statistics[m_UnusedCount + i]; } //! Get the i'th statistic. inline const T& operator[](std::size_t i) const { return m_Statistics[m_UnusedCount + i]; } //! Get an iterator over the statistics. inline iterator begin() { return m_Statistics.begin() + m_UnusedCount; } //! Get an iterator over the statistics. inline const_iterator begin() const { return m_Statistics.begin() + m_UnusedCount; } //! Get a reverse iterator over the order statistics. inline reverse_iterator rbegin() { return m_Statistics.rbegin(); } //! Get a reverse iterator over the order statistics. inline const_reverse_iterator rbegin() const { return m_Statistics.rbegin(); } //! Get an iterator representing the end of the statistics. inline iterator end() { return m_Statistics.end(); } //! Get an iterator representing the end of the statistics. inline const_iterator end() const { return m_Statistics.end(); } //! Get an iterator representing the end of the statistics. inline reverse_iterator rend() { return m_Statistics.rbegin() + m_UnusedCount; } //! Get an iterator representing the end of the statistics. inline const_reverse_iterator rend() const { return m_Statistics.rbegin() + m_UnusedCount; } //@} //! Remove all statistics. void clear() { std::fill(m_Statistics.begin() + m_UnusedCount, m_Statistics.end(), T{}); m_UnusedCount = m_Statistics.size(); } //! Get a checksum of this object. std::uint64_t checksum(std::uint64_t seed) const; protected: //! Get the statistics. CONTAINER& statistics() { return m_Statistics; } private: LESS m_Less; CONTAINER m_Statistics; //! How many elements of the container are unused? std::size_t m_UnusedCount; }; public: //! \brief A stack based accumulator class for order statistics. //! //! DESCRIPTION:\n //! This function object accumulates the first n order statistics //! for a partially ordered collection when n is relatively small //! and known at compile time. //! //! The ordering function is supplied by the user and so this can //! also be used to calculate the maximum statistics of a collection. //! For example:\n //! \code{cpp} //! COrderStatistics<double, 2, std::greater<double>> //! \endcode //! //! would find the largest two values of a collection. //! //! IMPLEMENTATION DECISIONS:\n //! This is templatized to support any type for which a partial ordering //! can be defined. To this end the ordering is a template parameter //! which is also supplied to the constructor in the case it doesn't //! have default constructor. //! //! This object has been implemented to give near optimal performance //! both in terms of space and complexity when the number of order //! statistics being computed is small. As such, the number of statistics //! to compute is supplied as a template parameter so that exactly enough //! memory is allocated to store the results and so they are stored on //! the stack by default. //! //! \tparam T The numeric type for which the order statistic is being //! computed. //! \tparam N The number of order statistics being computed. //! \tparam LESS The comparison function object type used to test //! if one object of type T is less than another. template<typename T, std::size_t N, typename LESS = std::less<T>> class EMPTY_BASE_OPT COrderStatisticsStack : public COrderStatisticsImpl<T, std::array<T, N>, LESS>, private boost::addable<COrderStatisticsStack<T, N, LESS>> { static_assert(N > 0, "N must be > 0"); private: using TArray = std::array<T, N>; using TImpl = COrderStatisticsImpl<T, TArray, LESS>; public: // Forward typedefs using iterator = typename TImpl::iterator; using const_iterator = typename TImpl::const_iterator; //! See core::CMemory. static constexpr bool dynamicSizeAlwaysZero() { return core::memory_detail::SDynamicSizeAlwaysZero<T>::value(); } public: explicit COrderStatisticsStack(const LESS& less = LESS{}) : TImpl{TArray(), less} { this->statistics().fill(T{}); } explicit COrderStatisticsStack(std::size_t /*n*/, const LESS& less = LESS{}) : TImpl{TArray(), less} { this->statistics().fill(T{}); } //! Combine two statistics. This is equivalent to running a //! single accumulator on the entire collection. const COrderStatisticsStack& operator+=(const COrderStatisticsStack& rhs) { for (const auto& xi : rhs) { this->add(xi); } return *this; } //! Create a member function so this class works with CChecksum. std::uint64_t checksum(std::uint64_t seed = 0) const { return this->TImpl::checksum(seed); } //! Create a member function template so this class works with CPersistUtils::restore template<typename... Args> auto fromDelimited(Args&&... args) -> decltype(TImpl::fromDelimited(std::forward<Args>(args)...)) { return this->TImpl::fromDelimited(std::forward<Args>(args)...); } //! Create a member function template so this class works with CPersistUtils::persist template<typename... Args> auto toDelimited(Args&&... args) const -> decltype(TImpl::toDelimited(std::forward<Args>(args)...)) { return this->TImpl::toDelimited(std::forward<Args>(args)...); } }; //! Make a stack based order statistics accumulator from \p less. template<typename T, std::size_t N, typename LESS> static COrderStatisticsStack<T, N, LESS> orderStatisticsAccumulator(LESS less) { return COrderStatisticsStack<T, N, LESS>{less}; } //! \brief A heap based accumulator class for order statistics. //! //! DESCRIPTION:\n //! This function object accumulates the first n order statistics //! for a partially ordered collection when n is relatively large //! or not known at compile time. //! //! The ordering function is supplied by the user and so this can be //! also used to calculate the maximum statistics of a collection. //! For example:\n //! \code{cpp} //! COrderStatistics<double, std::greater<double>> //! \endcode //! //! would find the largest values of a collection. //! //! IMPLEMENTATION DECISIONS:\n //! This is templatized to support any type for which a partial ordering //! can be defined. To this end the ordering is a template parameter //! which is also supplied to the constructor in the case it doesn't //! have default constructor. //! //! The statistics are stored on the heap in an up front allocated vector. //! Since the number of statistics must be fixed for the life time of //! accumulation it makes sense to allocate the memory once up front. //! This also initializes the memory since it shares its implementation //! with the array based accumulator. //! //! \tparam T The numeric type for which the order statistic is being //! computed. //! \tparam LESS The comparison function object type used to test //! if one object of type T is less than another. template<typename T, typename LESS = std::less<T>> class EMPTY_BASE_OPT COrderStatisticsHeap : public COrderStatisticsImpl<T, std::vector<T>, LESS>, private boost::addable<COrderStatisticsHeap<T, LESS>> { private: using TImpl = COrderStatisticsImpl<T, std::vector<T>, LESS>; public: // Forward typedefs using iterator = typename TImpl::iterator; using const_iterator = typename TImpl::const_iterator; public: explicit COrderStatisticsHeap(std::size_t n, const T& initial = T{}, const LESS& less = LESS{}) : TImpl{std::vector<T>(std::max(n, std::size_t(1)), initial), less} { if (n == 0) { LOG_TRACE(<< "Invalid size of 0 for order statistics accumulator"); } } //! Reset the number of statistics to gather to \p n. void resize(std::size_t n) { if (n == 0) { LOG_TRACE(<< "Invalid resize to 0 for order statistics accumulator"); n = 1; } this->clear(); this->statistics().resize(n); } //! Combine two statistics. This is equivalent to running a //! single accumulator on the entire collection. const COrderStatisticsHeap& operator+=(const COrderStatisticsHeap& rhs) { for (const auto& xi : rhs) { this->add(xi); } return *this; } //! Create a member function so this class works with CChecksum. std::uint64_t checksum(std::uint64_t seed = 0) const { return this->TImpl::checksum(seed); } //! Create a member function template so this class works with CPersistUtils::restore template<typename... Args> auto fromDelimited(Args&&... args) -> decltype(TImpl::fromDelimited(std::forward<Args>(args)...)) { return this->TImpl::fromDelimited(std::forward<Args>(args)...); } //! Create a member function template so this class works with CPersistUtils::persist template<typename... Args> auto toDelimited(Args&&... args) const -> decltype(TImpl::toDelimited(std::forward<Args>(args)...)) { return this->TImpl::toDelimited(std::forward<Args>(args)...); } }; //! Make a heap based order statistics accumulator from \p less. template<typename T, typename LESS> static COrderStatisticsHeap<T, LESS> orderStatisticsAccumulator(std::size_t n, LESS less) { return COrderStatisticsHeap<T, LESS>{n, T{}, less}; } //! \name Accumulator Typedefs //@{ //! Accumulator object to compute the sample maximum. template<typename T, std::size_t N = 1> struct SMax { using TAccumulator = COrderStatisticsStack<T, N, std::greater<T>>; }; //! Accumulator object to compute the sample minimum. template<typename T, std::size_t N = 1> struct SMin { using TAccumulator = COrderStatisticsStack<T, N>; }; //@} ////////////////////////////// Minmax ////////////////////////////// //! \brief An accumulator of the minimum and maximum value in a collection. //! //! IMPLEMENTATION DECISIONS:\n //! This is templatized to support any type for which a partial ordering //! can be defined. To this end the orderings are template parameters //! which are also supplied to the constructor in the case they don't //! have default constructors. template<typename T, typename LESS = std::less<T>, typename GREATER = std::greater<T>> class CMinMax : boost::addable<CMinMax<T, LESS, GREATER>> { public: //! See core::CMemory. static constexpr bool dynamicSizeAlwaysZero() { return core::memory_detail::SDynamicSizeAlwaysZero<T>::value(); } public: explicit CMinMax(const LESS& less = LESS{}, const GREATER& greater = GREATER{}) : m_Min{less}, m_Max{greater} {} //! Define a function operator for use with std:: algorithms. inline bool operator()(const T& x) { return this->add(x); } //! Check if we would add \p x. bool wouldAdd(const T& x) const { return m_Min.wouldAdd(x) || m_Max.wouldAdd(x); } //! Update the statistic with the collection \p x. bool add(const std::vector<T>& x) { bool result{false}; for (const auto& xi : x) { result |= this->add(xi); } return result; } //! Update the statistic with \p n copies of \p x. bool add(const T& x, std::size_t n = 1) { bool result{false}; if (n > 0) { result |= m_Min.add(x); result |= m_Max.add(x); } return result; } //! Combine two statistics. This is equivalent to running a //! single accumulator on the entire collection. const CMinMax& operator+=(const CMinMax& rhs) { m_Min += rhs.m_Min; m_Max += rhs.m_Max; return *this; } //! Get the count of statistics. bool initialized() const { return m_Min.count() > 0; } //! Get the minimum value. T min() const { return m_Min[0]; } //! Get the maximum value. T max() const { return m_Max[0]; } //! Get the range. T range() const { return m_Max[0] - m_Min[0]; } //! Get the margin by which all the values have the same sign. T signMargin() const { if (this->initialized()) { if (m_Min[0] * m_Max[0] > T{0}) { return m_Min[0] > T{0} ? m_Min[0] : m_Max[0]; } } return T{0}; } //! Get a checksum for this object. std::uint64_t checksum() const; private: //! The set minimum. COrderStatisticsStack<T, 1, LESS> m_Min; //! The set maximum. COrderStatisticsStack<T, 1, GREATER> m_Max; }; }; namespace basic_statistics_detail { //! \brief Default custom add function for values to the central //! moments estimator. template<typename U> struct SCentralMomentsCustomAdd { template<typename T, unsigned int ORDER> static inline void add(const U& x, typename SCoordinate<T>::Type n, CBasicStatistics::SSampleCentralMoments<T, ORDER>& moments) { moments.add(x, n, 0); } }; } template<typename T, unsigned int ORDER> template<typename U> void CBasicStatistics::SSampleCentralMoments<T, ORDER>::add(const U& x, const TCoordinate& n) { basic_statistics_detail::SCentralMomentsCustomAdd<U>::add(x, n, *this); } //! \brief Defines a promoted type for a SSampleCentralMoments. //! //! \see CTypeConversions.h for details. template<typename T, unsigned int N> struct SPromoted<CBasicStatistics::SSampleCentralMoments<T, N>> { using Type = CBasicStatistics::SSampleCentralMoments<typename SPromoted<T>::Type, N>; }; //! \brief Defines SSampleCentralMoments on a suitable floating point type. //! //! \see CTypeConversions.h for details. template<typename T, unsigned int N, typename U> struct SFloatingPoint<CBasicStatistics::SSampleCentralMoments<T, N>, U> { using Type = CBasicStatistics::SSampleCentralMoments<typename SFloatingPoint<T, U>::Type, N>; }; } } } #endif // INCLUDED_ml_maths_common_CBasicStatistics_h

include/maths/common/CBasicStatistics.h (801 lines of code) (raw):