include/maths/common/CInformationCriteria.h

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #ifndef INCLUDED_ml_maths_common_CInformationCriteria_h #define INCLUDED_ml_maths_common_CInformationCriteria_h #include <core/Constants.h> #include <maths/common/CBasicStatisticsCovariances.h> #include <maths/common/CLinearAlgebra.h> #include <maths/common/CLinearAlgebraShims.h> #include <maths/common/CSphericalCluster.h> #include <maths/common/ImportExport.h> #include <cmath> #include <cstddef> #include <limits> #include <vector> namespace ml { namespace maths { namespace common { namespace information_criteria_detail { //! The confidence interval we use when computing the singular values //! of the covariance matrix. This is high to stop x-means creating //! clusters with small numbers of points where there is chance that //! the evidence from the covariance matrix is a fluke. MATHS_COMMON_EXPORT double confidence(double df); #define LOG_DETERMINANT(N) \ MATHS_COMMON_EXPORT \ double logDeterminant(const CSymmetricMatrixNxN<double, N>& c, double upper) LOG_DETERMINANT(2); LOG_DETERMINANT(3); LOG_DETERMINANT(4); LOG_DETERMINANT(5); #undef LOG_DETERMINANT //! The log determinant of our internal heap symmetric matrix. MATHS_COMMON_EXPORT double logDeterminant(const CSymmetricMatrix<double>& c, double upper); //! The log determinant of an Eigen matrix. MATHS_COMMON_EXPORT double logDeterminant(const CDenseMatrix<double>& c, double upper); } // information_criteria_detail:: //! Enumeration of different types of information criterion supported. enum EInfoCriterionType { E_AICc, E_BIC }; //! \brief Computes the information content of a collection of point //! clouds under the assumption that they are distributed as a weighted //! sum of spherically symmetric Gaussians. //! //! DESCRIPTION:\n //! This can calculate two types of information criterion values: Akaike //! and Bayes. The difference is in the treatment of the penalty on the //! maximum log likelihood due to the number of parameters. In particular, //! BIC is defined as: //! <pre class="fragment"> //! \f$BIC = -2 log(L(x)) - k log(n)\f$ //! </pre> //! //! Here, \f$L(x)\f$ is the maximum likelihood of the data, \f$k\f$ is //! the number of free parameters in the distribution and \f$n\f$ is the //! number of data points. The AICc is defined as: //! <pre class="fragment"> //! \f$AIC_c = -2 log(L(x)) + 2 k + \frac{k(k+1)}{n - k - 1}\f$ //! </pre> //! //! Here the finite sample correction applies to data that are normally //! distributed. The maximum likelihood as a function of the input data //! can be found by maximizing the log likelihood w.r.t. the free //! parameters, in this case the mean vector and the single variance //! parameter (from the spherical covariance assumption). Specifically, //! the maximum likelihood mean is //! <pre class="fragment"> //! \f$m = \frac{1}{n}\sum_{i=1}{n}{x_i}\f$ //! </pre> //! and the maximum likelihood variance is //! <pre class="fragment"> //! \f$v = \sum_{i=1}{n}{ (x_i - m)^t (x_i - m) }\f$ //! </pre> //! //! See also http://en.wikipedia.org/wiki/Bayesian_information_criterion //! and http://en.wikipedia.org/wiki/Akaike_information_criterion. template<typename POINT, EInfoCriterionType TYPE> class CSphericalGaussianInfoCriterion { public: using TPointVec = std::vector<POINT>; using TPointVecVec = std::vector<TPointVec>; using TBarePoint = typename SUnannotated<POINT>::Type; using TBarePointPrecise = typename SFloatingPoint<TBarePoint, double>::Type; using TCoordinate = typename SCoordinate<TBarePointPrecise>::Type; using TMeanVarAccumulator = typename CBasicStatistics::SSampleMeanVar<TBarePointPrecise>::TAccumulator; public: CSphericalGaussianInfoCriterion() = default; explicit CSphericalGaussianInfoCriterion(const TPointVecVec& x) { this->add(x); } explicit CSphericalGaussianInfoCriterion(const TPointVec& x) { this->add(x); } //! Update the sufficient statistics for computing info content. void add(const TPointVecVec& x) { for (const auto& xi : x) { this->add(xi); } } //! Update the sufficient statistics for computing info content. void add(const TPointVec& x) { if (x.size() > 0) { TMeanVarAccumulator moments(las::zero(x[0])); moments.add(x); this->add(moments); } } //! Update the sufficient statistics for computing info content. void add(const TMeanVarAccumulator& moments) { double ni = CBasicStatistics::count(moments); const TBarePointPrecise& mean{CBasicStatistics::mean(moments)}; const TBarePointPrecise& covarianceDiag{ CBasicStatistics::maximumLikelihoodVariance(moments)}; std::size_t d{las::dimension(covarianceDiag)}; double vi{0.0}; for (std::size_t i = 0; i < d; ++i) { vi += covarianceDiag(i); } vi = std::max(vi, EPS * las::norm(mean)); m_D = static_cast<double>(d); m_K += 1.0; m_N += ni; if (ni > 1.0) { double upper = information_criteria_detail::confidence(ni - 1.0); m_Likelihood += ni * log(ni) - 0.5 * m_D * ni * (1.0 + core::constants::LOG_TWO_PI + std::log(upper * vi / m_D)); } else { m_Likelihood += ni * log(ni) - 0.5 * m_D * ni * (1.0 + core::constants::LOG_TWO_PI + core::constants::LOG_MAX_DOUBLE); } } //! Calculate the information content of the clusters added so far. double calculate(double p = 0.0) const { if (m_N != 0.0) { double logN{std::log(m_N)}; p += m_D * m_K + 2.0 * m_K - 1.0; switch (TYPE) { case E_BIC: return -2.0 * (m_Likelihood - m_N * logN) + p * logN; case E_AICc: return -2.0 * (m_Likelihood - m_N * logN) + 2.0 * p + p * (p + 1.0) / (m_N - p - 1.0); } } return 0.0; } private: static constexpr double EPS{10.0 * std::numeric_limits<TCoordinate>::epsilon()}; private: //! The point dimension. double m_D = 0.0; //! The number of clusters. double m_K = 0.0; //! The number of points. double m_N = 0.0; //! The data likelihood for the k spherically symmetric Gaussians. double m_Likelihood = 0.0; }; //! \brief Computes the information content of a collection of point //! clouds under the assumption that they are distributed as a weighted //! sum of Gaussians. //! //! DESCRIPTION:\n //! This places no restriction on the covariance matrix in particular //! it is assumed to have \f$frac{D(D+1)}{2}\f$ parameters. For more //! details on the information criteria see CSphericalGaussianInfoCriterion. template<typename POINT, EInfoCriterionType TYPE> class CGaussianInfoCriterion { public: using TPointVec = std::vector<POINT>; using TPointVecVec = std::vector<TPointVec>; using TBarePoint = typename SUnannotated<POINT>::Type; using TBarePointPrecise = typename SFloatingPoint<TBarePoint, double>::Type; using TCoordinate = typename SCoordinate<TBarePointPrecise>::Type; using TCovariances = CBasicStatistics::SSampleCovariances<TBarePointPrecise>; public: CGaussianInfoCriterion() = default; explicit CGaussianInfoCriterion(const TPointVecVec& x) { this->add(x); } explicit CGaussianInfoCriterion(const TPointVec& x) { this->add(x); } //! Update the sufficient statistics for computing info content. void add(const TPointVecVec& x) { for (const auto& xi : x) { this->add(xi); } } //! Update the sufficient statistics for computing info content. void add(const TPointVec& x) { if (x.size() > 0) { TCovariances covariances(las::dimension(x[0])); covariances.add(x); this->add(covariances); } } //! Update the sufficient statistics for computing info content. void add(const TCovariances& covariance) { double ni{CBasicStatistics::count(covariance)}; m_D = static_cast<double>(las::dimension(CBasicStatistics::mean(covariance))); m_K += 1.0; m_N += ni; m_Likelihood += ni * log(ni) - 0.5 * ni * (m_D + m_D * core::constants::LOG_TWO_PI + (ni <= m_D + 1.0 ? core::constants::LOG_MAX_DOUBLE : this->logDeterminant(covariance))); } //! Calculate the information content of the clusters added so far. double calculate(double p = 0.0) const { if (m_N != 0.0) { double logN{std::log(m_N)}; p += m_D * (1.0 + 0.5 * (m_D + 1.0)) * m_K + m_K - 1.0; switch (TYPE) { case E_BIC: return -2.0 * (m_Likelihood - m_N * logN) + p * logN; case E_AICc: return -2.0 * (m_Likelihood - m_N * logN) + 2.0 * p + p * (p + 1.0) / (m_N - p - 1.0); } } return 0.0; } private: //! Compute the log of the determinant of \p covariance. double logDeterminant(const TCovariances& covariance) const { double n{CBasicStatistics::count(covariance)}; const auto& c = CBasicStatistics::maximumLikelihoodCovariances(covariance); double upper{information_criteria_detail::confidence(n - m_D - 1.0)}; return information_criteria_detail::logDeterminant(c, upper); } private: //! The point dimension. double m_D = 0.0; //! The number of clusters. double m_K = 0.0; //! The number of points. double m_N = 0.0; //! The data likelihood for the k Gaussians. double m_Likelihood = 0.0; }; } } } #endif // INCLUDED_ml_maths_common_CInformationCriteria_h

include/maths/common/CInformationCriteria.h (176 lines of code) (raw):