include/maths/analytics/COutliers.h (607 lines of code) (raw):

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #ifndef INCLUDED_ml_maths_analytics_COutliers_h #define INCLUDED_ml_maths_analytics_COutliers_h #include <core/CDataFrame.h> #include <core/CHashing.h> #include <core/CNonInstantiatable.h> #include <core/CSmallVector.h> #include <core/Concurrency.h> #include <maths/analytics/ImportExport.h> #include <maths/common/CBasicStatistics.h> #include <maths/common/CKdTree.h> #include <maths/common/CLinearAlgebraShims.h> #include <maths/common/COrthogonaliser.h> #include <maths/common/CPRNG.h> #include <maths/common/CSampling.h> #include <maths/common/CTools.h> #include <algorithm> #include <cmath> #include <cstddef> #include <functional> #include <limits> #include <memory> #include <string> #include <vector> namespace ml { namespace maths { namespace analytics { class CDataFrameOutliersInstrumentationInterface; namespace outliers_detail { using TDoubleVec = std::vector<double>; using TDouble1Vec = core::CSmallVector<double, 1>; using TDouble2Vec = core::CSmallVector<double, 2>; using TDouble1VecVec = std::vector<TDouble1Vec>; using TDouble1VecVec2Vec = core::CSmallVector<TDouble1VecVec, 2>; using TDouble1Vec2Vec = core::CSmallVector<TDouble1Vec, 2>; using TProgressCallback = std::function<void(double)>; using TMemoryUsageCallback = std::function<void(std::int64_t)>; using TMeanAccumulator = common::CBasicStatistics::SSampleMean<double>::TAccumulator; using TMeanAccumulatorVec = std::vector<TMeanAccumulator>; //! \brief The interface for a nearest neighbour outlier calculation method. template<typename POINT, typename NEAREST_NEIGHBOURS> class CNearestNeighbourMethod { public: using TPointVec = std::vector<POINT>; using TMatrix = typename common::SConformableMatrix<POINT>::Type; public: CNearestNeighbourMethod(bool computeFeatureInfluence, std::size_t k, NEAREST_NEIGHBOURS lookup, TProgressCallback recordProgress) : m_ComputeFeatureInfluence{computeFeatureInfluence}, m_K{k}, m_Lookup{std::move(lookup)}, m_RecordProgress{std::move(recordProgress)} {} virtual ~CNearestNeighbourMethod() = default; //! Check whether to compute influences of features on the outlier scores. bool computeFeatureInfluence() const { return m_ComputeFeatureInfluence; } //! The number of points. std::size_t n() const { return m_Lookup.size(); } //! The number of nearest neighbours. std::size_t k() const { return m_K; } //! Get the nearest neighbours lookup. const NEAREST_NEIGHBOURS& lookup() const { return m_Lookup; } //! Compute the outlier scores for \p points. TDouble1VecVec2Vec run(const TPointVec& points, const TMatrix& projection, double eps, std::size_t numberScores) { this->setup(points, projection); // We call add exactly once for each point. Scores is presized // so any writes to it are safe. TDouble1VecVec2Vec scores(this->numberMethods(), TDouble1VecVec(numberScores)); core::parallel_for_each( points.begin(), points.end(), [&, neighbours = TPointVec{} ](const POINT& point) mutable { m_Lookup.nearestNeighbours(m_K + 1, point, neighbours); this->add(point, projection, eps, neighbours, scores); }, [this](double fractionalProgress) { this->recordProgress(fractionalProgress); }); this->compute(points, projection, eps, scores); return scores; } //! Recover any temporary memory used by run. virtual void recoverMemory() {} //! Get the size of this object. virtual std::size_t staticSize() const { return sizeof(*this); } //! Get the memory that the method uses. virtual std::size_t memoryUsage() const { return 0; } //! \name Progress Monitoring //@{ //! Get the progress recorder. TProgressCallback& progressRecorder() { return m_RecordProgress; } //! Record \p fractionalProgress. void recordProgress(double fractionalProgress) { m_RecordProgress(fractionalProgress); } //@} //! Get a human readable description of the outlier detection method. virtual std::string print() const { return this->name() + "(n = " + std::to_string(this->n()) + ", k = " + std::to_string(m_K) + ")"; } virtual void setup(const TPointVec&, const TMatrix&) {} virtual void add(const POINT&, const TMatrix&, double, const TPointVec&, TDouble1VecVec&) { } virtual void compute(const TPointVec&, const TMatrix&, double, TDouble1VecVec&) {} protected: static double distanceAtEps(const TMatrix& projection, std::size_t j, double eps, const POINT& x, const POINT& y) { double result{0.0}; for (std::size_t i = 0; i < common::las::dimension(x); ++i) { result += common::CTools::pow2(x(i) + eps * projection(i, j) - y(i)); } return std::sqrt(result); } private: virtual void add(const POINT& point, const TMatrix& projection, double eps, const TPointVec& neighbours, TDouble1VecVec2Vec& scores) { this->add(point, projection, eps, neighbours, scores[0]); } virtual void compute(const TPointVec& points, const TMatrix& projection, double eps, TDouble1VecVec2Vec& scores) { this->compute(points, projection, eps, scores[0]); } virtual std::size_t numberMethods() const { return 1; } virtual std::string name() const = 0; private: bool m_ComputeFeatureInfluence; std::size_t m_K; NEAREST_NEIGHBOURS m_Lookup; TProgressCallback m_RecordProgress; }; //! \brief Computes the local outlier factor score. template<typename POINT, typename NEAREST_NEIGHBOURS> class CLof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> { public: using TPointVec = std::vector<POINT>; using TMatrix = typename common::SConformableMatrix<POINT>::Type; using TCoordinate = typename common::SCoordinate<POINT>::Type; using TCoordinateVec = std::vector<TCoordinate>; using TUInt32CoordinatePr = std::pair<std::uint32_t, TCoordinate>; using TUInt32CoordinatePrVec = std::vector<TUInt32CoordinatePr>; using CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>::distanceAtEps; static const TCoordinate UNSET_DISTANCE; public: CLof(bool computeFeatureInfluence, std::size_t k, TProgressCallback recordProgress, NEAREST_NEIGHBOURS lookup = NEAREST_NEIGHBOURS()) : CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>{ computeFeatureInfluence, k, std::move(lookup), std::move(recordProgress)} {} void recoverMemory() override { m_KDistances.resize(this->k() * m_StartAddresses); m_KDistances.shrink_to_fit(); m_Lrd.resize(m_StartAddresses); m_Lrd.shrink_to_fit(); if (this->computeFeatureInfluence()) { m_LrdAtEps.resize(m_NumberInfluences * m_StartAddresses); m_LrdAtEps.shrink_to_fit(); } } std::size_t staticSize() const override { return sizeof(*this); } std::size_t memoryUsage() const override { return core::memory::dynamicSize(m_KDistances) + core::memory::dynamicSize(m_Lrd) + core::memory::dynamicSize(m_LrdAtEps); } static std::size_t estimateOwnMemoryOverhead(bool computeFeatureInfluence, std::size_t k, std::size_t numberPoints, std::size_t dimension) { return numberPoints * (k * sizeof(TUInt32CoordinatePr) + (computeFeatureInfluence ? dimension + 1 : 1) * sizeof(TCoordinate)); } private: void setup(const TPointVec& points, const TMatrix& projection) override { m_NumberInfluences = common::las::columns(projection); auto minmax = std::minmax_element( points.begin(), points.end(), [](const POINT& lhs, const POINT& rhs) { return lhs.annotation() < rhs.annotation(); }); m_StartAddresses = minmax.first->annotation(); m_EndAddresses = minmax.second->annotation() + 1; // In the following, we first shrink then grow to ensure we overwrite // values in the range [start addresses, end addresses). std::size_t k{this->k()}; m_KDistances.resize(k * m_StartAddresses, {std::uint32_t{0}, UNSET_DISTANCE}); m_KDistances.resize(k * m_EndAddresses, {std::uint32_t{0}, UNSET_DISTANCE}); m_Lrd.resize(m_StartAddresses, UNSET_DISTANCE); m_Lrd.resize(m_EndAddresses, UNSET_DISTANCE); if (this->computeFeatureInfluence()) { m_LrdAtEps.resize(m_NumberInfluences * m_StartAddresses, UNSET_DISTANCE); m_LrdAtEps.resize(m_NumberInfluences * m_EndAddresses, UNSET_DISTANCE); } } void add(const POINT& point, const TMatrix&, double, const TPointVec& neighbours, TDouble1VecVec&) override { // This is called exactly once for each point therefore an element // of m_KDistances is only ever written by one thread. if (neighbours.size() < 2) { return; } std::size_t i{point.annotation()}; std::size_t a(point == neighbours[0] ? 1 : 0); std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; for (std::size_t j = a; j <= b; ++j) { std::size_t index{this->kDistanceIndex(i, j - a)}; m_KDistances[index].first = static_cast<std::uint32_t>(neighbours[j].annotation()); m_KDistances[index].second = common::las::distance(point, neighbours[j]); } } void compute(const TPointVec& points, const TMatrix& projection, double eps, TDouble1VecVec& scores) override { this->computeLocalReachabilityDistances(points, projection, eps); this->computeLocalOutlierFactors(points, eps, scores); } std::string name() const override { return "lof"; } void computeLocalReachabilityDistances(const TPointVec& points, const TMatrix& projection, double eps) { // We bind minimum accumulators (by value) to each lambda (since // one copy is then accessed by each thread) and take the minimum // of these at the end. using TMinAccumulator = common::CBasicStatistics::SMin<double>::TAccumulator; auto results = core::parallel_for_each( points.begin(), points.end(), core::bindRetrievableState( [this](TMinAccumulator& min, const POINT& point) mutable { std::size_t i{point.annotation()}; TMeanAccumulator reachability_; for (std::size_t j = 0; j < this->k(); ++j) { const auto& neighbour = m_KDistances[this->kDistanceIndex(i, j)]; if (distance(neighbour) != UNSET_DISTANCE) { reachability_.add(this->reachabilityDistance(neighbour)); } } double reachability{common::CBasicStatistics::mean(reachability_)}; if (reachability > 0.0) { m_Lrd[i] = 1.0 / reachability; min.add(reachability); } }, TMinAccumulator{})); TMinAccumulator min; for (const auto& result : results) { min += result.s_FunctionState; } if (min.count() > 0) { // Use twice the maximum "density" at any other point if there are // k-fold duplicates. Note it is possible that all lookup points are // duplicates, in which case we need to set their local reachability // density in this loop. The overwritten densities are reset in setup. for (std::size_t i = 0; i < m_EndAddresses; ++i) { if (m_Lrd[i] == UNSET_DISTANCE) { m_Lrd[i] = 2.0 / min[0]; } } } if (eps <= 0.0 || this->computeFeatureInfluence() == false) { return; } // Unfortunately, we need to look up nearest neighbours again or cache // the distance for each coordinate perturbation for each neighbour // which is prohibitive. core::parallel_for_each( points.begin(), points.end(), [&, neighbours = TPointVec{} ](const POINT& point) mutable { this->lookup().nearestNeighbours(this->k() + 1, point, neighbours); if (neighbours.size() < 2) { return; } std::size_t i{point.annotation()}; std::size_t a(point == neighbours[0] ? 1 : 0); std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; for (std::size_t k = 0; k < m_NumberInfluences; ++k) { TMeanAccumulator reachability_; for (std::size_t j = a; j <= b; ++j) { reachability_.add(this->reachabilityDistance( {neighbours[j].annotation(), distanceAtEps(projection, k, eps, point, neighbours[j])})); } double reachability{common::CBasicStatistics::mean(reachability_)}; m_LrdAtEps[this->epsLrdIndex(i, k)] = 1.0 / std::max(reachability, min[0] / 2.0); } }); } void computeLocalOutlierFactors(const TPointVec& points, double eps, TDouble1VecVec& scores) { core::parallel_for_each(points.begin(), points.end(), [&](const POINT& point) mutable { std::size_t i{point.annotation()}; TMeanAccumulator neighbourhoodLrd; for (std::size_t j = 0; j < this->k(); ++j) { const auto& neighbour = m_KDistances[this->kDistanceIndex(i, j)]; if (distance(neighbour) != UNSET_DISTANCE) { neighbourhoodLrd.add(m_Lrd[index(neighbour)]); } } scores[i].resize(eps > 0.0 && this->computeFeatureInfluence() ? m_NumberInfluences + 1 : 1); scores[i][0] = common::CBasicStatistics::mean(neighbourhoodLrd) / m_Lrd[i]; // We choose to ignore the impact of moving the point on its // neighbours' local reachability distances when computing // scores for each coordinate. for (std::size_t j = 1; j < scores[i].size(); ++j) { scores[i][j] = common::CBasicStatistics::mean(neighbourhoodLrd) / m_LrdAtEps[this->epsLrdIndex(i, j - 1)]; } }); } std::size_t kDistanceIndex(std::size_t index, std::size_t neighbourIndex) const { return index * this->k() + neighbourIndex; } std::size_t epsLrdIndex(std::size_t index, std::size_t coordinate) const { return index * m_NumberInfluences + coordinate; } static std::size_t index(const TUInt32CoordinatePr& neighbour) { return neighbour.first; } static double distance(const TUInt32CoordinatePr& neighbour) { return neighbour.second; } double reachabilityDistance(const TUInt32CoordinatePr& neighbour) const { return std::max(this->kdistance(index(neighbour)), distance(neighbour)); } double kdistance(std::size_t i) const { for (std::size_t j = this->k(); j > 0; --j) { double dist{distance(m_KDistances[this->kDistanceIndex(i, j - 1)])}; if (dist != UNSET_DISTANCE) { return dist; } } return UNSET_DISTANCE; } private: std::size_t m_NumberInfluences; std::size_t m_StartAddresses; std::size_t m_EndAddresses; // The k distances to the neighbouring points of each point are stored // flattened: [neighbours of 0, neighbours of 1,...]. TUInt32CoordinatePrVec m_KDistances; TCoordinateVec m_Lrd; // The epsilon local reachability distances are stored flattened: // [coordinates of 0, coordinates of 2, ...]. TCoordinateVec m_LrdAtEps; }; template<typename POINT, typename NEAREST_NEIGHBOURS> const typename CLof<POINT, NEAREST_NEIGHBOURS>::TCoordinate CLof<POINT, NEAREST_NEIGHBOURS>::UNSET_DISTANCE(-1.0); //! \brief Computes the local distance based outlier score. template<typename POINT, typename NEAREST_NEIGHBOURS> class CLdof final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> { public: CLdof(bool computeFeatureInfluence, std::size_t k, TProgressCallback recordProgress, NEAREST_NEIGHBOURS lookup = NEAREST_NEIGHBOURS()) : CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>{ computeFeatureInfluence, k, std::move(lookup), std::move(recordProgress)} {} private: using TMatrix = typename common::SConformableMatrix<POINT>::Type; using CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>::distanceAtEps; private: void add(const POINT& point, const TMatrix& projection, double eps, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override { std::size_t numberInfluences{common::las::columns(projection)}; auto& score = scores[point.annotation()]; score.assign( eps > 0.0 && this->computeFeatureInfluence() ? numberInfluences + 1 : 1, 0.0); if (neighbours.size() < 2) { return; } auto ldof = [](const TMeanAccumulator& d, const TMeanAccumulator& D) { return common::CBasicStatistics::mean(D) > 0.0 ? common::CBasicStatistics::mean(d) / common::CBasicStatistics::mean(D) : 0.0; }; std::size_t a(point == neighbours[0] ? 1 : 0); std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; TMeanAccumulator d; TMeanAccumulator D; for (std::size_t i = a; i <= b; ++i) { d.add(common::las::distance(point, neighbours[i])); for (std::size_t j = 1; j < i; ++j) { D.add(common::las::distance(neighbours[i], neighbours[j])); } } score[0] = ldof(d, D); for (std::size_t j = 1; j < score.size(); ++j) { d = TMeanAccumulator{}; for (std::size_t i = a; i <= b; ++i) { d.add(distanceAtEps(projection, j - 1, eps, point, neighbours[i])); } score[j] = ldof(d, D); } } std::string name() const override { return "ldof"; } }; //! \brief Computes the distance to the k'th nearest neighbour score. template<typename POINT, typename NEAREST_NEIGHBOURS> class CDistancekNN final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> { public: CDistancekNN(bool computeFeatureInfluence, std::size_t k, TProgressCallback recordProgress, NEAREST_NEIGHBOURS lookup = NEAREST_NEIGHBOURS()) : CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>{ computeFeatureInfluence, k, std::move(lookup), std::move(recordProgress)} {} private: using TMatrix = typename common::SConformableMatrix<POINT>::Type; using CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>::distanceAtEps; private: void add(const POINT& point, const TMatrix& projection, double eps, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override { std::size_t numberInfluences{common::las::columns(projection)}; auto& score = scores[point.annotation()]; score.assign( eps > 0.0 && this->computeFeatureInfluence() ? numberInfluences + 1 : 1, 0.0); if (neighbours.size() < 2) { return; } std::size_t k{std::min(this->k() + 1, neighbours.size() - 1) - (point == neighbours[0] ? 0 : 1)}; const auto& kthNeighbour = neighbours[k]; score[0] = common::las::distance(point, kthNeighbour); for (std::size_t i = 1; i < score.size(); ++i) { score[i] = distanceAtEps(projection, i - 1, eps, point, kthNeighbour); } } std::string name() const override { return "knn"; } }; //! \brief Computes the total distance to the k nearest neighbours score. template<typename POINT, typename NEAREST_NEIGHBOURS> class CTotalDistancekNN final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> { public: CTotalDistancekNN(bool computeFeatureInfluence, std::size_t k, TProgressCallback recordProgress, NEAREST_NEIGHBOURS lookup = NEAREST_NEIGHBOURS()) : CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>{ computeFeatureInfluence, k, std::move(lookup), std::move(recordProgress)} {} private: using TMatrix = typename common::SConformableMatrix<POINT>::Type; using CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>::distanceAtEps; private: void add(const POINT& point, const TMatrix& projection, double eps, const std::vector<POINT>& neighbours, TDouble1VecVec& scores) override { std::size_t numberInfluences{common::las::columns(projection)}; auto& score = scores[point.annotation()]; score.assign( eps > 0.0 && this->computeFeatureInfluence() ? numberInfluences + 1 : 1, 0.0); if (neighbours.size() < 2) { return; } std::size_t a(point == neighbours[0] ? 1 : 0); std::size_t b{std::min(this->k() + a - 1, neighbours.size() + a - 2)}; for (std::size_t i = a; i <= b; ++i) { score[0] += common::las::distance(point, neighbours[i]); for (std::size_t j = 1; j < score.size(); ++j) { score[j] += distanceAtEps(projection, j - 1, eps, point, neighbours[i]); } } for (std::size_t i = 0; i < score.size(); ++i) { score[i] /= static_cast<double>(this->k()); } } std::string name() const override { return "tnn"; } }; //! \brief A composite method. //! //! IMPLEMENTATION DECISIONS:\n //! This is used in conjunction with CEnsemble so we can share nearest //! neighbour lookups for all methods in an ensemble model. template<typename POINT, typename NEAREST_NEIGHBOURS> class CMultipleMethods final : public CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS> { public: using TPointVec = std::vector<POINT>; using TMatrix = typename common::SConformableMatrix<POINT>::Type; using TMethodUPtr = std::unique_ptr<CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>>; using TMethodUPtrVec = std::vector<TMethodUPtr>; public: CMultipleMethods(std::size_t k, TMethodUPtrVec methods, NEAREST_NEIGHBOURS lookup = NEAREST_NEIGHBOURS()) : CNearestNeighbourMethod<POINT, NEAREST_NEIGHBOURS>{methods[0]->computeFeatureInfluence(), k, std::move(lookup), methods[0]->progressRecorder()}, m_Methods{std::move(methods)} {} void recoverMemory() override { for (auto& model : m_Methods) { model->recoverMemory(); } } std::size_t staticSize() const override { return sizeof(*this); } std::size_t memoryUsage() const override { return core::memory::dynamicSize(m_Methods); } std::string print() const override { std::string result; result += "{"; for (const auto& method : m_Methods) { result += " " + method->print(); } result += " }"; return result; } private: std::size_t numberMethods() const override { return m_Methods.size(); } void setup(const TPointVec& points, const TMatrix& projection) override { for (auto& method : m_Methods) { method->setup(points, projection); } } void add(const POINT& point, const TMatrix& projection, double eps, const TPointVec& neighbours, TDouble1VecVec2Vec& scores) override { for (std::size_t i = 0; i < m_Methods.size(); ++i) { m_Methods[i]->add(point, projection, eps, neighbours, scores[i]); } } void compute(const TPointVec& points, const TMatrix& projection, double eps, TDouble1VecVec2Vec& scores) override { for (std::size_t i = 0; i < m_Methods.size(); ++i) { m_Methods[i]->compute(points, projection, eps, scores[i]); } } std::string name() const override { return "multiple"; } private: TMethodUPtrVec m_Methods; }; } //! \brief Utilities for computing outlier scores for collections of points. class MATHS_ANALYTICS_EXPORT COutliers : private core::CNonInstantiatable { public: using TDoubleVec = outliers_detail::TDoubleVec; using TMeanVarAccumulator = common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator; using TProgressCallback = outliers_detail::TProgressCallback; using TMemoryUsageCallback = outliers_detail::TMemoryUsageCallback; template<typename POINT> using TAnnotatedPoint = common::CAnnotatedVector<POINT, std::size_t>; //! \name Method Names //@{ static const std::string LOF; static const std::string LDOF; static const std::string DISTANCE_KNN; static const std::string TOTAL_DISTANCE_KNN; static const std::string ENSEMBLE; //@} //! Instrumentation phase. static const std::string COMPUTING_OUTLIERS; //! Used to compute numeric derivative for influence. static constexpr double EPS{0.01}; //! The outlier detection methods which are available. enum EMethod { E_Lof, E_Ldof, E_DistancekNN, E_TotalDistancekNN, E_Ensemble }; //! \brief The parameters for compute. struct SComputeParameters { //! The number of threads available. std::size_t s_NumberThreads; //! The number of partitions to use. std::size_t s_NumberPartitions; //! Standardize the column values before computing outlier scores. bool s_StandardizeColumns; //! The methods to use. EMethod s_Method; //! The number of neighbours to use if non-zero. std::size_t s_NumberNeighbours; //! If true also compute the feature influence. bool s_ComputeFeatureInfluence; //! The fraction of true outliers among the points. double s_OutlierFraction; }; public: //! Compute outliers for \p frame and write to a new column. //! //! \param[in] params The calculation parameters. //! \param[in] frame The data frame whose rows hold the coordinated of //! the points for which to compute outliers. //! \param[in] instrumentation Manages writing out telemetry. static void compute(const SComputeParameters& params, core::CDataFrame& frame, CDataFrameOutliersInstrumentationInterface& instrumentation); //! Estimate the amount of memory that will be used computing outliers //! for a data frame. //! //! \param[in] params The calculation parameters. //! \param[in] totalNumberPoints The total number of points for which //! outlier scores will be computed. //! \param[in] partitionNumberPoints The number of points per partition //! for which outlier scores will be computed. //! \param[in] dimension The dimension of the points for which outliers //! will be computed. static std::size_t estimateMemoryUsedByCompute(const SComputeParameters& params, std::size_t totalNumberPoints, std::size_t partitionNumberPoints, std::size_t dimension); //! Return string representation of the \p method. static const std::string& print(EMethod method); //! \name Test Interface //@{ //! Compute the normalized LOF scores for \p points. //! //! See https://en.wikipedia.org/wiki/Local_outlier_factor for details. //! //! \param[in] k The number of nearest neighbours to use. //! \param[in] points The points for which to compute scores. //! \param[out] scores The scores of \p points. template<typename POINT> static void lof(std::size_t k, std::vector<POINT> points, TDoubleVec& scores) { compute<outliers_detail::CLof>(k, std::move(points), scores); } //! Compute normalized local distance based outlier scores for \p points. //! //! See https://arxiv.org/pdf/0903.3257.pdf for details. //! //! \param[in] k The number of nearest neighbours to use. //! \param[in] points The points for which to compute scores. //! \param[out] scores The scores of \p points. template<typename POINT> static void ldof(std::size_t k, std::vector<POINT> points, TDoubleVec& scores) { compute<outliers_detail::CLdof>(k, std::move(points), scores); } //! Compute the normalized distance to the k-th nearest neighbour //! outlier scores for \p points. //! //! \param[in] k The number of nearest neighbours to use. //! \param[in] points The points for which to compute scores. //! \param[out] scores The scores of \p points. template<typename POINT> static void distancekNN(std::size_t k, std::vector<POINT> points, TDoubleVec& scores) { compute<outliers_detail::CDistancekNN>(k, std::move(points), scores); } //! Compute the normalized mean distance to the k nearest neighbours //! outlier scores for \p points. //! //! \param[in] k The number of nearest neighbours to use. //! \param[in] points The points for which to compute scores. //! \param[out] scores The scores of \p points. template<typename POINT> static void totalDistancekNN(std::size_t k, std::vector<POINT> points, TDoubleVec& scores) { compute<outliers_detail::CTotalDistancekNN>(k, std::move(points), scores); } //@} private: //! Estimate the amount of memory that will be used computing outliers //! for a data frame using POINT point type. template<typename POINT> static std::size_t estimateMemoryUsedByCompute(const SComputeParameters& params, std::size_t totalNumberPoints, std::size_t partitionNumberPoints, std::size_t dimension); //! Compute normalised outlier scores for a specified method. template<template<typename, typename> class METHOD, typename POINT> static void compute(std::size_t k, std::vector<POINT> points, TDoubleVec& scores) { using TPoint = TAnnotatedPoint<POINT>; using TMethod = METHOD<TPoint, common::CKdTree<TPoint>>; using TMatrix = typename common::SConformableMatrix<TPoint>::Type; if (points.size() > 0) { auto annotatedPoints = annotate(std::move(points)); common::CKdTree<TAnnotatedPoint<POINT>> lookup; lookup.reserve(points.size()); lookup.build(annotatedPoints); TMethod scorer{false, k, noopRecordProgress, std::move(lookup)}; TMatrix projection{common::SIdentity<TMatrix>::get( common::las::dimension(annotatedPoints[0]))}; auto scores_ = scorer.run(annotatedPoints, projection, EPS, annotatedPoints.size()); scores.resize(scores_[0].size()); for (std::size_t i = 0; i < scores.size(); ++i) { scores[i] = scores_[0][i][0]; } } } //! Create points annotated with their index in \p points. template<typename POINT> static std::vector<TAnnotatedPoint<POINT>> annotate(std::vector<POINT> points) { std::vector<TAnnotatedPoint<POINT>> annotatedPoints; annotatedPoints.reserve(points.size()); for (std::size_t i = 0; i < points.size(); ++i) { annotatedPoints.emplace_back(std::move(points[i]), i); } return annotatedPoints; } private: static void noopRecordProgress(double); static void noopRecordMemoryUsage(std::int64_t); }; } } } #endif // INCLUDED_ml_maths_analytics_COutliers_h