include/maths/analytics/CTreeShapFeatureImportance.h (144 lines of code) (raw):

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #ifndef INCLUDED_ml_maths_analytics_CTreeShapFeatureImportance_h #define INCLUDED_ml_maths_analytics_CTreeShapFeatureImportance_h #include <maths/analytics/CBoostedTree.h> #include <maths/analytics/ImportExport.h> #include <maths/common/CLinearAlgebraEigen.h> #include <vector> namespace ml { namespace core { class CDataFrame; } namespace maths { namespace analytics { //! \brief Computes SHAP (SHapley Additive exPlanation) values for feature importance estimation for gradient boosting //! trees. //! //! DESCRIPTION:\n //! SHAP values is a unique consistent and locally accurate attribution value. This mean that the sum of the SHAP //! feature importance values approximates the model prediction up to a constant bias. This implementation follows the //! algorithm "Consistent Individualized Feature Attribution for Tree Ensembles" by Lundberg, Erion, and Lee. //! The algorithm has the complexity O(TLD^2) where T is the number of trees, L is the maximum number of leaves in the //! tree, and D is the maximum depth of a tree in the ensemble. class MATHS_ANALYTICS_EXPORT CTreeShapFeatureImportance { public: using TIntVec = std::vector<int>; using TDoubleVec = std::vector<double>; using TDoubleVecVec = std::vector<TDoubleVec>; using TSizeVec = std::vector<std::size_t>; using TStrVec = std::vector<std::string>; using TRowRef = core::CDataFrame::TRowRef; using TTree = std::vector<CBoostedTreeNode>; using TTreeVec = std::vector<TTree>; using TVector = common::CDenseVector<double>; using TVectorVec = std::vector<TVector>; using TVectorVecVec = std::vector<TVectorVec>; using TShapWriter = std::function<void(const TSizeVec&, const TStrVec&, const TVectorVec&)>; public: CTreeShapFeatureImportance(std::size_t numberThreads, const core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder, TTreeVec& trees, std::size_t numberTopShapValues); //! Compute SHAP values for the data in frame for which this was constructed. //! //! The results for each row of m_Frame are passed to \p writer from up to //! m_NumberThreads threads simultaneously. Results are passed as a vector //! of values where the i'th value corresponds to the i'th input feature to //! m_Encoder. void shap(const TRowRef& row, TShapWriter writer); //! Compute the number of rows of \p frame reaching each node in the \p forest. static void computeNumberSamples(std::size_t numberThreads, const core::CDataFrame& frame, const CDataFrameCategoryEncoder& encoder, TTreeVec& forest); //! Compute inner node values as weighted average of the children (leaf) values. //! //! The weights are the number of rows of \p frame reaching each node. static void computeInternalNodeValues(TTreeVec& forest); //! Get the maximum depth of any tree in \p forest. static std::size_t depth(const TTreeVec& forest); //! Get the column names. const TStrVec& columnNames() const; //! Get the baseline. TVector baseline() const; private: //! Collects the elements of the path through decision tree that are updated together struct SPathElement { double s_FractionOnes = 1.0; double s_FractionZeros = 1.0; int s_FeatureIndex = -1; }; using TElementVec = std::vector<SPathElement>; using TElementVecVec = std::vector<TElementVec>; using TElementItr = TElementVec::iterator; using TDoubleVecItr = TDoubleVec::iterator; class CSplitPath { public: CSplitPath(TElementItr fractionsIterator, TDoubleVecItr scaleIterator) : m_FractionsIterator{fractionsIterator}, m_ScaleIterator{scaleIterator} {} CSplitPath(const CSplitPath& parentSplitPath, int nextIndex) : CSplitPath(parentSplitPath.fractionsBegin() + nextIndex, parentSplitPath.scaleBegin() + nextIndex) { std::copy(parentSplitPath.fractionsBegin(), parentSplitPath.fractionsBegin() + nextIndex, this->fractionsBegin()); std::copy(parentSplitPath.scaleBegin(), parentSplitPath.scaleBegin() + nextIndex, this->scaleBegin()); } TElementItr& fractions() { return m_FractionsIterator; } const TElementItr& fractions() const { return m_FractionsIterator; } TDoubleVecItr& scale() { return m_ScaleIterator; } const TDoubleVecItr& scale() const { return m_ScaleIterator; } SPathElement& operator[](int index) { return m_FractionsIterator[index]; } TElementItr& fractionsBegin() { return m_FractionsIterator; } const TElementItr& fractionsBegin() const { return m_FractionsIterator; } TDoubleVecItr& scaleBegin() { return m_ScaleIterator; } const TDoubleVecItr& scaleBegin() const { return m_ScaleIterator; } void setValues(int index, double fractionOnes, double fractionZeros, int featureIndex) { m_FractionsIterator[index].s_FractionOnes = fractionOnes; m_FractionsIterator[index].s_FractionZeros = fractionZeros; m_FractionsIterator[index].s_FeatureIndex = featureIndex; } void scale(int index, double value) { m_ScaleIterator[index] = value; } double scale(int index) const { return m_ScaleIterator[index]; } int featureIndex(int nextIndex) const { return m_FractionsIterator[nextIndex].s_FeatureIndex; } double fractionZeros(int nextIndex) const { return m_FractionsIterator[nextIndex].s_FractionZeros; } double fractionOnes(int nextIndex) const { return m_FractionsIterator[nextIndex].s_FractionOnes; } int find(int feature, int nextIndex) const { auto featureIndexEnd{(this->fractionsBegin() + nextIndex)}; auto it = std::find_if(this->fractionsBegin(), featureIndexEnd, [feature](const SPathElement& el) { return el.s_FeatureIndex == feature; }); if (it != featureIndexEnd) { return static_cast<int>(std::distance(this->fractionsBegin(), it)); } else { return -1; } } private: TElementItr m_FractionsIterator; TDoubleVecItr m_ScaleIterator; }; private: static void computeInternalNodeValues(TTree& tree, std::size_t nodeIndex); static std::size_t depth(const TTree& tree, std::size_t nodeIndex); //! Recursively traverses all pathes in the \p tree and updated SHAP values once it hits a leaf. //! Ref. Algorithm 2 in the paper by Lundberg et al. void shapRecursive(const TTree& tree, const CEncodedDataFrameRowRef& encodedRow, std::size_t nodeIndex, double parentFractionZero, double parentFractionOne, int parentFeatureIndex, const CSplitPath& path, int nextIndex, TVectorVec& shap) const; //! Extend the \p path object, update the variables and factorial scaling coefficients. static void extendPath(CSplitPath& splitPath, double fractionZero, double fractionOne, int featureIndex, int& nextIndex); //! Sum the scaling coefficients for the \p scalePath without the feature defined in \p pathIndex. static double sumUnwoundPath(const CSplitPath& path, int pathIndex, int nextIndex); //! Updated the scaling coefficients in the \p path if the feature defined in \p pathIndex was seen again. static void unwindPath(CSplitPath& path, int pathIndex, int& nextIndex); private: std::size_t m_NumberTopShapValues; const CDataFrameCategoryEncoder* m_Encoder; const TTreeVec* m_Forest; TStrVec m_ColumnNames; TElementVecVec m_PathStorage; TDoubleVecVec m_ScaleStorage; TVectorVecVec m_PerThreadShapValues; TVectorVec m_ReducedShapValues; TSizeVec m_TopShapValues; }; } } } #endif // INCLUDED_ml_maths_analytics_CTreeShapFeatureImportance_h