include/maths/analytics/CBoostedTreeLeafNodeStatisticsIncremental.h (100 lines of code) (raw):

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #ifndef INCLUDED_ml_maths_analytics_CBoostedTreeLeafNodeStatisticsIncremental_h #define INCLUDED_ml_maths_analytics_CBoostedTreeLeafNodeStatisticsIncremental_h #include <core/CPackedBitVector.h> #include <maths/analytics/CBoostedTreeLeafNodeStatistics.h> #include <maths/analytics/ImportExport.h> #include <maths/common/MathsTypes.h> #include <cstddef> #include <functional> #include <optional> namespace CBoostedTreeLeafNodeStatisticsTest { struct testComputeBestSplitStatisticsThreading; } namespace ml { namespace core { class CDataFrame; } namespace maths { namespace analytics { class CBoostedTreeNode; class CDataFrameCategoryEncoder; class CEncodedDataFrameRowRef; //! \brief Maintains a collection of statistics about a leaf of the regression //! tree as it is built. //! //! DESCRIPTION:\N //! The regression tree is grown top down by greedily selecting the split with //! the maximum gain (in the loss). This finds and scores the maximum gain split //! of a single leaf of the tree. //! //! This version is used for training incrementally. class MATHS_ANALYTICS_EXPORT CBoostedTreeLeafNodeStatisticsIncremental final : public CBoostedTreeLeafNodeStatistics { public: CBoostedTreeLeafNodeStatisticsIncremental(std::size_t id, const TSizeVec& extraColumns, std::size_t dimensionGradient, const core::CDataFrame& frame, const TRegularization& regularization, const TFloatVecVec& candidateSplits, const TSizeVec& treeFeatureBag, const TSizeVec& nodeFeatureBag, std::size_t depth, const core::CPackedBitVector& rowMask, CWorkspace& workspace); //! Only called by split but is public so it's accessible to std::make_shared. CBoostedTreeLeafNodeStatisticsIncremental(std::size_t id, const CBoostedTreeLeafNodeStatisticsIncremental& parent, const core::CDataFrame& frame, const TRegularization& regularization, const TSizeVec& treeFeatureBag, const TSizeVec& nodeFeatureBag, bool isLeftChild, const CBoostedTreeNode& split, CWorkspace& workspace); //! Only called by split but is public so it's accessible to std::make_shared. CBoostedTreeLeafNodeStatisticsIncremental(std::size_t id, CBoostedTreeLeafNodeStatisticsIncremental&& parent, const TRegularization& regularization, const TSizeVec& treeFeatureBag, const TSizeVec& nodeFeatureBag, bool isLeftChild, CWorkspace& workspace); CBoostedTreeLeafNodeStatisticsIncremental(const CBoostedTreeLeafNodeStatisticsIncremental&) = delete; CBoostedTreeLeafNodeStatisticsIncremental& operator=(const CBoostedTreeLeafNodeStatisticsIncremental&) = delete; // Move construction/assignment not possible due to const reference member. //! Apply the split defined by \p split. //! //! \return Shared pointers to the left and right child node statistics. TPtrPtrPr split(std::size_t leftChildId, std::size_t rightChildId, double gainThreshold, const core::CDataFrame& frame, const TRegularization& regularization, const TSizeVec& treeFeatureBag, const TSizeVec& nodeFeatureBag, const CBoostedTreeNode& split, CWorkspace& workspace) override; //! Get the size of this object. std::size_t staticSize() const override; private: using TFeatureBestSplitSearch = std::function<void(std::size_t)>; //! \brief Describes a split of the tree being incrementally retrained. struct MATHS_ANALYTICS_EXPORT SPreviousSplit { SPreviousSplit(std::size_t nodeIndex, std::size_t feature, double splitAt) : s_NodeIndex{nodeIndex}, s_Feature{feature}, s_SplitAt{splitAt} {} std::size_t s_NodeIndex; std::size_t s_Feature; double s_SplitAt; }; using TOptionalPreviousSplit = std::optional<SPreviousSplit>; private: CBoostedTreeLeafNodeStatisticsIncremental(const TSizeVec& extraColumns, std::size_t dimensionGradient, const TFloatVecVec& candidateSplits, CSplitsDerivatives derivatives); SSplitStatistics computeBestSplitStatistics(std::size_t numberThreads, const TRegularization& regularization, const TSizeVec& featureBag) const; TFeatureBestSplitSearch featureBestSplitSearch(const TRegularization& regularization, SSplitStatistics& bestSplitStatistics) const; double penaltyForTreeChange(const TRegularization& regularization, std::size_t feature, std::size_t split) const; TOptionalPreviousSplit rootPreviousSplit(const CWorkspace& workspace) const; TOptionalPreviousSplit leftChildPreviousSplit(std::size_t feature, const CWorkspace& workspace) const; TOptionalPreviousSplit rightChildPreviousSplit(std::size_t feature, const CWorkspace& workspace) const; private: TOptionalPreviousSplit m_PreviousSplit; friend struct CBoostedTreeLeafNodeStatisticsTest::testComputeBestSplitStatisticsThreading; }; } } } #endif // INCLUDED_ml_maths_analytics_CBoostedTreeLeafNodeStatisticsIncremental_h