lib/maths/analytics/CBoostedTreeLeafNodeStatisticsScratch.cc

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#include <maths/analytics/CBoostedTreeLeafNodeStatisticsScratch.h>

#include <core/CDataFrame.h>
#include <core/CLogger.h>
#include <core/CMemoryDef.h>

#include <maths/analytics/CBoostedTree.h>
#include <maths/analytics/CDataFrameCategoryEncoder.h>

#include <maths/common/CTools.h>

#include <limits>

namespace ml {
namespace maths {
namespace analytics {
namespace {
const std::size_t ASSIGN_MISSING_TO_LEFT{0};
const std::size_t ASSIGN_MISSING_TO_RIGHT{1};
const double INF{std::numeric_limits<double>::max()};
}

CBoostedTreeLeafNodeStatisticsScratch::CBoostedTreeLeafNodeStatisticsScratch(
    std::size_t id,
    const TSizeVec& extraColumns,
    std::size_t dimensionGradient,
    const core::CDataFrame& frame,
    const TRegularization& regularization,
    const TFloatVecVec& candidateSplits,
    const TSizeVec& treeFeatureBag,
    const TSizeVec& nodeFeatureBag,
    std::size_t depth,
    const core::CPackedBitVector& rowMask,
    CWorkspace& workspace)
    : CBoostedTreeLeafNodeStatistics{id, depth, extraColumns, dimensionGradient, candidateSplits} {

    this->computeAggregateLossDerivatives(CLookAheadBound{}, workspace.numberThreads(),
                                          frame, treeFeatureBag, rowMask, workspace);

    // Lazily copy the mask and derivatives to avoid unnecessary allocations.

    this->derivatives().swap(workspace.reducedDerivatives(treeFeatureBag));
    this->bestSplitStatistics() = this->computeBestSplitStatistics(
        workspace.numberThreads(), regularization, nodeFeatureBag);
    workspace.reducedDerivatives(treeFeatureBag).swap(this->derivatives());

    if (this->gain() >= workspace.minimumGain()) {
        this->derivatives() = workspace.copy(workspace.derivatives()[0]);
        this->rowMask() = rowMask;
    }
}
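// A note on the swap-compute-swap pattern used by this file's constructors:
// the leaf borrows the workspace's reduced derivatives for the duration of
// the best split search and hands them back immediately afterwards. Only
// when the leaf's gain clears the workspace's minimum gain, i.e. when the
// leaf might actually be split, does it pay for a private copy of the
// derivatives and the row mask. Schematically (simplified; bag stands for
// the tree feature bag):
//
//     this->derivatives().swap(workspace.reducedDerivatives(bag)); // borrow
//     /* ... search for the best split using the borrowed derivatives ... */
//     workspace.reducedDerivatives(bag).swap(this->derivatives()); // return
//     if (this->gain() >= workspace.minimumGain()) { /* copy to keep */ }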
CBoostedTreeLeafNodeStatisticsScratch::CBoostedTreeLeafNodeStatisticsScratch(
    std::size_t id,
    const CBoostedTreeLeafNodeStatisticsScratch& parent,
    const core::CDataFrame& frame,
    const TRegularization& regularization,
    const TSizeVec& treeFeatureBag,
    const TSizeVec& nodeFeatureBag,
    bool isLeftChild,
    const CBoostedTreeNode& split,
    CWorkspace& workspace)
    : CBoostedTreeLeafNodeStatistics{id, parent.depth() + 1, parent.extraColumns(),
                                     parent.dimensionGradient(), parent.candidateSplits()} {

    this->computeRowMaskAndAggregateLossDerivatives(
        CLookAheadBound{},
        TThreading::numberThreadsForAggregateLossDerivatives(
            workspace.numberThreads(), treeFeatureBag.size(), parent.minimumChildRowCount()),
        frame, isLeftChild, split, treeFeatureBag, parent.rowMask(), workspace);

    // Lazily copy the mask and derivatives to avoid unnecessary allocations.

    this->derivatives().swap(workspace.reducedDerivatives(treeFeatureBag));
    this->bestSplitStatistics() = this->computeBestSplitStatistics(
        workspace.numberThreads(), regularization, nodeFeatureBag);
    workspace.reducedDerivatives(treeFeatureBag).swap(this->derivatives());

    if (this->gain() >= workspace.minimumGain()) {
        this->derivatives() = workspace.copy(workspace.reducedDerivatives(treeFeatureBag));
        this->rowMask() = workspace.reducedMask(parent.rowMask().size());
    }
}

CBoostedTreeLeafNodeStatisticsScratch::CBoostedTreeLeafNodeStatisticsScratch(
    std::size_t id,
    CBoostedTreeLeafNodeStatisticsScratch&& parent,
    const TRegularization& regularization,
    const TSizeVec& treeFeatureBag,
    const TSizeVec& nodeFeatureBag,
    CWorkspace& workspace)
    : CBoostedTreeLeafNodeStatistics{id, parent.depth() + 1, parent.extraColumns(),
                                     parent.dimensionGradient(), parent.candidateSplits(),
                                     std::move(parent.derivatives())} {

    // TODO if sum(splits) > |feature| * |rows| aggregate.
    this->derivatives().subtract(workspace.numberThreads(),
                                 workspace.reducedDerivatives(treeFeatureBag), treeFeatureBag);

    this->bestSplitStatistics() = this->computeBestSplitStatistics(
        workspace.numberThreads(), regularization, nodeFeatureBag);

    if (this->gain() >= workspace.minimumGain()) {
        // Lazily compute the row mask to avoid unnecessary work.
        this->rowMask() = std::move(parent.rowMask());
        this->rowMask() ^= workspace.reducedMask(this->rowMask().size());
    } else {
        // We're going to discard this node so recycle its derivatives.
        workspace.recycle(std::move(this->derivatives()));
    }
}

CBoostedTreeLeafNodeStatisticsScratch::CBoostedTreeLeafNodeStatisticsScratch(
    const TSizeVec& extraColumns,
    std::size_t dimensionGradient,
    const TFloatVecVec& candidateSplits,
    CSplitsDerivatives derivatives)
    : CBoostedTreeLeafNodeStatistics{0, // Id
                                     0, // Depth
                                     extraColumns, dimensionGradient,
                                     candidateSplits, std::move(derivatives)} {
}
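// A note on the move constructor above: the sibling of a child whose
// derivatives were aggregated from the data frame never needs a second pass
// over the rows. Every row of the parent lands in exactly one child, so for
// each feature and candidate split bucket the parent's aggregate count,
// gradient and curvature are the sums of its children's, and subtract(...)
// recovers the sibling bucket by bucket as
//
//   sibling = parent - aggregated child
//
// For illustration, parent bucket counts {10, 7, 3} and left child counts
// {6, 2, 3} imply right child counts {4, 5, 0}; gradients and curvatures
// subtract in the same way.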
CBoostedTreeLeafNodeStatisticsScratch::TPtrPtrPr
CBoostedTreeLeafNodeStatisticsScratch::split(std::size_t leftChildId,
                                             std::size_t rightChildId,
                                             double gainThreshold,
                                             const core::CDataFrame& frame,
                                             const TRegularization& regularization,
                                             const TSizeVec& treeFeatureBag,
                                             const TSizeVec& nodeFeatureBag,
                                             const CBoostedTreeNode& split,
                                             CWorkspace& workspace) {
    TPtr leftChild;
    TPtr rightChild;
    bool recycle{true};
    if (this->leftChildHasFewerRows()) {
        if (this->bestSplitStatistics().s_LeftChildMaxGain > gainThreshold) {
            leftChild = std::make_unique<CBoostedTreeLeafNodeStatisticsScratch>(
                leftChildId, *this, frame, regularization, treeFeatureBag,
                nodeFeatureBag, true /*is left child*/, split, workspace);
            if (this->bestSplitStatistics().s_RightChildMaxGain > gainThreshold) {
                rightChild = std::make_unique<CBoostedTreeLeafNodeStatisticsScratch>(
                    rightChildId, std::move(*this), regularization,
                    treeFeatureBag, nodeFeatureBag, workspace);
                recycle = false;
            }
        } else if (this->bestSplitStatistics().s_RightChildMaxGain > gainThreshold) {
            rightChild = std::make_unique<CBoostedTreeLeafNodeStatisticsScratch>(
                rightChildId, *this, frame, regularization, treeFeatureBag,
                nodeFeatureBag, false /*is left child*/, split, workspace);
        }
    } else {
        if (this->bestSplitStatistics().s_RightChildMaxGain > gainThreshold) {
            rightChild = std::make_unique<CBoostedTreeLeafNodeStatisticsScratch>(
                rightChildId, *this, frame, regularization, treeFeatureBag,
                nodeFeatureBag, false /*is left child*/, split, workspace);
            if (this->bestSplitStatistics().s_LeftChildMaxGain > gainThreshold) {
                leftChild = std::make_unique<CBoostedTreeLeafNodeStatisticsScratch>(
                    leftChildId, std::move(*this), regularization,
                    treeFeatureBag, nodeFeatureBag, workspace);
                recycle = false;
            }
        } else if (this->bestSplitStatistics().s_LeftChildMaxGain > gainThreshold) {
            leftChild = std::make_unique<CBoostedTreeLeafNodeStatisticsScratch>(
                leftChildId, *this, frame, regularization, treeFeatureBag,
                nodeFeatureBag, true /*is left child*/, split, workspace);
        }
    }
    if (recycle) {
        workspace.recycle(std::move(this->derivatives()));
    }
    return {std::move(leftChild), std::move(rightChild)};
}

std::size_t CBoostedTreeLeafNodeStatisticsScratch::staticSize() const {
    return sizeof(*this);
}

CBoostedTreeLeafNodeStatisticsScratch::SSplitStatistics
CBoostedTreeLeafNodeStatisticsScratch::computeBestSplitStatistics(std::size_t numberThreads,
                                                                  const TRegularization& regularization,
                                                                  const TSizeVec& featureBag) const {

    // We have three possible regularization terms we'll use:
    //   1. Tree size: gamma * "node count"
    //   2. Sum square weights: lambda * sum{"leaf weight"^2}
    //   3. Tree depth: alpha * sum{exp(("depth" / "target depth" - 1.0) / "tolerance")}
    //
    // We seek the value at the minimum of the quadratic expansion of the
    // regularized loss function. For a given leaf this expansion is
    //
    //   L(w) = 1/2 w^t H(\lambda) w + g^t w
    //
    // where H(\lambda) = \sum_i H_i + \lambda I, g = \sum_i g_i and w is the leaf's
    // weight. Here, g_i and H_i denote an example's loss gradient and Hessian and i
    // ranges over the examples in the leaf. Writing this as the sum of a quadratic
    // form and a constant, i.e. x(w)^t H(\lambda) x(w) + constant, and noting that H
    // is positive definite, we see that we'll minimise loss by choosing w such that
    // x is zero, i.e. w^* = arg\min_w(L(w)) satisfies x(w) = 0. This gives
    //
    //   L(w^*) = -1/2 g^t H(\lambda)^{-1} g
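    // Filling in the completing-the-square step for reference: taking
    // x(w) = w + H(\lambda)^{-1} g we can check that
    //
    //   L(w) = 1/2 x(w)^t H(\lambda) x(w) - 1/2 g^t H(\lambda)^{-1} g
    //
    // so w^* = -H(\lambda)^{-1} g. In one dimension this is the familiar
    // boosted tree result w^* = -g / (h + \lambda) with minimum loss
    // -g^2 / (2 (h + \lambda)), and the gain of splitting a leaf into left
    // and right children is
    //
    //   1/2 [gl^2 / (hl + \lambda) + gr^2 / (hr + \lambda) - g^2 / (h + \lambda)]
    //
    // which is the quantity the per-feature search below maximises, before
    // the tree size and depth penalties are subtracted.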
    using TFeatureBestSplitSearchVec = std::vector<TFeatureBestSplitSearch>;
    using TSplitStatisticsVec = std::vector<SSplitStatistics>;
    using TChildrenGainStatisticsVec = std::vector<SChildrenGainStatistics>;

    const auto& derivatives = this->derivatives();

    numberThreads = TThreading::numberThreadsForComputeBestSplitStatistics(
        numberThreads, featureBag.size(), this->dimensionGradient(),
        derivatives.numberDerivatives(featureBag));
    LOG_TRACE(<< "number threads = " << numberThreads);

    TFeatureBestSplitSearchVec featureBestSplitSearches;
    TSplitStatisticsVec splitStats(numberThreads);
    TChildrenGainStatisticsVec childrenGainStatistics(numberThreads);
    featureBestSplitSearches.reserve(numberThreads);
    for (std::size_t i = 0; i < numberThreads; ++i) {
        featureBestSplitSearches.push_back(this->featureBestSplitSearch(
            regularization, splitStats[i], childrenGainStatistics[i]));
    }

    core::parallel_for_each(featureBag.begin(), featureBag.end(), featureBestSplitSearches);

    SSplitStatistics result;
    SChildrenGainStatistics bestSplitChildrenGainStatistics;
    for (std::size_t i = 0; i < numberThreads; ++i) {
        if (splitStats[i] > result) {
            result = splitStats[i];
            bestSplitChildrenGainStatistics = childrenGainStatistics[i];
        }
    }

    if (derivatives.dimensionGradient() <= 2 && result.s_Gain > 0) {
        double lambda{regularization.leafWeightPenaltyMultiplier().value()};
        double childPenaltyForDepth{regularization.penaltyForDepth(this->depth() + 1)};
        double childPenaltyForDepthPlusOne{
            regularization.penaltyForDepth(this->depth() + 2)};
        double childPenalty{regularization.treeSizePenaltyMultiplier().value() +
                            regularization.depthPenaltyMultiplier().value() *
                                (2.0 * childPenaltyForDepthPlusOne - childPenaltyForDepth)};
        result.s_LeftChildMaxGain =
            0.5 * this->childMaxGain(bestSplitChildrenGainStatistics.s_GainLeft,
                                     bestSplitChildrenGainStatistics.s_MinLossLeft, lambda) -
            childPenalty;
        result.s_RightChildMaxGain =
            0.5 * this->childMaxGain(bestSplitChildrenGainStatistics.s_GainRight,
                                     bestSplitChildrenGainStatistics.s_MinLossRight, lambda) -
            childPenalty;
    }

    return result;
}
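// The functor returned below scans a feature's candidate splits from left
// to right maintaining running left child totals and their right child
// complements, doing this in parallel for both choices of which child
// receives the rows where the feature is missing. A sketch of the one
// dimensional recurrence (the names here are illustrative, not members of
// this class):
//
//   double gl{0.0}, hl{0.0};
//   for (std::size_t split = 0; split + 1 < size; ++split) {
//       gl += gradient(split);  // left child gradient so far
//       hl += curvature(split); // left child curvature so far
//       double gr{g - gl};      // right child gets the remainder
//       double hr{h - hl};
//       bestGain = std::max(bestGain, gl * gl / (hl + lambda) +
//                                         gr * gr / (hr + lambda));
//   }
//
// One pass therefore prices every candidate split of the feature.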
CBoostedTreeLeafNodeStatisticsScratch::TFeatureBestSplitSearch
CBoostedTreeLeafNodeStatisticsScratch::featureBestSplitSearch(
    const TRegularization& regularization,
    SSplitStatistics& bestSplitStatistics,
    SChildrenGainStatistics& childrenGainStatisticsGlobal) const {

    using TDoubleAry = std::array<double, 2>;
    using TDoubleVector = common::CDenseVector<double>;
    using TDoubleVectorAry = std::array<TDoubleVector, 2>;
    using TDoubleMatrix = common::CDenseMatrix<double>;
    using TDoubleMatrixAry = std::array<TDoubleMatrix, 2>;

    int d{static_cast<int>(this->dimensionGradient())};
    double lambda{regularization.leafWeightPenaltyMultiplier().value()};
    auto minimumLoss_ = TThreading::makeThreadLocalMinimumLossFunction(d, lambda);

    TDoubleVector g_{d};
    TDoubleMatrix h_{d, d};
    TDoubleVectorAry gl_{TDoubleVector{d}, TDoubleVector{d}};
    TDoubleVectorAry gr_{TDoubleVector{d}, TDoubleVector{d}};
    TDoubleMatrixAry hl_{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}};
    TDoubleMatrixAry hr_{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}};

    return [
        // Inputs
        minimumLoss = std::move(minimumLoss_), &regularization,
        // State
        g = std::move(g_), h = std::move(h_), gl = std::move(gl_),
        gr = std::move(gr_), hl = std::move(hl_), hr = std::move(hr_),
        // Results
        &bestSplitStatistics, &childrenGainStatisticsGlobal, this
    ](std::size_t feature) mutable {

        const auto& candidateSplits = this->candidateSplits();
        const auto& derivatives = this->derivatives();

        std::size_t c{derivatives.missingCount(feature)};
        g = derivatives.missingGradient(feature);
        h = derivatives.missingCurvature(feature);
        for (auto featureDerivatives = derivatives.beginDerivatives(feature);
             featureDerivatives != derivatives.endDerivatives(feature); ++featureDerivatives) {
            c += featureDerivatives->count();
            g += featureDerivatives->gradient();
            h += featureDerivatives->curvature();
        }

        std::size_t cl[]{derivatives.missingCount(feature), 0};
        gl[ASSIGN_MISSING_TO_LEFT] = derivatives.missingGradient(feature);
        gl[ASSIGN_MISSING_TO_RIGHT] = TDoubleVector::Zero(g.rows());
        gr[ASSIGN_MISSING_TO_LEFT] = g - derivatives.missingGradient(feature);
        gr[ASSIGN_MISSING_TO_RIGHT] = g;
        hl[ASSIGN_MISSING_TO_LEFT] = derivatives.missingCurvature(feature);
        hl[ASSIGN_MISSING_TO_RIGHT] = TDoubleMatrix::Zero(h.rows(), h.cols());
        hr[ASSIGN_MISSING_TO_LEFT] = h - derivatives.missingCurvature(feature);
        hr[ASSIGN_MISSING_TO_RIGHT] = h;

        double maximumGain{-INF};
        double splitAt{-INF};
        std::size_t leftChildRowCount{0};
        bool assignMissingToLeft{true};
        std::size_t size{derivatives.numberDerivatives(feature)};
        TDoubleAry gain;
        TDoubleAry minLossLeft;
        TDoubleAry minLossRight;
        SChildrenGainStatistics childrenGainStatisticsPerFeature;

        for (std::size_t split = 0; split + 1 < size; ++split) {

            std::size_t count{derivatives.count(feature, split)};
            if (count == 0) {
                continue;
            }

            const auto& gradient = derivatives.gradient(feature, split);
            const auto& curvature = derivatives.curvature(feature, split);

            cl[ASSIGN_MISSING_TO_LEFT] += count;
            cl[ASSIGN_MISSING_TO_RIGHT] += count;
            gl[ASSIGN_MISSING_TO_LEFT] += gradient;
            gl[ASSIGN_MISSING_TO_RIGHT] += gradient;
            gr[ASSIGN_MISSING_TO_LEFT] -= gradient;
            gr[ASSIGN_MISSING_TO_RIGHT] -= gradient;
            hl[ASSIGN_MISSING_TO_LEFT] += curvature;
            hl[ASSIGN_MISSING_TO_RIGHT] += curvature;
            hr[ASSIGN_MISSING_TO_LEFT] -= curvature;
            hr[ASSIGN_MISSING_TO_RIGHT] -= curvature;

            if (cl[ASSIGN_MISSING_TO_LEFT] == 0 || cl[ASSIGN_MISSING_TO_LEFT] == c) {
                gain[ASSIGN_MISSING_TO_LEFT] = -INF;
            } else {
                minLossLeft[ASSIGN_MISSING_TO_LEFT] = minimumLoss(
                    gl[ASSIGN_MISSING_TO_LEFT], hl[ASSIGN_MISSING_TO_LEFT]);
                minLossRight[ASSIGN_MISSING_TO_LEFT] = minimumLoss(
                    gr[ASSIGN_MISSING_TO_LEFT], hr[ASSIGN_MISSING_TO_LEFT]);
                gain[ASSIGN_MISSING_TO_LEFT] = minLossLeft[ASSIGN_MISSING_TO_LEFT] +
                                               minLossRight[ASSIGN_MISSING_TO_LEFT];
            }

            if (cl[ASSIGN_MISSING_TO_RIGHT] == 0 || cl[ASSIGN_MISSING_TO_RIGHT] == c) {
                gain[ASSIGN_MISSING_TO_RIGHT] = -INF;
            } else {
                minLossLeft[ASSIGN_MISSING_TO_RIGHT] = minimumLoss(
                    gl[ASSIGN_MISSING_TO_RIGHT], hl[ASSIGN_MISSING_TO_RIGHT]);
                minLossRight[ASSIGN_MISSING_TO_RIGHT] = minimumLoss(
                    gr[ASSIGN_MISSING_TO_RIGHT], hr[ASSIGN_MISSING_TO_RIGHT]);
                gain[ASSIGN_MISSING_TO_RIGHT] = minLossLeft[ASSIGN_MISSING_TO_RIGHT] +
                                                minLossRight[ASSIGN_MISSING_TO_RIGHT];
            }

            if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) {
                maximumGain = gain[ASSIGN_MISSING_TO_LEFT];
                splitAt = candidateSplits[feature][split];
                leftChildRowCount = cl[ASSIGN_MISSING_TO_LEFT];
                assignMissingToLeft = true;
                // If gain > -INF then minLossLeft and minLossRight were initialized.
                childrenGainStatisticsPerFeature = {
                    minLossLeft[ASSIGN_MISSING_TO_LEFT], minLossRight[ASSIGN_MISSING_TO_LEFT],
                    gl[ASSIGN_MISSING_TO_LEFT](0), gr[ASSIGN_MISSING_TO_LEFT](0)};
            }
            if (gain[ASSIGN_MISSING_TO_RIGHT] > maximumGain) {
                maximumGain = gain[ASSIGN_MISSING_TO_RIGHT];
                splitAt = candidateSplits[feature][split];
                leftChildRowCount = cl[ASSIGN_MISSING_TO_RIGHT];
                assignMissingToLeft = false;
                // If gain > -INF then minLossLeft and minLossRight were initialized.
                childrenGainStatisticsPerFeature = {
                    minLossLeft[ASSIGN_MISSING_TO_RIGHT], minLossRight[ASSIGN_MISSING_TO_RIGHT],
                    gl[ASSIGN_MISSING_TO_RIGHT](0), gr[ASSIGN_MISSING_TO_RIGHT](0)};
            }
        }

        double penaltyForDepth{regularization.penaltyForDepth(this->depth())};
        double penaltyForDepthPlusOne{regularization.penaltyForDepth(this->depth() + 1)};

        // The gain is the difference between the quadratic minimum for loss with
        // no split and the loss with the minimum loss split we found.
        double totalGain{0.5 * (maximumGain - minimumLoss(g, h)) -
                         regularization.treeSizePenaltyMultiplier().value() -
                         regularization.depthPenaltyMultiplier().value() *
                             (2.0 * penaltyForDepthPlusOne - penaltyForDepth)};
        SSplitStatistics candidateSplitStatistics{
            totalGain,
            h.trace() / static_cast<double>(this->dimensionGradient()),
            feature,
            splitAt,
            std::min(leftChildRowCount, c - leftChildRowCount),
            2 * leftChildRowCount < c,
            assignMissingToLeft};
        LOG_TRACE(<< "candidate split: " << candidateSplitStatistics.print());

        if (candidateSplitStatistics > bestSplitStatistics) {
            bestSplitStatistics = candidateSplitStatistics;
            childrenGainStatisticsGlobal = childrenGainStatisticsPerFeature;
        }
    };
}

double CBoostedTreeLeafNodeStatisticsScratch::childMaxGain(double childGain,
                                                           double minLossChild,
                                                           double lambda) const {

    // This computes the maximum possible gain we can expect from splitting a
    // child node given we know the sum of the positive (g^+) and negative (g^-)
    // gradients at its parent, the minimum curvature on the positive and
    // negative gradient sets (hmin^+ and hmin^-) and the largest and smallest
    // gradient (gmax and gmin, respectively). The highest possible gain
    // consistent with these constraints can be shown to be:
    //
    //   (g^+)^2 / (hmin^+ * g^+ / gmax + lambda) + (g^-)^2 / (hmin^- * g^- / gmin + lambda)
    //
    // Since gchild = gchild^+ + gchild^-, we can improve the estimates of g^+
    // and g^- for the child as:
    //   g^+ = max(min(gchild - g^-, g^+), 0),
    //   g^- = min(max(gchild - g^+, g^-), 0).
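    // A sketch of where this bound comes from: a child containing examples
    // with gradients g_i and curvatures h_i has minimum loss
    // -1/2 (sum_i g_i)^2 / (sum_i h_i + lambda), which is most negative when
    // the available gradient mass is packed into few low curvature examples.
    // Packing g^+ into n = g^+ / gmax examples, each with gradient gmax and
    // curvature hmin^+, gives total curvature n * hmin^+ = hmin^+ * g^+ / gmax
    // and hence the first term above; the negative gradient set gives the
    // second term symmetrically. The 1e-10 added to the denominators below
    // guards against division by zero when the curvature term and lambda
    // both vanish.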
    double positiveDerivativesGSum =
        std::max(std::min(childGain - this->derivatives().negativeDerivativesGSum(),
                          this->derivatives().positiveDerivativesGSum()),
                 0.0);
    double negativeDerivativesGSum =
        std::min(std::max(childGain - this->derivatives().positiveDerivativesGSum(),
                          this->derivatives().negativeDerivativesGSum()),
                 0.0);
    double lookAheadGain{
        ((positiveDerivativesGSum != 0.0)
             ? common::CTools::pow2(positiveDerivativesGSum) /
                   (this->derivatives().positiveDerivativesHMin() * positiveDerivativesGSum /
                        this->derivatives().positiveDerivativesGMax() +
                    lambda + 1e-10)
             : 0.0) +
        ((negativeDerivativesGSum != 0.0)
             ? common::CTools::pow2(negativeDerivativesGSum) /
                   (this->derivatives().negativeDerivativesHMin() * negativeDerivativesGSum /
                        this->derivatives().negativeDerivativesGMin() +
                    lambda + 1e-10)
             : 0.0)};
    return lookAheadGain - minLossChild;
}
}
}
}