/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <maths/analytics/CBoostedTreeLeafNodeStatisticsIncremental.h>
#include <core/CDataFrame.h>
#include <core/CLogger.h>
#include <core/CMemoryDef.h>
#include <maths/analytics/CBoostedTree.h>
#include <maths/analytics/CBoostedTreeUtils.h>
#include <maths/analytics/CDataFrameCategoryEncoder.h>
#include <maths/common/CBasicStatistics.h>
#include <maths/common/CTools.h>
#include <limits>
#include <utility>
namespace ml {
namespace maths {
namespace analytics {
using namespace boosted_tree_detail;
namespace {
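// The indices of the accumulators used to evaluate assigning rows with
// missing feature values to the left and right child of a candidate split.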
const std::size_t ASSIGN_MISSING_TO_LEFT{0};
const std::size_t ASSIGN_MISSING_TO_RIGHT{1};
}
CBoostedTreeLeafNodeStatisticsIncremental::CBoostedTreeLeafNodeStatisticsIncremental(
std::size_t id,
const TSizeVec& extraColumns,
std::size_t dimensionGradient,
const core::CDataFrame& frame,
const TRegularization& regularization,
const TFloatVecVec& candidateSplits,
const TSizeVec& treeFeatureBag,
const TSizeVec& nodeFeatureBag,
std::size_t depth,
const core::CPackedBitVector& rowMask,
CWorkspace& workspace)
: CBoostedTreeLeafNodeStatistics{id, depth, extraColumns, dimensionGradient, candidateSplits} {
this->computeAggregateLossDerivatives(CNoLookAheadBound{},
workspace.numberThreads(), frame,
treeFeatureBag, rowMask, workspace);
    m_PreviousSplit = this->rootPreviousSplit(workspace);
    // Lazily copy the mask and derivatives to avoid unnecessary allocations.
this->derivatives().swap(workspace.reducedDerivatives(treeFeatureBag));
this->bestSplitStatistics() = this->computeBestSplitStatistics(
workspace.numberThreads(), regularization, nodeFeatureBag);
workspace.reducedDerivatives(treeFeatureBag).swap(this->derivatives());
if (this->gain() >= workspace.minimumGain()) {
this->derivatives() = workspace.copy(workspace.derivatives()[0]);
this->rowMask() = rowMask;
}
}
CBoostedTreeLeafNodeStatisticsIncremental::CBoostedTreeLeafNodeStatisticsIncremental(
std::size_t id,
const CBoostedTreeLeafNodeStatisticsIncremental& parent,
const core::CDataFrame& frame,
const TRegularization& regularization,
const TSizeVec& treeFeatureBag,
const TSizeVec& nodeFeatureBag,
bool isLeftChild,
const CBoostedTreeNode& split,
CWorkspace& workspace)
: CBoostedTreeLeafNodeStatistics{id, parent.depth() + 1, parent.extraColumns(),
parent.dimensionGradient(),
parent.candidateSplits()} {
this->computeRowMaskAndAggregateLossDerivatives(
CNoLookAheadBound{},
TThreading::numberThreadsForAggregateLossDerivatives(
workspace.numberThreads(), treeFeatureBag.size(), parent.minimumChildRowCount()),
frame, isLeftChild, split, treeFeatureBag, parent.rowMask(), workspace);
this->derivatives().swap(workspace.reducedDerivatives(treeFeatureBag));
// Set the split feature and value for this node in the tree being retrained.
std::size_t parentSplitFeature{parent.bestSplit().first};
m_PreviousSplit =
(isLeftChild ? parent.leftChildPreviousSplit(parentSplitFeature, workspace)
: parent.rightChildPreviousSplit(parentSplitFeature, workspace));
this->bestSplitStatistics() = this->computeBestSplitStatistics(
workspace.numberThreads(), regularization, nodeFeatureBag);
workspace.reducedDerivatives(treeFeatureBag).swap(this->derivatives());
// Lazily copy the mask and derivatives to avoid unnecessary allocations.
if (this->gain() >= workspace.minimumGain()) {
this->derivatives() = workspace.copy(workspace.reducedDerivatives(treeFeatureBag));
this->rowMask() = workspace.reducedMask(parent.rowMask().size());
}
}
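// This constructor computes a child's aggregate derivatives by subtracting its
// sibling's derivatives from the parent's (moved) derivatives, which avoids a
// second pass over the data frame.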
CBoostedTreeLeafNodeStatisticsIncremental::CBoostedTreeLeafNodeStatisticsIncremental(
std::size_t id,
CBoostedTreeLeafNodeStatisticsIncremental&& parent,
const TRegularization& regularization,
const TSizeVec& treeFeatureBag,
const TSizeVec& nodeFeatureBag,
bool isLeftChild,
CWorkspace& workspace)
: CBoostedTreeLeafNodeStatistics{id,
parent.depth() + 1,
parent.extraColumns(),
parent.dimensionGradient(),
parent.candidateSplits(),
std::move(parent.derivatives())} {
// TODO if sum(splits) > |feature| * |rows| aggregate.
this->derivatives().subtract(workspace.numberThreads(),
workspace.reducedDerivatives(treeFeatureBag),
treeFeatureBag);
// Set the split feature and value for this node in the tree being retrained.
std::size_t parentSplitFeature{parent.bestSplit().first};
m_PreviousSplit =
(isLeftChild ? parent.leftChildPreviousSplit(parentSplitFeature, workspace)
: parent.rightChildPreviousSplit(parentSplitFeature, workspace));
this->bestSplitStatistics() = this->computeBestSplitStatistics(
workspace.numberThreads(), regularization, nodeFeatureBag);
if (this->gain() >= workspace.minimumGain()) {
// Lazily compute the row mask to avoid unnecessary work.
this->rowMask() = std::move(parent.rowMask());
this->rowMask() ^= workspace.reducedMask(this->rowMask().size());
} else {
// We're going to discard this node so recycle its derivatives.
workspace.recycle(std::move(this->derivatives()));
}
}
CBoostedTreeLeafNodeStatisticsIncremental::CBoostedTreeLeafNodeStatisticsIncremental(
const TSizeVec& extraColumns,
std::size_t dimensionGradient,
const TFloatVecVec& candidateSplits,
CSplitsDerivatives derivatives)
: CBoostedTreeLeafNodeStatistics{0, // Id
0, // Depth
extraColumns,
dimensionGradient,
candidateSplits,
std::move(derivatives)} {
}
CBoostedTreeLeafNodeStatisticsIncremental::TPtrPtrPr
CBoostedTreeLeafNodeStatisticsIncremental::split(std::size_t leftChildId,
std::size_t rightChildId,
double /*gainThreshold*/,
const core::CDataFrame& frame,
const TRegularization& regularization,
const TSizeVec& treeFeatureBag,
const TSizeVec& nodeFeatureBag,
const CBoostedTreeNode& split,
CWorkspace& workspace) {
TPtr leftChild;
TPtr rightChild;
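    // Compute the derivatives of the child with fewer rows directly from the
    // frame and obtain its sibling's by subtraction from this node's, which
    // avoids scanning the frame for both children.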
if (this->leftChildHasFewerRows()) {
leftChild = std::make_unique<CBoostedTreeLeafNodeStatisticsIncremental>(
leftChildId, *this, frame, regularization, treeFeatureBag,
nodeFeatureBag, true /*is left child*/, split, workspace);
rightChild = std::make_unique<CBoostedTreeLeafNodeStatisticsIncremental>(
rightChildId, std::move(*this), regularization, treeFeatureBag,
nodeFeatureBag, false /*is left child*/, workspace);
return {std::move(leftChild), std::move(rightChild)};
}
rightChild = std::make_unique<CBoostedTreeLeafNodeStatisticsIncremental>(
rightChildId, *this, frame, regularization, treeFeatureBag,
nodeFeatureBag, false /*is left child*/, split, workspace);
leftChild = std::make_unique<CBoostedTreeLeafNodeStatisticsIncremental>(
leftChildId, std::move(*this), regularization, treeFeatureBag,
nodeFeatureBag, true /*is left child*/, workspace);
return {std::move(leftChild), std::move(rightChild)};
}
std::size_t CBoostedTreeLeafNodeStatisticsIncremental::staticSize() const {
return sizeof(*this);
}
CBoostedTreeLeafNodeStatisticsIncremental::SSplitStatistics
CBoostedTreeLeafNodeStatisticsIncremental::computeBestSplitStatistics(
std::size_t numberThreads,
const TRegularization& regularization,
const TSizeVec& featureBag) const {
// We have four possible regularization terms we'll use:
// 1. Tree size: gamma * "node count"
    //   2. Sum square weights: lambda * sum{"leaf weight"^2}
// 3. Tree depth: alpha * sum{exp(("depth" / "target depth" - 1.0) / "tolerance")}
// 4. Tree topology change: we get a fixed penalty for choosing a different split
// feature and a smaller penalty for choosing a different split value for the
// same feature which is proportional to the difference.
// We seek to find the value at the minimum of the quadratic expansion of the
// regularized loss function. For a given leaf this expansion is
//
// L(w) = 1/2 w^t H(\lambda) w + g^t w
//
// where H(\lambda) = \sum_i H_i + \lambda I, g = \sum_i g_i and w is the leaf's
// weight. Here, g_i and H_i denote an example's loss gradient and Hessian and i
// ranges over the examples in the leaf. Writing this as the sum of a quadratic
// form and constant, i.e. x(w)^t H(\lambda) x(w) + constant, and noting that H
// is positive definite, we see that we'll minimise loss by choosing w such that
// x is zero, i.e. w^* = arg\min_w(L(w)) satisfies x(w) = 0. This gives
//
// L(w^*) = -1/2 g^t H(\lambda)^{-1} g
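    //
    // For example, in one dimension this reduces to the familiar result
    //
    //   w^* = -g / (h + \lambda) ,  L(w^*) = -1/2 g^2 / (h + \lambda)
    //
    // so, before the penalty terms, the gain of a candidate split is
    //
    //   1/2 (gl^2 / (hl + \lambda) + gr^2 / (hr + \lambda) - g^2 / (h + \lambda))
    //
    // where the l and r subscripts denote the left and right child aggregates.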
using TFeatureBestSplitSearchVec = std::vector<TFeatureBestSplitSearch>;
using TSplitStatisticsVec = std::vector<SSplitStatistics>;
numberThreads = TThreading::numberThreadsForComputeBestSplitStatistics(
numberThreads, featureBag.size(), this->dimensionGradient(),
this->derivatives().numberDerivatives(featureBag));
LOG_TRACE(<< "number threads = " << numberThreads);
TFeatureBestSplitSearchVec featureBestSplitSearches;
TSplitStatisticsVec splitStats(numberThreads);
featureBestSplitSearches.reserve(numberThreads);
for (std::size_t i = 0; i < numberThreads; ++i) {
featureBestSplitSearches.push_back(
this->featureBestSplitSearch(regularization, splitStats[i]));
}
core::parallel_for_each(featureBag.begin(), featureBag.end(), featureBestSplitSearches);
SSplitStatistics result;
for (std::size_t i = 0; i < numberThreads; ++i) {
if (splitStats[i] > result) {
result = splitStats[i];
}
}
LOG_TRACE(<< "best split: " << result.print());
return result;
}
CBoostedTreeLeafNodeStatisticsIncremental::TFeatureBestSplitSearch
CBoostedTreeLeafNodeStatisticsIncremental::featureBestSplitSearch(
const TRegularization& regularization,
SSplitStatistics& bestSplitStatistics) const {
using TDoubleAry = std::array<double, 2>;
using TDoubleVector = common::CDenseVector<double>;
using TDoubleVectorAry = std::array<TDoubleVector, 2>;
using TDoubleMatrix = common::CDenseMatrix<double>;
using TDoubleMatrixAry = std::array<TDoubleMatrix, 2>;
int d{static_cast<int>(this->dimensionGradient())};
double lambda{regularization.leafWeightPenaltyMultiplier().value()};
auto minimumLoss = TThreading::makeThreadLocalMinimumLossFunction(d, lambda);
TDoubleVector g_{d};
TDoubleMatrix h_{d, d};
TDoubleVectorAry gl_{TDoubleVector{d}, TDoubleVector{d}};
TDoubleVectorAry gr_{TDoubleVector{d}, TDoubleVector{d}};
TDoubleMatrixAry hl_{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}};
TDoubleMatrixAry hr_{TDoubleMatrix{d, d}, TDoubleMatrix{d, d}};
return [
// Inputs
        minimumLoss, &regularization,
// State
g = std::move(g_), h = std::move(h_), gl = std::move(gl_),
gr = std::move(gr_), hl = std::move(hl_), hr = std::move(hr_),
// Results
&bestSplitStatistics, this
](std::size_t feature) mutable {
const auto& derivatives = this->derivatives();
const auto& candidateSplits = this->candidateSplits();
std::size_t c{derivatives.missingCount(feature)};
g = derivatives.missingGradient(feature);
h = derivatives.missingCurvature(feature);
for (auto featureDerivatives = derivatives.beginDerivatives(feature);
featureDerivatives != derivatives.endDerivatives(feature); ++featureDerivatives) {
c += featureDerivatives->count();
g += featureDerivatives->gradient();
h += featureDerivatives->curvature();
}
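        // Accumulate the left and right child derivatives for both choices of
        // the child to which rows with missing feature values are assigned.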
std::size_t cl[]{derivatives.missingCount(feature), 0};
gl[ASSIGN_MISSING_TO_LEFT] = derivatives.missingGradient(feature);
gl[ASSIGN_MISSING_TO_RIGHT] = TDoubleVector::Zero(g.rows());
gr[ASSIGN_MISSING_TO_LEFT] = g - derivatives.missingGradient(feature);
gr[ASSIGN_MISSING_TO_RIGHT] = g;
hl[ASSIGN_MISSING_TO_LEFT] = derivatives.missingCurvature(feature);
hl[ASSIGN_MISSING_TO_RIGHT] = TDoubleMatrix::Zero(h.rows(), h.cols());
hr[ASSIGN_MISSING_TO_LEFT] = h - derivatives.missingCurvature(feature);
hr[ASSIGN_MISSING_TO_RIGHT] = h;
double maximumGain{-INF};
double splitAt{-INF};
std::size_t leftChildRowCount{0};
bool assignMissingToLeft{true};
std::size_t size{derivatives.numberDerivatives(feature)};
TDoubleAry gain;
auto gainMoments = common::CBasicStatistics::momentsAccumulator(0.0, 0.0, 0.0);
for (std::size_t split = 0; split + 1 < size; ++split) {
std::size_t count{derivatives.count(feature, split)};
if (count == 0) {
continue;
}
const auto& gradient = derivatives.gradient(feature, split);
const auto& curvature = derivatives.curvature(feature, split);
cl[ASSIGN_MISSING_TO_LEFT] += count;
cl[ASSIGN_MISSING_TO_RIGHT] += count;
gl[ASSIGN_MISSING_TO_LEFT] += gradient;
gl[ASSIGN_MISSING_TO_RIGHT] += gradient;
gr[ASSIGN_MISSING_TO_LEFT] -= gradient;
gr[ASSIGN_MISSING_TO_RIGHT] -= gradient;
hl[ASSIGN_MISSING_TO_LEFT] += curvature;
hl[ASSIGN_MISSING_TO_RIGHT] += curvature;
hr[ASSIGN_MISSING_TO_LEFT] -= curvature;
hr[ASSIGN_MISSING_TO_RIGHT] -= curvature;
            // Note in the following we scale the tree change penalty by 2 to undo
            // the factor of 0.5 which is applied to the maximum gain below.
if (cl[ASSIGN_MISSING_TO_LEFT] == 0 || cl[ASSIGN_MISSING_TO_LEFT] == c) {
gain[ASSIGN_MISSING_TO_LEFT] = -INF;
} else {
double minLossLeft{minimumLoss(gl[ASSIGN_MISSING_TO_LEFT],
hl[ASSIGN_MISSING_TO_LEFT])};
double minLossRight{minimumLoss(gr[ASSIGN_MISSING_TO_LEFT],
hr[ASSIGN_MISSING_TO_LEFT])};
gain[ASSIGN_MISSING_TO_LEFT] =
minLossLeft + minLossRight -
2.0 * this->penaltyForTreeChange(regularization, feature, split);
gainMoments.add(gain[ASSIGN_MISSING_TO_LEFT]);
}
if (cl[ASSIGN_MISSING_TO_RIGHT] == 0 || cl[ASSIGN_MISSING_TO_RIGHT] == c) {
gain[ASSIGN_MISSING_TO_RIGHT] = -INF;
} else {
double minLossLeft{minimumLoss(gl[ASSIGN_MISSING_TO_RIGHT],
hl[ASSIGN_MISSING_TO_RIGHT])};
double minLossRight{minimumLoss(gr[ASSIGN_MISSING_TO_RIGHT],
hr[ASSIGN_MISSING_TO_RIGHT])};
gain[ASSIGN_MISSING_TO_RIGHT] =
minLossLeft + minLossRight -
2.0 * this->penaltyForTreeChange(regularization, feature, split);
gainMoments.add(gain[ASSIGN_MISSING_TO_RIGHT]);
}
if (gain[ASSIGN_MISSING_TO_LEFT] > maximumGain) {
maximumGain = gain[ASSIGN_MISSING_TO_LEFT];
splitAt = candidateSplits[feature][split];
leftChildRowCount = cl[ASSIGN_MISSING_TO_LEFT];
assignMissingToLeft = true;
}
if (gain[ASSIGN_MISSING_TO_RIGHT] > maximumGain) {
maximumGain = gain[ASSIGN_MISSING_TO_RIGHT];
splitAt = candidateSplits[feature][split];
leftChildRowCount = cl[ASSIGN_MISSING_TO_RIGHT];
assignMissingToLeft = false;
}
}
double penaltyForDepth{regularization.penaltyForDepth(this->depth())};
double penaltyForDepthPlusOne{regularization.penaltyForDepth(this->depth() + 1)};
        // The gain is the difference between the quadratic minimum of the loss
        // with no split and with the minimum loss split we found. Splitting also
        // replaces a leaf at this depth with two at depth + 1, so we pay the
        // tree size penalty once plus the net change in the depth penalty.
double totalGain{0.5 * (maximumGain - minimumLoss(g, h)) -
regularization.treeSizePenaltyMultiplier().value() -
regularization.depthPenaltyMultiplier().value() *
(2.0 * penaltyForDepthPlusOne - penaltyForDepth)};
SSplitStatistics candidate{
totalGain,
common::CBasicStatistics::variance(gainMoments),
h.trace() / static_cast<double>(this->dimensionGradient()),
feature,
splitAt,
std::min(leftChildRowCount, c - leftChildRowCount),
2 * leftChildRowCount < c,
assignMissingToLeft};
LOG_TRACE(<< "candidate split: " << candidate.print());
if (candidate > bestSplitStatistics) {
bestSplitStatistics = candidate;
}
};
}
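// The topology change penalty is a fixed penalty for splitting on a different
// feature to the node being retrained and, for the same feature, a penalty
// proportional to the distance moved relative to the feature's candidate split
// range [a, b]. For example, if [a, b] = [0, 10], the previous split was at 4
// and the candidate split is at 6 then the penalty is
// 0.25 * "tree topology change penalty" * |6 - 4| / 10.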
double
CBoostedTreeLeafNodeStatisticsIncremental::penaltyForTreeChange(const TRegularization& regularization,
std::size_t feature,
std::size_t split) const {
if (m_PreviousSplit == std::nullopt) {
return 0.0;
}
if (feature != m_PreviousSplit->s_Feature) {
return regularization.treeTopologyChangePenalty().value();
}
const auto& candidateSplits = this->candidateSplits()[feature];
if (candidateSplits.empty()) {
return 0.0;
}
double a{*candidateSplits.begin()};
double b{*(candidateSplits.end() - 1)};
if (a == b) {
return 0.0;
}
double splitAt{candidateSplits[split]};
double previousSplitAt{common::CTools::truncate(m_PreviousSplit->s_SplitAt, a, b)};
return 0.25 * regularization.treeTopologyChangePenalty().value() *
std::fabs(splitAt - previousSplitAt) / (b - a);
}
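// The following helpers look up the split of the corresponding node in the
// tree being retrained, if there is one, which penaltyForTreeChange uses to
// penalize moving away from the previous tree topology.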
CBoostedTreeLeafNodeStatisticsIncremental::TOptionalPreviousSplit
CBoostedTreeLeafNodeStatisticsIncremental::rootPreviousSplit(const CWorkspace& workspace) const {
if (workspace.retraining() == nullptr) {
return {};
}
const auto& tree = *workspace.retraining();
const auto& node = root(tree);
if (node.isLeaf()) {
return {};
}
return TOptionalPreviousSplit{std::in_place, rootIndex(),
node.splitFeature(), node.splitValue()};
}
CBoostedTreeLeafNodeStatisticsIncremental::TOptionalPreviousSplit
CBoostedTreeLeafNodeStatisticsIncremental::leftChildPreviousSplit(std::size_t feature,
const CWorkspace& workspace) const {
if (workspace.retraining() == nullptr || m_PreviousSplit == std::nullopt ||
m_PreviousSplit->s_Feature != feature) {
return {};
}
const auto& tree = *workspace.retraining();
if (tree[m_PreviousSplit->s_NodeIndex].isLeaf()) {
return {};
}
std::size_t leftChildIndex{tree[m_PreviousSplit->s_NodeIndex].leftChildIndex()};
const auto& node = tree[leftChildIndex];
return TOptionalPreviousSplit{std::in_place, leftChildIndex,
node.splitFeature(), node.splitValue()};
}
CBoostedTreeLeafNodeStatisticsIncremental::TOptionalPreviousSplit
CBoostedTreeLeafNodeStatisticsIncremental::rightChildPreviousSplit(std::size_t feature,
const CWorkspace& workspace) const {
if (workspace.retraining() == nullptr || m_PreviousSplit == std::nullopt ||
m_PreviousSplit->s_Feature != feature) {
return {};
}
const auto& tree = *workspace.retraining();
if (tree[m_PreviousSplit->s_NodeIndex].isLeaf()) {
return {};
}
    std::size_t rightChildIndex{tree[m_PreviousSplit->s_NodeIndex].rightChildIndex()};
const auto& node = tree[rightChildIndex];
return TOptionalPreviousSplit{std::in_place, rightChildIndex,
node.splitFeature(), node.splitValue()};
}
}
}
}