lib/maths/analytics/CBoostedTreeLoss.cc
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <maths/analytics/CBoostedTreeLoss.h>
#include <core/CDataFrame.h>
#include <core/CPersistUtils.h>
#include <core/Concurrency.h>
#include <maths/analytics/CBoostedTreeUtils.h>
#include <maths/analytics/CDataFrameCategoryEncoder.h>
#include <maths/common/CBasicStatistics.h>
#include <maths/common/CLbfgs.h>
#include <maths/common/CLinearAlgebraEigen.h>
#include <maths/common/CPRNG.h>
#include <maths/common/CSampling.h>
#include <maths/common/CSolvers.h>
#include <maths/common/CTools.h>
#include <maths/common/CToolsDetail.h>
#include <algorithm>
#include <cmath>
#include <exception>
#include <limits>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <tuple>
namespace ml {
namespace maths {
namespace analytics {
using namespace boosted_tree_detail;
using TRowItr = core::CDataFrame::TRowItr;
namespace {
const double EPSILON{100.0 * std::numeric_limits<double>::epsilon()};
const double LOG_EPSILON{common::CTools::stableLog(EPSILON)};
// MSLE constants
const std::size_t MSLE_PREDICTION_INDEX{0};
const std::size_t MSLE_ACTUAL_INDEX{1};
const std::size_t MSLE_ERROR_INDEX{2};
const std::size_t MSLE_BUCKET_SIZE{32};
const std::size_t MSLE_OPTIMIZATION_ITERATIONS{15};
// Pseudo-Huber constants
const std::size_t HUBER_BUCKET_SIZE{128};
const std::size_t HUBER_OPTIMIZATION_ITERATIONS{15};
// Persistence and restoration
const std::string NUMBER_CLASSES_TAG{"number_classes"};
const std::string OFFSET_TAG{"offset"};
const std::string DELTA_TAG{"delta"};
const std::string NAME_TAG{"name"};
double logOneMinusLogistic(double logOdds) {
// For large x logistic(x) = 1 - e^(-x) + O(e^(-2x)), so log(1 - logistic(x)) ~ -x.
if (logOdds > -LOG_EPSILON) {
return -logOdds;
}
return common::CTools::stableLog(1.0 - common::CTools::logisticFunction(logOdds));
}
double logLogistic(double logOdds) {
// For small x logistic(x) = e^(x) + O(e^(2x)), so log(logistic(x)) ~ x.
if (logOdds < LOG_EPSILON) {
return logOdds;
}
return common::CTools::stableLog(common::CTools::logisticFunction(logOdds));
}
}
namespace boosted_tree_detail {
CArgMinLossImpl::CArgMinLossImpl(double lambda) : m_Lambda{lambda} {
}
double CArgMinLossImpl::lambda() const {
return m_Lambda;
}
CArgMinMseImpl::CArgMinMseImpl(double lambda) : CArgMinLossImpl{lambda} {
}
std::unique_ptr<CArgMinLossImpl> CArgMinMseImpl::clone() const {
return std::make_unique<CArgMinMseImpl>(*this);
}
bool CArgMinMseImpl::nextPass() {
return false;
}
void CArgMinMseImpl::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) {
m_MeanError.add(actual - prediction(0), weight);
}
void CArgMinMseImpl::merge(const CArgMinLossImpl& other) {
const auto* mse = dynamic_cast<const CArgMinMseImpl*>(&other);
if (mse != nullptr) {
m_MeanError += mse->m_MeanError;
}
}
CArgMinMseImpl::TDoubleVector CArgMinMseImpl::value() const {
// We are searching for the value x^* which satisfies
//
// x^* = argmin_x{ sum_i{(a_i - (p_i + x))^2} + lambda * x^2 }
//
// This is convex so there is one minimum where the derivative w.r.t. x is zero
// and x^* = 1 / (n + lambda) sum_i{ a_i - p_i }. Denoting the mean prediction
// error m = 1/n sum_i{ a_i - p_i } we have x^* = n / (n + lambda) m.
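// Setting the derivative to zero gives -2 sum_i{ a_i - p_i - x } + 2 lambda x = 0,
// i.e. x (n + lambda) = sum_i{ a_i - p_i } = n m, from which the expression below follows.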
double count{common::CBasicStatistics::count(m_MeanError)};
double meanError{common::CBasicStatistics::mean(m_MeanError)};
TDoubleVector result(1);
result(0) = count == 0.0 ? 0.0 : count / (count + this->lambda()) * meanError;
return result;
}
CArgMinMseIncrementalImpl::CArgMinMseIncrementalImpl(double lambda,
double eta,
double mu,
const TNodeVec& tree)
: CArgMinMseImpl{lambda}, m_Eta{eta}, m_Mu{mu}, m_Tree{&tree} {
}
std::unique_ptr<CArgMinLossImpl> CArgMinMseIncrementalImpl::clone() const {
return std::make_unique<CArgMinMseIncrementalImpl>(*this);
}
void CArgMinMseIncrementalImpl::add(const CEncodedDataFrameRowRef& row,
bool newExample,
const TMemoryMappedFloatVector& prediction,
double actual,
double weight) {
this->CArgMinMseImpl::add(prediction, actual, weight);
if (newExample == false) {
m_MeanTreePredictions.add(root(*m_Tree).value(row, *m_Tree)(0));
}
}
void CArgMinMseIncrementalImpl::merge(const CArgMinLossImpl& other) {
const auto* mse = dynamic_cast<const CArgMinMseIncrementalImpl*>(&other);
if (mse != nullptr) {
this->CArgMinMseImpl::merge(*mse);
m_MeanTreePredictions += mse->m_MeanTreePredictions;
}
}
CArgMinMseIncrementalImpl::TDoubleVector CArgMinMseIncrementalImpl::value() const {
// We are searching for the value x^* which satisfies
//
// x^* = argmin_x{ sum_i{(a_i - (p_i + x))^2 + 1{old} mu (p_i' / eta - x)^2} + lambda * x^2 }
//
// Here, a_i are the actuals, p_i the predictions and p_i' the predictions from
// the tree being retrained. This is convex so there is one minimum where the
// derivative w.r.t. x is zero and
//
// x^* = 1 / (n (1 + n' / n mu) + lambda) sum_i{ a_i - p_i + mu / eta 1{old} p_i' }.
//
// where n' = sum_i 1{old}. Denoting the mean prediction error m = 1/n sum_i{ a_i - p_i }
// and the mean tree predictions p' = 1/n' sum_i{p_i'} we have
//
// x^* = n / (n (1 + n' / n mu) + lambda) (m + n' / n mu / eta p').
//
// In the following we absorb n' / n into the value of mu.
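// For intuition, if mu = 0 this reduces to the plain MSE solution n / (n + lambda) m,
// and as mu -> infinity (with old examples present) it tends to p' / eta, i.e. the
// retrained leaf simply reproduces the old tree's prediction.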
double count{common::CBasicStatistics::count(this->meanError())};
double meanError{common::CBasicStatistics::mean(this->meanError())};
double oldCount{common::CBasicStatistics::count(m_MeanTreePredictions)};
double meanTreePrediction{common::CBasicStatistics::mean(m_MeanTreePredictions)};
double mu{(count == oldCount ? 1.0 : oldCount / count) * m_Mu};
TDoubleVector result(1);
result(0) = count == 0.0 ? 0.0
: count / (count * (1.0 + mu) + this->lambda()) *
(meanError + mu / m_Eta * meanTreePrediction);
return result;
}
CArgMinBinomialLogisticLossImpl::CArgMinBinomialLogisticLossImpl(double lambda)
: CArgMinLossImpl{lambda}, m_ClassCounts{0},
m_BucketsClassCounts(NUMBER_BUCKETS, TDoubleVector2x1{0.0}) {
}
std::unique_ptr<CArgMinLossImpl> CArgMinBinomialLogisticLossImpl::clone() const {
return std::make_unique<CArgMinBinomialLogisticLossImpl>(*this);
}
bool CArgMinBinomialLogisticLossImpl::nextPass() {
m_CurrentPass += bucketWidth(m_PredictionMinMax) > 0.0 ? 1 : 2;
return m_CurrentPass < 2;
}
void CArgMinBinomialLogisticLossImpl::add(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) {
switch (m_CurrentPass) {
case 0: {
m_PredictionMinMax.add(prediction(0));
m_ClassCounts(static_cast<std::size_t>(actual)) += weight;
break;
}
case 1: {
auto& count = m_BucketsClassCounts[bucket(m_PredictionMinMax, prediction(0))];
count(static_cast<std::size_t>(actual)) += weight;
break;
}
default:
break;
}
}
void CArgMinBinomialLogisticLossImpl::merge(const CArgMinLossImpl& other) {
const auto* logistic = dynamic_cast<const CArgMinBinomialLogisticLossImpl*>(&other);
if (logistic != nullptr) {
switch (m_CurrentPass) {
case 0:
m_PredictionMinMax += logistic->m_PredictionMinMax;
m_ClassCounts += logistic->m_ClassCounts;
break;
case 1:
for (std::size_t i = 0; i < m_BucketsClassCounts.size(); ++i) {
m_BucketsClassCounts[i] += logistic->m_BucketsClassCounts[i];
}
break;
default:
break;
}
}
}
CArgMinBinomialLogisticLossImpl::TDoubleVector CArgMinBinomialLogisticLossImpl::value() const {
double minWeight;
double maxWeight;
// This is true if and only if all the predictions were identical. In this
// case we only need one pass over the data and can compute the optimal
// value from the counts of the two categories.
if (bucketWidth(m_PredictionMinMax) == 0.0) {
// This is the (unique) predicted value for the rows in leaf by the forest
// so far (i.e. without the weight for the leaf we're about to add).
double prediction{mid(m_PredictionMinMax)};
// Weight shrinkage means the optimal weight will be somewhere between
// the logit of the empirical probability and zero.
double c0{m_ClassCounts(0) + 1.0};
double c1{m_ClassCounts(1) + 1.0};
double empiricalProbabilityC1{c1 / (c0 + c1)};
double empiricalLogOddsC1{common::CTools::stableLog(
empiricalProbabilityC1 / (1.0 - empiricalProbabilityC1))};
minWeight = (empiricalProbabilityC1 < 0.5 ? empiricalLogOddsC1 : 0.0) - prediction;
maxWeight = (empiricalProbabilityC1 < 0.5 ? 0.0 : empiricalLogOddsC1) - prediction;
} else {
// Choose a weight interval in which all probabilities vary from close to
// zero to close to one. In particular, the idea is to minimize the leaf
// weight on an interval [a, b] where if we add "a" the log-odds for all
// rows <= -5, i.e. max prediction + a = -5, and if we add "b" the log-odds
// for all rows >= 5, i.e. min prediction + b = 5.
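// For example, if the predictions span [-2, 3] we search [a, b] = [-8, 7]: adding -8
// drives every log-odds to at most -5 and adding 7 drives every log-odds to at least 5.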
minWeight = -m_PredictionMinMax.max() - 5.0;
maxWeight = -m_PredictionMinMax.min() + 5.0;
}
minWeight = std::min(minWeight, this->minWeight());
maxWeight = std::max(maxWeight, this->maxWeight());
TDoubleVector result(1);
if (minWeight == maxWeight) {
result(0) = minWeight;
return result;
}
double minimum;
double objectiveAtMinimum;
auto objective = this->objective();
std::size_t maxIterations{10};
common::CSolvers::minimize(minWeight, maxWeight, objective(minWeight),
objective(maxWeight), objective, 1e-3,
maxIterations, minimum, objectiveAtMinimum);
LOG_TRACE(<< "minimum = " << minimum << " objective(minimum) = " << objectiveAtMinimum);
result(0) = minimum;
return result;
}
CArgMinBinomialLogisticLossImpl::TObjective CArgMinBinomialLogisticLossImpl::objective() const {
// This is true if all the predictions were identical. In this case we only
// need one pass over the data and can compute the optimal value from the
// counts of the two categories.
if (bucketWidth(m_PredictionMinMax) == 0.0) {
double prediction{mid(m_PredictionMinMax)};
return [prediction, this](double weight) {
double logOdds{prediction + weight};
double c0{m_ClassCounts(0)};
double c1{m_ClassCounts(1)};
return this->lambda() * common::CTools::pow2(weight) -
c0 * logOneMinusLogistic(logOdds) - c1 * logLogistic(logOdds);
};
}
return [this](double weight) {
double loss{this->lambda() * common::CTools::pow2(weight)};
for (std::size_t i = 0; i < m_BucketsClassCounts.size(); ++i) {
double logOdds{bucketCentre(m_PredictionMinMax, i) + weight};
double c0{m_BucketsClassCounts[i](0)};
double c1{m_BucketsClassCounts[i](1)};
loss -= c0 * logOneMinusLogistic(logOdds) + c1 * logLogistic(logOdds);
}
return loss;
};
}
CArgMinBinomialLogisticLossIncrementalImpl::CArgMinBinomialLogisticLossIncrementalImpl(
double lambda,
double eta,
double mu,
const TNodeVec& tree)
: CArgMinBinomialLogisticLossImpl{lambda}, m_Eta{eta}, m_Mu{mu}, m_Tree{&tree},
m_BucketsCount(NUMBER_BUCKETS, 0.0) {
}
std::unique_ptr<CArgMinLossImpl> CArgMinBinomialLogisticLossIncrementalImpl::clone() const {
return std::make_unique<CArgMinBinomialLogisticLossIncrementalImpl>(*this);
}
bool CArgMinBinomialLogisticLossIncrementalImpl::nextPass() {
this->CArgMinBinomialLogisticLossImpl::nextPass();
m_CurrentPass += bucketWidth(m_TreePredictionMinMax) > 0.0 ? 1 : 2;
return m_CurrentPass < 2;
}
void CArgMinBinomialLogisticLossIncrementalImpl::add(const CEncodedDataFrameRowRef& row,
bool newExample,
const TMemoryMappedFloatVector& prediction,
double actual,
double weight) {
this->CArgMinBinomialLogisticLossImpl::add(prediction, actual, weight);
if (newExample == false) {
switch (m_CurrentPass) {
case 0: {
double treePrediction{root(*m_Tree).value(row, *m_Tree)(0) / m_Eta};
m_TreePredictionMinMax.add(treePrediction);
m_Count += weight;
break;
}
case 1: {
double treePrediction{root(*m_Tree).value(row, *m_Tree)(0) / m_Eta};
auto& count = m_BucketsCount[bucket(m_TreePredictionMinMax, treePrediction)];
count += weight;
break;
}
default:
break;
}
}
}
void CArgMinBinomialLogisticLossIncrementalImpl::merge(const CArgMinLossImpl& other) {
const auto* logistic =
dynamic_cast<const CArgMinBinomialLogisticLossIncrementalImpl*>(&other);
if (logistic != nullptr) {
this->CArgMinBinomialLogisticLossImpl::merge(*logistic);
switch (m_CurrentPass) {
case 0:
m_TreePredictionMinMax += logistic->m_TreePredictionMinMax;
m_Count += logistic->m_Count;
break;
case 1:
for (std::size_t i = 0; i < m_BucketsCount.size(); ++i) {
m_BucketsCount[i] += logistic->m_BucketsCount[i];
}
break;
default:
break;
}
}
}
CArgMinBinomialLogisticLossIncrementalImpl::TObjective
CArgMinBinomialLogisticLossIncrementalImpl::objective() const {
// This is true if all the forest and tree predictions were identical.
if (bucketWidth(this->predictionMinMax()) == 0.0 &&
bucketWidth(m_TreePredictionMinMax) == 0.0) {
double prediction{mid(this->predictionMinMax())};
double pOld{common::CTools::logisticFunction(mid(m_TreePredictionMinMax))};
double mu{m_Mu * m_Count};
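// The mu term in the objective below is the (count-weighted) cross entropy between
// the old tree's probability pOld and the probability implied by the candidate leaf
// weight alone; it penalises moving the retrained tree's predictions away from the
// old ones.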
return [prediction, pOld, mu, this](double weight) {
double logOdds{prediction + weight};
double c0{this->classCounts()(0)};
double c1{this->classCounts()(1)};
double logOneMinusPNew{logOneMinusLogistic(weight)};
double logPNew{logLogistic(weight)};
return this->lambda() * common::CTools::pow2(weight) -
c0 * logOneMinusLogistic(logOdds) - c1 * logLogistic(logOdds) -
mu * ((1.0 - pOld) * logOneMinusPNew + pOld * logPNew);
};
}
// This is true if all the forest predictions were identical.
if (bucketWidth(this->predictionMinMax()) == 0.0) {
double prediction{mid(this->predictionMinMax())};
return [prediction, this](double weight) {
double logOdds{prediction + weight};
double c0{this->classCounts()(0)};
double c1{this->classCounts()(1)};
double logOneMinusPNew{logOneMinusLogistic(weight)};
double logPNew{logLogistic(weight)};
double loss{this->lambda() * common::CTools::pow2(weight) -
c0 * logOneMinusLogistic(logOdds) - c1 * logLogistic(logOdds)};
for (std::size_t i = 0; i < NUMBER_BUCKETS; ++i) {
double pOld{common::CTools::logisticFunction(
bucketCentre(m_TreePredictionMinMax, i))};
double mu{m_Mu * m_BucketsCount[i]};
loss -= mu * ((1.0 - pOld) * logOneMinusPNew + pOld * logPNew);
}
return loss;
};
}
// This is true if all the tree predictions were identical.
if (bucketWidth(m_TreePredictionMinMax) == 0.0) {
double pOld{common::CTools::logisticFunction(mid(m_TreePredictionMinMax))};
double mu{m_Mu * m_Count};
return [pOld, mu, this](double weight) {
const auto& predictionMinMax = this->predictionMinMax();
const auto& bucketsClassCounts = this->bucketsClassCounts();
double logOneMinusPNew{logOneMinusLogistic(weight)};
double logPNew{logLogistic(weight)};
double loss{this->lambda() * common::CTools::pow2(weight) -
mu * ((1.0 - pOld) * logOneMinusPNew + pOld * logPNew)};
for (std::size_t i = 0; i < NUMBER_BUCKETS; ++i) {
double logOdds{bucketCentre(predictionMinMax, i) + weight};
double c0{bucketsClassCounts[i](0)};
double c1{bucketsClassCounts[i](1)};
loss -= c0 * logOneMinusLogistic(logOdds) + c1 * logLogistic(logOdds);
}
return loss;
};
}
return [this](double weight) {
const auto& predictionMinMax = this->predictionMinMax();
const auto& bucketsClassCounts = this->bucketsClassCounts();
double logOneMinusPNew{logOneMinusLogistic(weight)};
double logPNew{logLogistic(weight)};
double loss{this->lambda() * common::CTools::pow2(weight)};
for (std::size_t i = 0; i < NUMBER_BUCKETS; ++i) {
double logOdds{bucketCentre(predictionMinMax, i) + weight};
double c0{bucketsClassCounts[i](0)};
double c1{bucketsClassCounts[i](1)};
double mu{m_Mu * m_BucketsCount[i]};
double pOld{common::CTools::logisticFunction(
bucketCentre(m_TreePredictionMinMax, i))};
loss -= c0 * logOneMinusLogistic(logOdds) + c1 * logLogistic(logOdds) +
mu * ((1.0 - pOld) * logOneMinusPNew + pOld * logPNew);
}
return loss;
};
}
CArgMinMultinomialLogisticLossImpl::CArgMinMultinomialLogisticLossImpl(
std::size_t numberClasses,
double lambda,
const common::CPRNG::CXorOShiro128Plus& rng)
: CArgMinLossImpl{lambda}, m_NumberClasses{numberClasses}, m_Rng{rng},
m_ClassCounts{TDoubleVector::Zero(numberClasses)}, m_Sampler{NUMBER_CENTRES} {
}
std::unique_ptr<CArgMinLossImpl> CArgMinMultinomialLogisticLossImpl::clone() const {
return std::make_unique<CArgMinMultinomialLogisticLossImpl>(*this);
}
bool CArgMinMultinomialLogisticLossImpl::nextPass() {
if (m_CurrentPass++ == 0) {
m_Centres = std::move(m_Sampler.samples());
m_CurrentPass += m_Centres.size() == 1 ? 1 : 0;
m_CentresClassCounts.resize(m_Centres.size(), TDoubleVector::Zero(m_NumberClasses));
LOG_TRACE(<< "# centres = " << m_Centres.size());
}
LOG_TRACE(<< "current pass = " << m_CurrentPass);
return m_CurrentPass < 2;
}
void CArgMinMultinomialLogisticLossImpl::add(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) {
using TMinAccumulator =
common::CBasicStatistics::SMin<std::pair<double, std::size_t>>::TAccumulator;
switch (m_CurrentPass) {
case 0: {
// We have a member variable to avoid allocating a temporary each time.
m_DoublePrediction = prediction;
m_Sampler.sample(m_DoublePrediction);
m_ClassCounts(static_cast<std::size_t>(actual)) += weight;
break;
}
case 1: {
if (m_Centres.empty()) {
HANDLE_FATAL(<< "Internal error: failed computing leaf weights.");
return;
}
TMinAccumulator nearest;
for (std::size_t i = 0; i < m_Centres.size(); ++i) {
nearest.add({(m_Centres[i] - prediction).squaredNorm(), i});
}
auto& count = m_CentresClassCounts[nearest[0].second];
count(static_cast<std::size_t>(actual)) += weight;
break;
}
default:
break;
}
}
void CArgMinMultinomialLogisticLossImpl::merge(const CArgMinLossImpl& other) {
const auto* logistic = dynamic_cast<const CArgMinMultinomialLogisticLossImpl*>(&other);
if (logistic != nullptr) {
switch (m_CurrentPass) {
case 0:
m_Sampler.merge(logistic->m_Sampler);
m_ClassCounts += logistic->m_ClassCounts;
break;
case 1:
for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) {
m_CentresClassCounts[i] += logistic->m_CentresClassCounts[i];
}
break;
default:
break;
}
}
}
CArgMinMultinomialLogisticLossImpl::TDoubleVector
CArgMinMultinomialLogisticLossImpl::value() const {
// The optimisation objective is convex. To see this note that we can write
// it as sum_i{ f_ij(w) } + ||w||^2 with f_ij(w) = -[log(softmax_j(z_i + w))].
// Since the sum of convex functions is convex and ||.|| is clearly convex we
// just require the f_ij to be convex. This is a standard result and follows from
// the fact that their Hessian is of the form H = diag(p) - p p^t where 1-norm
// of p is one. Convexity follows if this is positive semidefinite. To verify note
// that x^t H x = ||p^(1/2) x||^2 ||p^(1/2)||^2 - (p^t x)^2, which is non-negative
// for all x by Cauchy-Schwarz. We optimize via L-BFGS. Note also that
// we truncate lambda to be positive so the weights don't become too large for
// leaves with only one class.
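// For reference, the gradient of -log(softmax_j(z + w)) with respect to w is
// softmax(z + w) - e_j, which is what objectiveGradient() accumulates weighted
// by the per class counts.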
TObjective objective{this->objective()};
TObjectiveGradient objectiveGradient{this->objectiveGradient()};
TDoubleVector wmin{TDoubleVector::Zero(m_NumberClasses)};
double loss;
common::CLbfgs<TDoubleVector> lbfgs{5};
std::tie(wmin, loss) = lbfgs.minimize(objective, objectiveGradient, std::move(wmin));
LOG_TRACE(<< "loss* = " << loss << " weight* = " << wmin.transpose());
return wmin;
}
CArgMinMultinomialLogisticLossImpl::TObjective
CArgMinMultinomialLogisticLossImpl::objective() const {
TDoubleVector logProbabilities{m_NumberClasses};
double lambda{std::max(this->lambda(), 1e-6)};
if (m_Centres.size() == 1) {
return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable -> double {
logProbabilities = m_Centres[0] + weight;
common::CTools::inplaceLogSoftmax(logProbabilities);
return lambda * weight.squaredNorm() - m_ClassCounts.transpose() * logProbabilities;
};
}
return [logProbabilities, lambda, this](const TDoubleVector& weight) mutable -> double {
double loss{0.0};
for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) {
if (m_CentresClassCounts[i].sum() > 0.0) {
logProbabilities = m_Centres[i] + weight;
common::CTools::inplaceLogSoftmax(logProbabilities);
loss -= m_CentresClassCounts[i].transpose() * logProbabilities;
}
}
return loss + lambda * weight.squaredNorm();
};
}
CArgMinMultinomialLogisticLossImpl::TObjectiveGradient
CArgMinMultinomialLogisticLossImpl::objectiveGradient() const {
TDoubleVector probabilities{m_NumberClasses};
TDoubleVector lossGradient{m_NumberClasses};
double lambda{std::max(this->lambda(), 1e-6)};
if (m_Centres.size() == 1) {
return [probabilities, lossGradient, lambda,
this](const TDoubleVector& weight) mutable -> TDoubleVector {
probabilities = m_Centres[0] + weight;
common::CTools::inplaceSoftmax(probabilities);
lossGradient = m_ClassCounts.array().sum() * probabilities - m_ClassCounts;
return 2.0 * lambda * weight + lossGradient;
};
}
return [probabilities, lossGradient, lambda,
this](const TDoubleVector& weight) mutable -> TDoubleVector {
lossGradient.array() = 0.0;
for (std::size_t i = 0; i < m_CentresClassCounts.size(); ++i) {
double n{m_CentresClassCounts[i].array().sum()};
if (n > 0.0) {
probabilities = m_Centres[i] + weight;
common::CTools::inplaceSoftmax(probabilities);
lossGradient -= m_CentresClassCounts[i] - n * probabilities;
}
}
return 2.0 * lambda * weight + lossGradient;
};
}
CArgMinMsleImpl::CArgMinMsleImpl(double lambda, double offset)
: CArgMinLossImpl{lambda}, m_Offset{offset}, m_Buckets(MSLE_BUCKET_SIZE) {
for (auto& bucket : m_Buckets) {
bucket.resize(MSLE_BUCKET_SIZE);
}
}
std::unique_ptr<CArgMinLossImpl> CArgMinMsleImpl::clone() const {
return std::make_unique<CArgMinMsleImpl>(*this);
}
bool CArgMinMsleImpl::nextPass() {
m_CurrentPass += this->bucketWidth().first > 0.0 ? 1 : 2;
return m_CurrentPass < 2;
}
void CArgMinMsleImpl::add(const TMemoryMappedFloatVector& prediction, double actual, double weight) {
double expPrediction{common::CTools::stableExp(prediction[0])};
double logActual{common::CTools::fastLog(m_Offset + actual)};
switch (m_CurrentPass) {
case 0: {
m_ExpPredictionMinMax.add(expPrediction);
m_LogActualMinMax.add(logActual);
m_MeanLogActual.add(logActual, weight);
break;
}
case 1: {
double logError{logActual - common::CTools::fastLog(m_Offset + expPrediction)};
TVector example;
example(MSLE_PREDICTION_INDEX) = expPrediction;
example(MSLE_ACTUAL_INDEX) = logActual;
example(MSLE_ERROR_INDEX) = logError;
auto bucketIndex{this->bucket(expPrediction, logActual)};
m_Buckets[bucketIndex.first][bucketIndex.second].add(example, weight);
break;
}
default:
break;
}
}
void CArgMinMsleImpl::merge(const CArgMinLossImpl& other) {
const auto* msle = dynamic_cast<const CArgMinMsleImpl*>(&other);
if (msle != nullptr) {
switch (m_CurrentPass) {
case 0:
m_ExpPredictionMinMax += msle->m_ExpPredictionMinMax;
m_LogActualMinMax += msle->m_LogActualMinMax;
m_MeanLogActual += msle->m_MeanLogActual;
break;
case 1:
for (std::size_t i = 0; i < m_Buckets.size(); ++i) {
for (std::size_t j = 0; j < m_Buckets[i].size(); ++j) {
m_Buckets[i][j] += msle->m_Buckets[i][j];
}
}
break;
default:
break;
}
}
}
CArgMinMsleImpl::TDoubleVector CArgMinMsleImpl::value() const {
TObjective objective;
double minLogWeight;
double maxLogWeight;
objective = this->objective();
if (this->bucketWidth().first == 0.0) {
minLogWeight = -4.0;
maxLogWeight = common::CBasicStatistics::mean(m_MeanLogActual);
} else {
// If the weight is smaller than the minimum log error every prediction is low.
// Conversely, if it's larger than the maximum log error every prediction is
// high. In both cases, we can reduce the error by making the weight larger,
// respectively smaller. Therefore, the optimal weight must lie between these
// values.
minLogWeight = std::numeric_limits<double>::max();
maxLogWeight = -minLogWeight;
for (const auto& bucketsPrediction : m_Buckets) {
for (const auto& bucketActual : bucketsPrediction) {
minLogWeight = std::min(
minLogWeight, common::CBasicStatistics::mean(bucketActual)(MSLE_ERROR_INDEX));
maxLogWeight = std::max(
maxLogWeight, common::CBasicStatistics::mean(bucketActual)(MSLE_ERROR_INDEX));
}
}
}
double minimizer;
double objectiveAtMinimum;
std::size_t maxIterations{MSLE_OPTIMIZATION_ITERATIONS};
common::CSolvers::minimize(minLogWeight, maxLogWeight, objective(minLogWeight),
objective(maxLogWeight), objective, 1e-5,
maxIterations, minimizer, objectiveAtMinimum);
LOG_TRACE(<< "minimum = " << minimizer << " objective(minimum) = " << objectiveAtMinimum);
TDoubleVector result(1);
result(0) = minimizer;
return result;
}
CArgMinMsleImpl::TObjective CArgMinMsleImpl::objective() const {
return [this](double logWeight) {
double weight{common::CTools::stableExp(logWeight)};
if (this->bucketWidth().first == 0.0) {
// prediction is constant
double expPrediction{m_ExpPredictionMinMax.max()};
double logPrediction{common::CTools::fastLog(m_Offset + expPrediction * weight)};
double meanLogActual{common::CBasicStatistics::mean(m_MeanLogActual)};
double meanLogActualSquared{common::CBasicStatistics::variance(m_MeanLogActual) +
common::CTools::pow2(meanLogActual)};
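// Since the prediction is constant over the leaf, E[(log a - log p)^2] expands to
// E[(log a)^2] - 2 E[log a] log p + (log p)^2, computed below from the stored mean
// and variance of the log actuals.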
double loss{meanLogActualSquared - 2.0 * meanLogActual * logPrediction +
common::CTools::pow2(logPrediction)};
return loss + this->lambda() * common::CTools::pow2(weight);
}
double loss{0.0};
double totalCount{0.0};
for (const auto& bucketPrediction : m_Buckets) {
for (const auto& bucketActual : bucketPrediction) {
double count{common::CBasicStatistics::count(bucketActual)};
if (count > 0.0) {
const auto& bucketMean{common::CBasicStatistics::mean(bucketActual)};
double expPrediction{bucketMean(MSLE_PREDICTION_INDEX)};
double logActual{bucketMean(MSLE_ACTUAL_INDEX)};
double logPrediction{
common::CTools::fastLog(m_Offset + expPrediction * weight)};
loss += count * common::CTools::pow2(logActual - logPrediction);
totalCount += count;
}
}
}
return loss / totalCount + this->lambda() * common::CTools::pow2(weight);
};
}
CArgMinPseudoHuberImpl::CArgMinPseudoHuberImpl(double lambda, double delta)
: CArgMinLossImpl{lambda}, m_DeltaSquared{common::CTools::pow2(delta)},
m_Buckets(HUBER_BUCKET_SIZE) {
}
std::unique_ptr<CArgMinLossImpl> CArgMinPseudoHuberImpl::clone() const {
return std::make_unique<CArgMinPseudoHuberImpl>(*this);
}
bool CArgMinPseudoHuberImpl::nextPass() {
m_CurrentPass += this->bucketWidth() > 0.0 ? 1 : 2;
return m_CurrentPass < 2;
}
void CArgMinPseudoHuberImpl::add(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) {
switch (m_CurrentPass) {
case 0: {
m_ErrorMinMax.add(actual - prediction[0]);
break;
}
case 1: {
double error{actual - prediction[0]};
auto bucketIndex{this->bucket(error)};
m_Buckets[bucketIndex].add(error, weight);
break;
}
default:
break;
}
}
void CArgMinPseudoHuberImpl::merge(const CArgMinLossImpl& other) {
const auto* huber = dynamic_cast<const CArgMinPseudoHuberImpl*>(&other);
if (huber != nullptr) {
switch (m_CurrentPass) {
case 0:
m_ErrorMinMax += huber->m_ErrorMinMax;
break;
case 1:
for (std::size_t i = 0; i < m_Buckets.size(); ++i) {
m_Buckets[i] += huber->m_Buckets[i];
}
break;
default:
break;
}
}
}
CArgMinPseudoHuberImpl::TDoubleVector CArgMinPseudoHuberImpl::value() const {
// Set the lower (upper) bounds for minimisation such that every example will
// have the same sign error if the weight is smaller (larger) than this bound
// and so would only increase the loss.
double minWeight = m_ErrorMinMax.min();
double maxWeight = m_ErrorMinMax.max();
TObjective objective{this->objective()};
double minimizer;
double objectiveAtMinimum;
std::size_t maxIterations{HUBER_OPTIMIZATION_ITERATIONS};
common::CSolvers::minimize(minWeight, maxWeight, objective(minWeight),
objective(maxWeight), objective, 1e-5,
maxIterations, minimizer, objectiveAtMinimum);
LOG_TRACE(<< "minimum = " << minimizer << " objective(minimum) = " << objectiveAtMinimum);
TDoubleVector result(1);
result(0) = minimizer;
return result;
}
CArgMinPseudoHuberImpl::TObjective CArgMinPseudoHuberImpl::objective() const {
return [this](double weight) {
if (m_DeltaSquared > 0) {
double loss{0.0};
double totalCount{0.0};
for (const auto& bucket : m_Buckets) {
double count{common::CBasicStatistics::count(bucket)};
if (count > 0.0) {
double error{common::CBasicStatistics::mean(bucket)};
loss += count * m_DeltaSquared *
(std::sqrt(1.0 + common::CTools::pow2(error - weight) / m_DeltaSquared) -
1.0);
totalCount += count;
}
}
return loss / totalCount + this->lambda() * common::CTools::pow2(weight);
}
return 0.0;
};
}
}
namespace boosted_tree {
void CLoss::persistLoss(core::CStatePersistInserter& inserter) const {
auto persist = [this](core::CStatePersistInserter& inserter_) {
this->acceptPersistInserter(inserter_);
};
inserter.insertLevel(this->name(), persist);
}
CLoss::TLossUPtr CLoss::restoreLoss(core::CStateRestoreTraverser& traverser) {
const std::string& lossFunctionName{traverser.name()};
try {
if (lossFunctionName == CMse::NAME) {
return std::make_unique<CMse>(traverser);
}
if (lossFunctionName == CMsle::NAME) {
return std::make_unique<CMsle>(traverser);
}
if (lossFunctionName == CPseudoHuber::NAME) {
return std::make_unique<CPseudoHuber>(traverser);
}
if (lossFunctionName == CBinomialLogisticLoss::NAME) {
return std::make_unique<CBinomialLogisticLoss>(traverser);
}
if (lossFunctionName == CMultinomialLogisticLoss::NAME) {
return std::make_unique<CMultinomialLogisticLoss>(traverser);
}
} catch (const std::exception& e) {
LOG_ERROR(<< "Error restoring loss function " << lossFunctionName << " "
<< e.what());
return nullptr;
}
LOG_ERROR(<< "Error restoring loss function. Unknown loss function type '"
<< lossFunctionName << "'.");
return nullptr;
}
CArgMinLoss::CArgMinLoss(const CArgMinLoss& other)
: m_Impl{other.m_Impl->clone()} {
}
CArgMinLoss& CArgMinLoss::operator=(const CArgMinLoss& other) {
if (this != &other) {
m_Impl = other.m_Impl->clone();
}
return *this;
}
bool CArgMinLoss::nextPass() const {
return m_Impl->nextPass();
}
void CArgMinLoss::add(const CEncodedDataFrameRowRef& row,
bool newExample,
const TMemoryMappedFloatVector& prediction,
double actual,
double weight) {
return m_Impl->add(row, newExample, prediction, actual, weight);
}
void CArgMinLoss::merge(CArgMinLoss& other) {
return m_Impl->merge(*other.m_Impl);
}
CArgMinLoss::TDoubleVector CArgMinLoss::value() const {
return m_Impl->value();
}
CArgMinLoss::CArgMinLoss(const CArgMinLossImpl& impl) : m_Impl{impl.clone()} {
}
CArgMinLoss CLoss::makeMinimizer(const boosted_tree_detail::CArgMinLossImpl& impl) {
return CArgMinLoss{impl};
}
CMse::CMse(core::CStateRestoreTraverser& traverser) {
if (traverser.traverseSubLevel([this](auto& traverser_) {
return this->acceptRestoreTraverser(traverser_);
}) == false) {
throw std::runtime_error{"failed to restore CMse"};
}
}
CMse::TLossUPtr CMse::clone() const {
return std::make_unique<CMse>(*this);
}
CMse::TLossUPtr CMse::incremental(double eta, double mu, const TNodeVec& tree) const {
return std::make_unique<CMseIncremental>(eta, mu, tree);
}
CMse::TLossUPtr CMse::project(std::size_t /*numberThreads*/,
core::CDataFrame& /*frame*/,
const core::CPackedBitVector& /*rowMask*/,
std::size_t /*targetColumn*/,
const TSizeVec& /*extraColumns*/,
common::CPRNG::CXorOShiro128Plus /*rng*/) const {
return this->clone();
}
ELossType CMse::type() const {
return E_MseRegression;
}
std::size_t CMse::dimensionPrediction() const {
return 1;
}
std::size_t CMse::dimensionGradient() const {
return 1;
}
double CMse::value(const TMemoryMappedFloatVector& prediction, double actual, double weight) const {
return weight * common::CTools::pow2(prediction(0) - actual);
}
void CMse::gradient(const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
writer(0, 2.0 * weight * (prediction(0) - actual));
}
void CMse::curvature(const TMemoryMappedFloatVector& /*prediction*/,
double /*actual*/,
const TWriter& writer,
double weight) const {
writer(0, 2.0 * weight);
}
bool CMse::isCurvatureConstant() const {
return true;
}
double CMse::difference(const TMemoryMappedFloatVector& prediction,
const TMemoryMappedFloatVector& previousPrediction,
double weight) const {
return weight * common::CTools::pow2(prediction(0) - previousPrediction(0));
}
CMse::TDoubleVector CMse::transform(const TMemoryMappedFloatVector& prediction) const {
return TDoubleVector{prediction};
}
CArgMinLoss CMse::minimizer(double lambda, const common::CPRNG::CXorOShiro128Plus&) const {
return this->makeMinimizer(CArgMinMseImpl{lambda});
}
const std::string& CMse::name() const {
return NAME;
}
bool CMse::isRegression() const {
return true;
}
void CMse::acceptPersistInserter(core::CStatePersistInserter& /* inserter */) const {
}
bool CMse::acceptRestoreTraverser(core::CStateRestoreTraverser& /* traverser */) {
return true;
}
const std::string CMse::NAME{"mse"};
CMseIncremental::CMseIncremental(double eta, double mu, const TNodeVec& tree)
: m_Eta{eta}, m_Mu{mu}, m_Tree{&tree} {
}
CMseIncremental::TLossUPtr CMseIncremental::clone() const {
return std::make_unique<CMseIncremental>(*this);
}
CMseIncremental::TLossUPtr
CMseIncremental::incremental(double eta, double mu, const TNodeVec& tree) const {
return std::make_unique<CMseIncremental>(eta, mu, tree);
}
CMseIncremental::TLossUPtr
CMseIncremental::project(std::size_t /*numberThreads*/,
core::CDataFrame& /*frame*/,
const core::CPackedBitVector& /*rowMask*/,
std::size_t /*targetColumn*/,
const TSizeVec& /*extraColumns*/,
common::CPRNG::CXorOShiro128Plus /*rng*/) const {
return this->clone();
}
ELossType CMseIncremental::type() const {
return E_MseRegression;
}
std::size_t CMseIncremental::dimensionPrediction() const {
return 1;
}
std::size_t CMseIncremental::dimensionGradient() const {
return 1;
}
double CMseIncremental::value(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) const {
// This purposely doesn't include any loss term for changing the prediction.
// This is used to estimate the quality of a retrained forest and select
// hyperparameters which penalise changing predictions such as mu. As such
// we compute loss on a hold out from the old data to act as a proxy for how
// much we might have damaged accuracy on the original training data.
return this->CMse::value(prediction, actual, weight);
}
void CMseIncremental::gradient(const CEncodedDataFrameRowRef& row,
bool newExample,
const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
if (newExample) {
this->CMse::gradient(prediction, actual, writer, weight);
} else {
double treePrediction{root(*m_Tree).value(row, *m_Tree)(0)};
writer(0, 2.0 * weight * (prediction(0) - actual + m_Mu / m_Eta * treePrediction));
}
}
void CMseIncremental::curvature(bool newExample,
const TMemoryMappedFloatVector& /*prediction*/,
double /*actual*/,
const TWriter& writer,
double weight) const {
writer(0, 2.0 * weight * (1.0 + (newExample ? 0.0 : m_Mu)));
}
bool CMseIncremental::isCurvatureConstant() const {
return true;
}
double CMseIncremental::difference(const TMemoryMappedFloatVector& prediction,
const TMemoryMappedFloatVector& previousPrediction,
double weight) const {
return this->CMse::difference(prediction, previousPrediction, weight);
}
CMse::TDoubleVector CMseIncremental::transform(const TMemoryMappedFloatVector& prediction) const {
return TDoubleVector{prediction};
}
CArgMinLoss CMseIncremental::minimizer(double lambda,
const common::CPRNG::CXorOShiro128Plus&) const {
return this->makeMinimizer(CArgMinMseIncrementalImpl{lambda, m_Eta, m_Mu, *m_Tree});
}
const std::string& CMseIncremental::name() const {
return NAME;
}
bool CMseIncremental::isRegression() const {
return true;
}
bool CMseIncremental::acceptRestoreTraverser(core::CStateRestoreTraverser&) {
return false;
}
const std::string CMseIncremental::NAME{"mse_incremental"};
CMsle::CMsle(double offset) : m_Offset{offset} {
}
CMsle::CMsle(core::CStateRestoreTraverser& traverser) {
if (traverser.traverseSubLevel([this](auto& traverser_) {
return this->acceptRestoreTraverser(traverser_);
}) == false) {
throw std::runtime_error{"failed to restore CMsle"};
}
}
CMsle::TLossUPtr CMsle::clone() const {
return std::make_unique<CMsle>(*this);
}
CMsle::TLossUPtr CMsle::incremental(double, double, const TNodeVec&) const {
return nullptr;
}
CMsle::TLossUPtr CMsle::project(std::size_t /*numberThreads*/,
core::CDataFrame& /*frame*/,
const core::CPackedBitVector& /*rowMask*/,
std::size_t /*targetColumn*/,
const TSizeVec& /*extraColumns*/,
common::CPRNG::CXorOShiro128Plus /*rng*/) const {
return this->clone();
}
ELossType CMsle::type() const {
return E_MsleRegression;
}
std::size_t CMsle::dimensionPrediction() const {
return 1;
}
std::size_t CMsle::dimensionGradient() const {
return 1;
}
double CMsle::value(const TMemoryMappedFloatVector& logPrediction, double actual, double weight) const {
double prediction{common::CTools::stableExp(logPrediction(0))};
double logOffsetPrediction{common::CTools::stableLog(m_Offset + prediction)};
if (actual < 0.0) {
HANDLE_FATAL(<< "Input error: target value needs to be non-negative to use "
<< "with MSLE loss, received: " << actual);
}
double logOffsetActual{common::CTools::stableLog(m_Offset + actual)};
return weight * common::CTools::pow2(logOffsetPrediction - logOffsetActual);
}
void CMsle::gradient(const TMemoryMappedFloatVector& logPrediction,
double actual,
const TWriter& writer,
double weight) const {
double prediction{common::CTools::stableExp(logPrediction(0))};
double log1PlusPrediction{common::CTools::stableLog(m_Offset + prediction)};
double log1PlusActual{common::CTools::stableLog(m_Offset + actual)};
writer(0, 2.0 * weight * (log1PlusPrediction - log1PlusActual) / (prediction + 1.0));
}
void CMsle::curvature(const TMemoryMappedFloatVector& logPrediction,
double actual,
const TWriter& writer,
double weight) const {
double prediction{common::CTools::stableExp(logPrediction(0))};
double logOffsetPrediction{common::CTools::stableLog(m_Offset + prediction)};
double logOffsetActual{common::CTools::stableLog(m_Offset + actual)};
// Apply L'Hopital's rule in the limit prediction -> actual, for which the
// ratio below tends to 1 / (offset + actual)^2.
writer(0, prediction == actual
? 2.0 * weight / common::CTools::pow2(m_Offset + actual)
: 2.0 * weight * (logOffsetPrediction - logOffsetActual) /
((prediction + m_Offset) * (prediction - actual)));
}
bool CMsle::isCurvatureConstant() const {
return false;
}
double CMsle::difference(const TMemoryMappedFloatVector& logPrediction,
const TMemoryMappedFloatVector& logPreviousPrediction,
double weight) const {
double prediction{common::CTools::stableExp(logPrediction(0))};
double previousPrediction{common::CTools::stableExp(logPreviousPrediction(0))};
double logOffsetPrediction{common::CTools::stableLog(m_Offset + prediction)};
double logOffsetPreviousPrediction{common::CTools::stableLog(m_Offset + previousPrediction)};
return weight * common::CTools::pow2(logOffsetPrediction - logOffsetPreviousPrediction);
}
CMsle::TDoubleVector CMsle::transform(const TMemoryMappedFloatVector& prediction) const {
TDoubleVector result{1};
result(0) = std::exp(prediction(0));
return result;
}
CArgMinLoss CMsle::minimizer(double lambda,
const common::CPRNG::CXorOShiro128Plus& /* rng */) const {
return this->makeMinimizer(CArgMinMsleImpl{lambda});
}
const std::string& CMsle::name() const {
return NAME;
}
bool CMsle::isRegression() const {
return true;
}
void CMsle::acceptPersistInserter(core::CStatePersistInserter& inserter) const {
core::CPersistUtils::persist(OFFSET_TAG, m_Offset, inserter);
}
bool CMsle::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) {
do {
const std::string& name = traverser.name();
RESTORE(OFFSET_TAG, core::CPersistUtils::restore(OFFSET_TAG, m_Offset, traverser))
} while (traverser.next());
return true;
}
const std::string CMsle::NAME{"msle"};
CPseudoHuber::CPseudoHuber(double delta) : m_Delta{delta} {}
CPseudoHuber::CPseudoHuber(core::CStateRestoreTraverser& traverser) {
if (traverser.traverseSubLevel([this](auto& traverser_) {
return this->acceptRestoreTraverser(traverser_);
}) == false) {
throw std::runtime_error{"failed to restore CPseudoHuber"};
}
}
CPseudoHuber::TLossUPtr CPseudoHuber::clone() const {
return std::make_unique<CPseudoHuber>(*this);
}
CPseudoHuber::TLossUPtr CPseudoHuber::incremental(double, double, const TNodeVec&) const {
return nullptr;
}
CPseudoHuber::TLossUPtr CPseudoHuber::project(std::size_t /*numberThreads*/,
core::CDataFrame& /*frame*/,
const core::CPackedBitVector& /*rowMask*/,
std::size_t /*targetColumn*/,
const TSizeVec& /*extraColumns*/,
common::CPRNG::CXorOShiro128Plus /*rng*/) const {
return this->clone();
}
ELossType CPseudoHuber::type() const {
return E_HuberRegression;
}
std::size_t CPseudoHuber::dimensionPrediction() const {
return 1;
}
std::size_t CPseudoHuber::dimensionGradient() const {
return 1;
}
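// Note the pseudo-Huber loss behaves like 0.5 * (actual - prediction)^2 when the
// error is much smaller than delta and like delta * |actual - prediction| when it
// is much larger, trading off MSE and MAE style behaviour.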
double CPseudoHuber::value(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) const {
double delta2{common::CTools::pow2(m_Delta)};
return weight * delta2 *
(std::sqrt(1.0 + common::CTools::pow2(actual - prediction(0)) / delta2) - 1.0);
}
void CPseudoHuber::gradient(const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
writer(0, weight * (prediction(0) - actual) /
(std::sqrt(1.0 + common::CTools::pow2((actual - prediction(0)) / m_Delta))));
}
void CPseudoHuber::curvature(const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
double result{
1.0 / (std::sqrt(1.0 + common::CTools::pow2((actual - prediction(0)) / m_Delta)))};
writer(0, weight * result);
}
bool CPseudoHuber::isCurvatureConstant() const {
return false;
}
double CPseudoHuber::difference(const TMemoryMappedFloatVector& prediction,
const TMemoryMappedFloatVector& previousPrediction,
double weight) const {
double delta2{common::CTools::pow2(m_Delta)};
return weight * delta2 *
(std::sqrt(1.0 + common::CTools::pow2(prediction(0) - previousPrediction(0)) / delta2) -
1.0);
}
CPseudoHuber::TDoubleVector
CPseudoHuber::transform(const TMemoryMappedFloatVector& prediction) const {
TDoubleVector result{1};
result(0) = prediction(0);
return result;
}
CArgMinLoss CPseudoHuber::minimizer(double lambda,
const common::CPRNG::CXorOShiro128Plus& /* rng */) const {
return this->makeMinimizer(CArgMinPseudoHuberImpl{lambda, m_Delta});
}
const std::string& CPseudoHuber::name() const {
return NAME;
}
bool CPseudoHuber::isRegression() const {
return true;
}
void CPseudoHuber::acceptPersistInserter(core::CStatePersistInserter& inserter) const {
core::CPersistUtils::persist(DELTA_TAG, m_Delta, inserter);
}
bool CPseudoHuber::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) {
do {
const std::string& name = traverser.name();
RESTORE(DELTA_TAG, core::CPersistUtils::restore(DELTA_TAG, m_Delta, traverser))
} while (traverser.next());
return true;
}
const std::string CPseudoHuber::NAME{"pseudo_huber"};
CBinomialLogisticLoss::CBinomialLogisticLoss(core::CStateRestoreTraverser& traverser) {
if (traverser.traverseSubLevel([this](auto& traverser_) {
return this->acceptRestoreTraverser(traverser_);
}) == false) {
throw std::runtime_error{"failed to restore CBinomialLogisticLoss"};
}
}
CBinomialLogisticLoss::TLossUPtr CBinomialLogisticLoss::clone() const {
return std::make_unique<CBinomialLogisticLoss>(*this);
}
CBinomialLogisticLoss::TLossUPtr
CBinomialLogisticLoss::incremental(double eta, double mu, const TNodeVec& tree) const {
return std::make_unique<CBinomialLogisticLossIncremental>(eta, mu, tree);
}
CBinomialLogisticLoss::TLossUPtr
CBinomialLogisticLoss::project(std::size_t /*numberThreads*/,
core::CDataFrame& /*frame*/,
const core::CPackedBitVector& /*rowMask*/,
std::size_t /*targetColumn*/,
const TSizeVec& /*extraColumns*/,
common::CPRNG::CXorOShiro128Plus /*rng*/) const {
return this->clone();
}
ELossType CBinomialLogisticLoss::type() const {
return E_BinaryClassification;
}
std::size_t CBinomialLogisticLoss::dimensionPrediction() const {
return 1;
}
std::size_t CBinomialLogisticLoss::dimensionGradient() const {
return 1;
}
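// The loss is the binomial cross entropy (negative log-likelihood) of the actual
// label under the probability p = logistic(prediction).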
double CBinomialLogisticLoss::value(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) const {
return -weight * ((1.0 - actual) * logOneMinusLogistic(prediction(0)) +
actual * logLogistic(prediction(0)));
}
void CBinomialLogisticLoss::gradient(const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
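// For large log-odds z the logistic function saturates at 1 and the gradient p - 1
// would lose all precision; use p - 1 = -1 / (1 + e^z) ~ -e^(-z) in that regime.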
if (prediction(0) > -LOG_EPSILON && actual == 1.0) {
writer(0, -weight * common::CTools::stableExp(-prediction(0)));
} else {
writer(0, weight * (common::CTools::logisticFunction(prediction(0)) - actual));
}
}
void CBinomialLogisticLoss::curvature(const TMemoryMappedFloatVector& prediction,
double /*actual*/,
const TWriter& writer,
double weight) const {
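// Same rationale as gradient(): for large log-odds z we have p (1 - p) ~ e^(-z).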
if (prediction(0) > -LOG_EPSILON) {
writer(0, weight * common::CTools::stableExp(-prediction(0)));
} else {
double probability{common::CTools::logisticFunction(prediction(0))};
writer(0, weight * probability * (1.0 - probability));
}
}
bool CBinomialLogisticLoss::isCurvatureConstant() const {
return false;
}
double CBinomialLogisticLoss::difference(const TMemoryMappedFloatVector& prediction,
const TMemoryMappedFloatVector& previousPrediction,
double weight) const {
// The cross entropy of the new predicted probabilities given the previous ones.
double previousProbability{common::CTools::logisticFunction(previousPrediction(0))};
return -weight * ((1.0 - previousProbability) * logOneMinusLogistic(prediction(0)) +
previousProbability * logLogistic(prediction(0)));
}
CBinomialLogisticLoss::TDoubleVector
CBinomialLogisticLoss::transform(const TMemoryMappedFloatVector& prediction) const {
double p1{common::CTools::logisticFunction(prediction(0))};
TDoubleVector result{2};
result(0) = 1.0 - p1;
result(1) = p1;
return result;
}
CArgMinLoss CBinomialLogisticLoss::minimizer(double lambda,
const common::CPRNG::CXorOShiro128Plus&) const {
return this->makeMinimizer(CArgMinBinomialLogisticLossImpl{lambda});
}
const std::string& CBinomialLogisticLoss::name() const {
return NAME;
}
bool CBinomialLogisticLoss::isRegression() const {
return false;
}
void CBinomialLogisticLoss::acceptPersistInserter(core::CStatePersistInserter& /* inserter */) const {
}
bool CBinomialLogisticLoss::acceptRestoreTraverser(core::CStateRestoreTraverser& /* traverser */) {
return true;
}
const std::string CBinomialLogisticLoss::NAME{"binomial_logistic"};
CBinomialLogisticLossIncremental::CBinomialLogisticLossIncremental(double eta,
double mu,
const TNodeVec& tree)
: m_Eta{eta}, m_Mu{mu}, m_Tree{&tree} {
}
CBinomialLogisticLossIncremental::TLossUPtr CBinomialLogisticLossIncremental::clone() const {
return std::make_unique<CBinomialLogisticLossIncremental>(*this);
}
CBinomialLogisticLossIncremental::TLossUPtr
CBinomialLogisticLossIncremental::incremental(double eta, double mu, const TNodeVec& tree) const {
return std::make_unique<CBinomialLogisticLossIncremental>(eta, mu, tree);
}
CBinomialLogisticLossIncremental::TLossUPtr
CBinomialLogisticLossIncremental::project(std::size_t /*numberThreads*/,
core::CDataFrame& /*frame*/,
const core::CPackedBitVector& /*rowMask*/,
std::size_t /*targetColumn*/,
const TSizeVec& /*extraColumns*/,
common::CPRNG::CXorOShiro128Plus /*rng*/) const {
return this->clone();
}
ELossType CBinomialLogisticLossIncremental::type() const {
return E_BinaryClassification;
}
std::size_t CBinomialLogisticLossIncremental::dimensionPrediction() const {
return 1;
}
std::size_t CBinomialLogisticLossIncremental::dimensionGradient() const {
return 1;
}
double CBinomialLogisticLossIncremental::value(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) const {
// This purposely doesn't include any loss term for changing the prediction.
// This is used to estimate the quality of a retrained forest and select
// hyperparameters which penalise changing predictions such as mu. As such
// we compute loss on a hold out from the old data to act as a proxy for how
// much we might have damaged accuracy on the original training data.
return this->CBinomialLogisticLoss::value(prediction, actual, weight);
}
void CBinomialLogisticLossIncremental::gradient(const CEncodedDataFrameRowRef& row,
bool newExample,
const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
if (newExample) {
this->CBinomialLogisticLoss::gradient(prediction, actual, writer, weight);
} else {
double treePrediction{common::CTools::logisticFunction(
root(*m_Tree).value(row, *m_Tree)(0) / m_Eta)};
if (prediction(0) > -LOG_EPSILON && actual == 1.0) {
writer(0, -weight * ((1.0 + m_Mu) * common::CTools::stableExp(-prediction(0)) +
m_Mu * (treePrediction - 1.0)));
} else {
writer(0, weight * ((1.0 + m_Mu) * common::CTools::logisticFunction(prediction(0)) -
actual - m_Mu * treePrediction));
}
}
}
void CBinomialLogisticLossIncremental::curvature(const CEncodedDataFrameRowRef& /*row*/,
bool newExample,
const TMemoryMappedFloatVector& prediction,
double /*actual*/,
const TWriter& writer,
double weight) const {
if (prediction(0) > -LOG_EPSILON) {
writer(0, weight * (newExample ? 1.0 : 1.0 + m_Mu) *
common::CTools::stableExp(-prediction(0)));
} else {
double probability{common::CTools::logisticFunction(prediction(0))};
writer(0, weight * (newExample ? 1.0 : 1.0 + m_Mu) * probability * (1.0 - probability));
}
}
bool CBinomialLogisticLossIncremental::isCurvatureConstant() const {
return false;
}
double CBinomialLogisticLossIncremental::difference(const TMemoryMappedFloatVector& prediction,
const TMemoryMappedFloatVector& previousPrediction,
double weight) const {
return this->CBinomialLogisticLoss::difference(prediction, previousPrediction, weight);
}
CBinomialLogisticLossIncremental::TDoubleVector
CBinomialLogisticLossIncremental::transform(const TMemoryMappedFloatVector& prediction) const {
return this->CBinomialLogisticLoss::transform(prediction);
}
CArgMinLoss CBinomialLogisticLossIncremental::minimizer(double lambda,
const common::CPRNG::CXorOShiro128Plus&) const {
return this->makeMinimizer(
CArgMinBinomialLogisticLossIncrementalImpl{lambda, m_Eta, m_Mu, *m_Tree});
}
const std::string& CBinomialLogisticLossIncremental::name() const {
return NAME;
}
bool CBinomialLogisticLossIncremental::isRegression() const {
return false;
}
bool CBinomialLogisticLossIncremental::acceptRestoreTraverser(core::CStateRestoreTraverser&) {
return false;
}
const std::string CBinomialLogisticLossIncremental::NAME{"binomial_logistic_incremental"};
CMultinomialLogisticLoss::CMultinomialLogisticLoss(std::size_t numberClasses)
: m_NumberClasses{numberClasses} {
}
CMultinomialLogisticLoss::CMultinomialLogisticLoss(core::CStateRestoreTraverser& traverser) {
if (traverser.traverseSubLevel([this](auto& traverser_) {
return this->acceptRestoreTraverser(traverser_);
}) == false) {
throw std::runtime_error{"failed to restore CMultinomialLogisticLoss"};
}
}
CMultinomialLogisticLoss::TLossUPtr CMultinomialLogisticLoss::clone() const {
return std::make_unique<CMultinomialLogisticLoss>(m_NumberClasses);
}
CMultinomialLogisticLoss::TLossUPtr
CMultinomialLogisticLoss::incremental(double, double, const TNodeVec&) const {
return nullptr;
}
CMultinomialLogisticLoss::TLossUPtr
CMultinomialLogisticLoss::project(std::size_t numberThreads,
core::CDataFrame& frame,
const core::CPackedBitVector& rowMask,
std::size_t targetColumn,
const TSizeVec& extraColumns,
common::CPRNG::CXorOShiro128Plus rng) const {
if (m_NumberClasses <= MAX_GRADIENT_DIMENSION) {
return this->clone();
}
// Compute total loss over masked rows.
auto result =
frame
.readRows(
numberThreads, 0, frame.numberRows(),
core::bindRetrievableState(
[&](TDoubleVec& losses, const TRowItr& beginRows, const TRowItr& endRows) {
for (auto row = beginRows; row != endRows; ++row) {
auto prediction = readPrediction(*row, extraColumns, m_NumberClasses);
double actual{readActual(*row, targetColumn)};
double weight{readExampleWeight(*row, extraColumns)};
losses[static_cast<int>(actual)] +=
this->value(prediction, actual, weight);
}
},
TDoubleVec(m_NumberClasses, 0.0)),
&rowMask)
.first;
auto losses = std::move(result[0].s_FunctionState);
for (std::size_t i = 1; i < result.size(); ++i) {
for (std::size_t j = 0; j < losses.size(); ++j) {
losses[j] += result[i].s_FunctionState[j];
}
}
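// Keep MAX_GRADIENT_DIMENSION - 1 classes sampled with probability proportional to
// their total loss; the remaining classes are collapsed into a single aggregate
// output by CMultinomialLogisticSubsetLoss.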
TSizeVec classes;
common::CSampling::categoricalSampleWithoutReplacement(
rng, losses, MAX_GRADIENT_DIMENSION - 1, classes);
std::sort(classes.begin(), classes.end());
return std::make_unique<CMultinomialLogisticSubsetLoss>(m_NumberClasses, classes);
}
ELossType CMultinomialLogisticLoss::type() const {
return E_MulticlassClassification;
}
std::size_t CMultinomialLogisticLoss::dimensionPrediction() const {
return m_NumberClasses;
}
std::size_t CMultinomialLogisticLoss::dimensionGradient() const {
return std::min(m_NumberClasses, MAX_GRADIENT_DIMENSION);
}
double CMultinomialLogisticLoss::value(const TMemoryMappedFloatVector& prediction,
double actual,
double weight) const {
double zmax{prediction.maxCoeff()};
double logZ{0.0};
for (int i = 0; i < prediction.size(); ++i) {
logZ += std::exp(prediction(i) - zmax);
}
logZ = zmax + common::CTools::stableLog(logZ);
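// zmax is subtracted before exponentiating for numerical stability (the log-sum-exp
// trick): log sum_i{ e^(z_i) } = zmax + log sum_i{ e^(z_i - zmax) }.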
// i.e. -log(softmax(prediction)(actual)).
return weight * (logZ - prediction(static_cast<int>(actual)));
}
void CMultinomialLogisticLoss::gradient(const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
// We prefer an implementation which avoids any memory allocations.
double zmax{prediction.maxCoeff()};
double pEps{0.0};
double logZ{0.0};
for (int i = 0; i < prediction.size(); ++i) {
if (prediction(i) - zmax < LOG_EPSILON) {
// Sum the contributions from classes whose predicted probability
// is less than epsilon, for which we'd lose nearly all precision
// when adding to the normalisation coefficient.
pEps += std::exp(prediction(i) - zmax);
} else {
logZ += std::exp(prediction(i) - zmax);
}
}
pEps = common::CTools::stable(pEps / logZ);
logZ = zmax + common::CTools::stableLog(logZ);
for (int i = 0; i < prediction.size(); ++i) {
double pi{common::CTools::stableExp(prediction(i) - logZ)};
if (i == static_cast<int>(actual)) {
// We have that p = 1 / (1 + eps) and the gradient is p - 1.
// Use a Taylor expansion and drop terms of O(eps^2) to get:
writer(i, weight * (pi == 1.0 ? -pEps : pi - 1.0));
} else {
writer(i, weight * pi);
}
}
}
void CMultinomialLogisticLoss::curvature(const TMemoryMappedFloatVector& prediction,
double /*actual*/,
const TWriter& writer,
double weight) const {
// Return the lower triangle of the Hessian column major.
// We prefer an implementation which avoids any memory allocations.
double zmax{prediction.maxCoeff()};
double pEps{0.0};
double logZ{0.0};
for (int i = 0; i < prediction.size(); ++i) {
double pAdj{std::exp(prediction(i) - zmax)};
if (prediction(i) - zmax < LOG_EPSILON) {
// Sum the contributions from classes whose predicted probability
// is less than epsilon, for which we'd lose nearly all precision
// when adding to the normalisation coefficient.
pEps += pAdj;
} else {
logZ += pAdj;
}
}
pEps = common::CTools::stable(pEps / logZ);
logZ = zmax + common::CTools::stableLog(logZ);
std::size_t k{0};
for (int i = 0; i < prediction.size(); ++i) {
double pi{common::CTools::stableExp(prediction(i) - logZ)};
// We have that p = 1 / (1 + eps) and the curvature is p (1 - p).
// Use a Taylor expansion and drop terms of O(eps^2) to get:
writer(k++, weight * (pi == 1.0 ? pEps : pi * (1.0 - pi)));
for (int j = i + 1; j < prediction.size(); ++j) {
double pij{common::CTools::stableExp(prediction(i) + prediction(j) - 2.0 * logZ)};
writer(k++, -weight * pij);
}
}
LOG_TRACE(<< "Wrote " << k << " curvatures");
}
bool CMultinomialLogisticLoss::isCurvatureConstant() const {
return false;
}
double CMultinomialLogisticLoss::difference(const TMemoryMappedFloatVector& prediction,
const TMemoryMappedFloatVector& previousPrediction,
double weight) const {
// The cross entropy of the new predicted probabilities given the previous ones.
double zmax{prediction.maxCoeff()};
double logZ{0.0};
for (int i = 0; i < prediction.size(); ++i) {
logZ += std::exp(prediction(i) - zmax);
}
logZ = zmax + common::CTools::stableLog(logZ);
double result{0};
auto previousProbabilities = this->transform(previousPrediction);
for (int i = 0; i < prediction.size(); ++i) {
result += previousProbabilities(i) * (logZ - prediction(i));
}
return weight * result;
}
CMultinomialLogisticLoss::TDoubleVector
CMultinomialLogisticLoss::transform(const TMemoryMappedFloatVector& prediction) const {
TDoubleVector result{prediction};
common::CTools::inplaceSoftmax(result);
return result;
}
CArgMinLoss CMultinomialLogisticLoss::minimizer(double lambda,
const common::CPRNG::CXorOShiro128Plus& rng) const {
return this->makeMinimizer(
CArgMinMultinomialLogisticLossImpl{m_NumberClasses, lambda, rng});
}
const std::string& CMultinomialLogisticLoss::name() const {
return NAME;
}
bool CMultinomialLogisticLoss::isRegression() const {
return false;
}
void CMultinomialLogisticLoss::acceptPersistInserter(core::CStatePersistInserter& inserter) const {
core::CPersistUtils::persist(NUMBER_CLASSES_TAG, m_NumberClasses, inserter);
}
bool CMultinomialLogisticLoss::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) {
do {
const std::string& name = traverser.name();
RESTORE(NUMBER_CLASSES_TAG,
core::CPersistUtils::restore(NUMBER_CLASSES_TAG, m_NumberClasses, traverser))
} while (traverser.next());
return true;
}
const std::string CMultinomialLogisticLoss::NAME{"multinomial_logistic"};
CMultinomialLogisticSubsetLoss::CMultinomialLogisticSubsetLoss(std::size_t numberClasses,
const TSizeVec& classes)
: CMultinomialLogisticLoss{numberClasses} {
m_InClasses.reserve(classes.size());
for (auto i : classes) {
if (i >= numberClasses) {
HANDLE_FATAL(<< "Invalid class " << i << " out-of-range [0, "
<< numberClasses << ").");
}
m_InClasses.push_back(static_cast<int>(i));
}
m_OutClasses.resize(numberClasses);
std::iota(m_OutClasses.begin(), m_OutClasses.end(), 0);
m_OutClasses.erase(std::set_difference(m_OutClasses.begin(), m_OutClasses.end(),
m_InClasses.begin(), m_InClasses.end(),
m_OutClasses.begin()),
m_OutClasses.end());
LOG_TRACE(<< "in = " << m_InClasses << ", out = " << m_OutClasses);
}
CMultinomialLogisticSubsetLoss::TLossUPtr CMultinomialLogisticSubsetLoss::clone() const {
return std::make_unique<CMultinomialLogisticSubsetLoss>(*this);
}
CMultinomialLogisticSubsetLoss::TLossUPtr
CMultinomialLogisticSubsetLoss::incremental(double, double, const TNodeVec&) const {
return nullptr;
}
CMultinomialLogisticSubsetLoss::TLossUPtr
CMultinomialLogisticSubsetLoss::project(std::size_t /*numberThreads*/,
core::CDataFrame& /*frame*/,
const core::CPackedBitVector& /*rowMask*/,
std::size_t /*targetColumn*/,
const TSizeVec& /*extraColumns*/,
common::CPRNG::CXorOShiro128Plus /*rng*/) const {
return this->clone();
}
void CMultinomialLogisticSubsetLoss::gradient(const TMemoryMappedFloatVector& prediction,
double actual,
const TWriter& writer,
double weight) const {
// We prefer an implementation which avoids any memory allocations.
int actual_{static_cast<int>(actual)};
double pEps{0.0};
double logZ{0.0};
bool isActualIn{false};
double zmax{prediction.maxCoeff()};
for (auto i : m_InClasses) {
double pAdj{std::exp(prediction(i) - zmax)};
// Sum the contributions from classes whose predicted probability
// is less than epsilon, for which we'd lose nearly all precision
// when adding to the normalisation coefficient.
(prediction(i) - zmax < LOG_EPSILON ? pEps : logZ) += pAdj;
isActualIn |= (actual_ == i);
}
double pAgg{std::accumulate(
m_OutClasses.begin(), m_OutClasses.end(), 0.0,
[&](auto p, auto i) { return p + std::exp(prediction(i) - zmax); })};
(pAgg < EPSILON * logZ ? pEps : logZ) += pAgg;
pEps = common::CTools::stable(pEps / logZ);
logZ = zmax + std::log(logZ);
pAgg = common::CTools::stable(pAgg * std::exp(zmax - logZ) /
static_cast<double>(m_OutClasses.size()));
LOG_TRACE(<< "p(agg) = " << pAgg);
for (std::size_t i = 0; i <= m_InClasses.size(); ++i) {
bool isIn{i < m_InClasses.size()};
double pi{isIn ? common::CTools::stableExp(prediction(m_InClasses[i]) - logZ) : pAgg};
bool isActual{isIn ? (m_InClasses[i] == actual_) : (isActualIn == false)};
if (isActual) {
// We have that p = 1 / (1 + eps) and the gradient is p - 1.
// Use a Taylor expansion and drop terms of O(eps^2) to get:
writer(i, weight * (pi == 1.0 ? -pEps : pi - 1.0));
} else {
writer(i, weight * pi);
}
}
}
void CMultinomialLogisticSubsetLoss::curvature(const TMemoryMappedFloatVector& prediction,
double /*actual*/,
const TWriter& writer,
double weight) const {
// Return the lower triangle of the Hessian column major.
// We prefer an implementation which avoids any memory allocations.
double pEps{0.0};
double logZ{0.0};
double zmax{prediction.maxCoeff()};
for (auto i : m_InClasses) {
double pAdj{std::exp(prediction(i) - zmax)};
if (prediction(i) - zmax < LOG_EPSILON) {
// Sum the contributions from classes whose predicted probability
// is less than epsilon, for which we'd lose nearly all precision
// when adding to the normalisation coefficient.
pEps += pAdj;
} else {
logZ += pAdj;
}
}
double pAgg{std::accumulate(
m_OutClasses.begin(), m_OutClasses.end(), 0.0,
[&](auto p, auto i) { return p + std::exp(prediction(i) - zmax); })};
(pAgg < EPSILON * logZ ? pEps : logZ) += pAgg;
pEps = common::CTools::stable(pEps / logZ);
logZ = zmax + common::CTools::stableLog(logZ);
pAgg = common::CTools::stable(pAgg * std::exp(zmax - logZ) /
static_cast<double>(m_OutClasses.size()));
LOG_TRACE(<< "p(agg) = " << pAgg);
auto probability = [&](std::size_t i) {
return i < m_InClasses.size()
? common::CTools::stableExp(prediction(m_InClasses[i]) - logZ)
: pAgg;
};
std::size_t k{0};
for (std::size_t i = 0; i <= m_InClasses.size(); ++i) {
double pi{probability(i)};
// We have that p = 1 / (1 + eps) and the curvature is p (1 - p).
// Use a Taylor expansion and drop terms of O(eps^2) to get:
writer(k++, weight * (pi == 1.0 ? pEps : pi * (1.0 - pi)));
for (std::size_t j = i + 1; j <= m_InClasses.size(); ++j) {
double pij{pi * probability(j)};
writer(k++, -weight * pij);
}
}
LOG_TRACE(<< "Wrote " << k << " curvatures");
}
}
}
}
}