// lib/maths/analytics/unittest/CBoostedTreeLossTest.cc

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #include <core/CContainerPrinter.h> #include <core/CDataFrame.h> #include <maths/analytics/CBoostedTree.h> #include <maths/analytics/CBoostedTreeFactory.h> #include <maths/analytics/CBoostedTreeImpl.h> #include <maths/analytics/CBoostedTreeLoss.h> #include <maths/analytics/CBoostedTreeUtils.h> #include <maths/common/CBasicStatistics.h> #include <maths/common/CPRNG.h> #include <maths/common/CSolvers.h> #include <maths/common/CTools.h> #include <maths/common/CToolsDetail.h> #include <test/BoostTestCloseAbsolute.h> #include <test/CRandomNumbers.h> #include "BoostedTreeTestData.h" #include <boost/test/unit_test.hpp> #include <algorithm> #include <cmath> #include <cstddef> #include <limits> #include <numeric> #include <utility> #include <vector> BOOST_AUTO_TEST_SUITE(CBoostedTreeLossTest) using namespace ml; using TDoubleVec = std::vector<double>; using TDoubleVecVec = std::vector<TDoubleVec>; using TDoubleVector = maths::analytics::boosted_tree::CLoss::TDoubleVector; using TDoubleVectorVec = std::vector<TDoubleVector>; using TFloatVec = std::vector<maths::common::CFloatStorage>; using TRowRef = core::CDataFrame::TRowRef; using TRowItr = core::CDataFrame::TRowItr; using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator; using TArgMinLossVec = std::vector<maths::analytics::boosted_tree::CArgMinLoss>; using TMemoryMappedFloatVector = maths::analytics::boosted_tree::CLoss::TMemoryMappedFloatVector; using maths::analytics::boosted_tree::CBinomialLogisticLoss; using maths::analytics::boosted_tree::CBinomialLogisticLossIncremental; using maths::analytics::boosted_tree::CMse; using maths::analytics::boosted_tree::CMseIncremental; using maths::analytics::boosted_tree::CMultinomialLogisticLoss; using maths::analytics::boosted_tree::CMultinomialLogisticSubsetLoss; using maths::analytics::boosted_tree_detail::CArgMinBinomialLogisticLossImpl; using maths::analytics::boosted_tree_detail::CArgMinMsleImpl; using maths::analytics::boosted_tree_detail::CArgMinMultinomialLogisticLossImpl; using maths::analytics::boosted_tree_detail::CArgMinPseudoHuberImpl; using maths::analytics::boosted_tree_detail::readActual; using maths::analytics::boosted_tree_detail::readExampleWeight; using maths::analytics::boosted_tree_detail::readPrediction; using maths::analytics::boosted_tree_detail::root; namespace { void minimizeGridSearch(std::function<double(const TDoubleVector&)> objective, double scale, int d, TDoubleVector& x, double& min, TDoubleVector& argmin) { if (d == x.size()) { double value{objective(x)}; if (value < min) { min = value; argmin = x; } return; } for (double xd : {-0.5, -0.4, -0.3, -0.2, -0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5}) { x(d) = scale * xd; minimizeGridSearch(objective, scale, d + 1, x, min, argmin); } } } BOOST_AUTO_TEST_CASE(testBinomialLogisticArgminEdgeCases) { // All predictions equal and zero. 
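// With all-zero predictions and two examples of each class the empirical class
// probability is 0.5, so the additive correction to the log-odds,
// log(p / (1 - p)) minus the current prediction, should be exactly
// log(0.5 / 0.5) - 0 = 0, which is what the first block below checks.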
{ CArgMinBinomialLogisticLossImpl argmin{0.0}; maths::common::CFloatStorage storage[]{0.0}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, 0.0); argmin.add(prediction, 1.0); argmin.add(prediction, 1.0); argmin.add(prediction, 0.0); BOOST_REQUIRE_EQUAL(false, argmin.nextPass()); BOOST_REQUIRE_EQUAL(0.0, argmin.value()[0]); } // All predictions are equal and 0.5. { test::CRandomNumbers rng; TDoubleVec labels; rng.generateUniformSamples(0.0, 1.0, 1000, labels); for (auto& label : labels) { label = std::floor(label + 0.3); } CArgMinBinomialLogisticLossImpl argmin{0.0}; std::size_t numberPasses{0}; std::size_t counts[]{0, 0}; do { ++numberPasses; for (std::size_t i = 0; i < labels.size(); ++i) { maths::common::CFloatStorage storage[]{0.5}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, labels[i]); ++counts[static_cast<std::size_t>(labels[i])]; } } while (argmin.nextPass()); double p{static_cast<double>(counts[1]) / 1000.0}; double expected{std::log(p / (1.0 - p)) - 0.5}; double actual{argmin.value()[0]}; BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); } // Test underflow of probabilities. { CArgMinBinomialLogisticLossImpl argmin{0.0}; TDoubleVec predictions{-500.0, -30.0, -15.0, -400.0}; TDoubleVec labels{1.0, 1.0, 0.0, 1.0}; do { for (std::size_t i = 0; i < predictions.size(); ++i) { maths::common::CFloatStorage storage[]{predictions[i]}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, labels[i]); } } while (argmin.nextPass()); double minimizer{argmin.value()[0]}; // Check we're at the minimum. CBinomialLogisticLoss loss; TDoubleVec losses; for (double eps : {-10.0, 0.0, 10.0}) { double lossAtEps{0.0}; for (std::size_t i = 0; i < predictions.size(); ++i) { maths::common::CFloatStorage storage[]{predictions[i] + minimizer + eps}; TMemoryMappedFloatVector probe{storage, 1}; lossAtEps += loss.value(probe, labels[i]); } losses.push_back(lossAtEps); } BOOST_TEST_REQUIRE(losses[0] >= losses[1]); BOOST_TEST_REQUIRE(losses[2] >= losses[1]); } } BOOST_AUTO_TEST_CASE(testBinomialLogisticArgminRandom) { // Test that we a good approximation of the additive term for the log-odds // which minimises the exact cross entropy objective. test::CRandomNumbers rng; TDoubleVec labels; TDoubleVec predictions; for (auto lambda : {0.0, 10.0}) { LOG_DEBUG(<< "lambda = " << lambda); // The exact objective. auto exactObjective = [&](double weight) { double loss{0.0}; for (std::size_t i = 0; i < labels.size(); ++i) { double p{maths::common::CTools::logisticFunction(predictions[i] + weight)}; loss -= (1.0 - labels[i]) * maths::common::CTools::fastLog(1.0 - p) + labels[i] * maths::common::CTools::fastLog(p); } return loss + lambda * maths::common::CTools::pow2(weight); }; // This loop is fuzzing the predicted log-odds and testing we get consistently // good estimates of the true minimizer. 
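// For reference, the exact penalised objective used as the ground truth is
//
//   L(w) = -sum_i [ (1 - y_i) log(1 - sigma(f_i + w)) + y_i log(sigma(f_i + w)) ]
//          + lambda * w^2,
//
// minimised over the single additive shift w by a bracketed 1-D line search
// (maths::common::CSolvers::minimize). The loop also feeds the same data to two
// minimizers covering disjoint halves of the examples and merges them, checking
// that the merged result matches the single minimizer exactly.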
for (std::size_t t = 0; t < 10; ++t) { double min{std::numeric_limits<double>::max()}; double max{-min}; rng.generateUniformSamples(0.0, 1.0, 1000, labels); for (auto& label : labels) { label = std::floor(label + 0.5); } predictions.clear(); for (const auto& label : labels) { TDoubleVec weight; rng.generateNormalSamples(label, 2.0, 1, weight); predictions.push_back(weight[0]); min = std::min(min, weight[0]); max = std::max(max, weight[0]); } double expected; double objectiveAtExpected; std::size_t maxIterations{20}; maths::common::CSolvers::minimize( -max, -min, exactObjective(-max), exactObjective(-min), exactObjective, 1e-3, maxIterations, expected, objectiveAtExpected); LOG_DEBUG(<< "expected = " << expected << " objective(expected) = " << objectiveAtExpected); CArgMinBinomialLogisticLossImpl argminPartition[2]{ CArgMinBinomialLogisticLossImpl{lambda}, CArgMinBinomialLogisticLossImpl{lambda}}; CArgMinBinomialLogisticLossImpl argmin{lambda}; auto nextPass = [&] { bool done{argmin.nextPass() == false}; done &= (argminPartition[0].nextPass() == false); done &= (argminPartition[1].nextPass() == false); return done == false; }; do { for (std::size_t i = 0; i < labels.size() / 2; ++i) { maths::common::CFloatStorage storage[]{predictions[i]}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, labels[i]); argminPartition[0].add(prediction, labels[i]); } for (std::size_t i = labels.size() / 2; i < labels.size(); ++i) { maths::common::CFloatStorage storage[]{predictions[i]}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, labels[i]); argminPartition[1].add(prediction, labels[i]); } argminPartition[0].merge(argminPartition[1]); argminPartition[1] = argminPartition[0]; } while (nextPass()); double actual{argmin.value()(0)}; double actualPartition{argminPartition[0].value()(0)}; double objectiveAtActual{exactObjective(actual)}; LOG_DEBUG(<< "actual = " << actual << " objective(actual) = " << objectiveAtActual); // We should be within 1% for the value and 0.001% for the objective // at the value. BOOST_REQUIRE_EQUAL(actual, actualPartition); BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.01 * std::fabs(expected)); BOOST_REQUIRE_CLOSE_ABSOLUTE(objectiveAtExpected, objectiveAtActual, 1e-5 * objectiveAtExpected); } } } BOOST_AUTO_TEST_CASE(testBinomialLogisticLossGradientAndCurvature) { // Test that the loss gradient and curvature functions are close to the // numerical derivatives of the objective. 
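// For reference (standard results, not taken from the implementation): writing
// p = sigma(f) for the predicted probability of class one, the cross entropy
// -(1 - y) log(1 - p) - y log(p) has
//
//   d/df     [loss] = p - y
//   d^2/df^2 [loss] = p (1 - p),
//
// and the test below checks the reported gradient and curvature against central
// differences (f(x + eps) - f(x - eps)) / (2 eps) of the value function with
// eps = 1e-3.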
test::CRandomNumbers rng; double eps{1e-3}; auto derivative = [eps](const std::function<double(TDoubleVector)>& f, TDoubleVector prediction, std::size_t i) { prediction(i) += eps; double result{f(prediction)}; prediction(i) -= 2.0 * eps; result -= f(prediction); result /= 2.0 * eps; return result; }; for (std::size_t t = 0; t < 100; ++t) { CBinomialLogisticLoss loss; TDoubleVec predictions{0.0}; if (t > 0) { rng.generateUniformSamples(-1.0, 1.0, 1, predictions); } TDoubleVec actual; rng.generateUniformSamples(0.0, 2.0, 1, actual); actual[0] = std::floor(actual[0]); double expectedGradient; double expectedCurvature; { auto gradient = [&](TDoubleVector prediction) { return derivative( [&](TDoubleVector prediction_) { maths::common::CFloatStorage storage[]{prediction_(0)}; return loss.value(TMemoryMappedFloatVector{storage, 1}, actual[0]); }, prediction, 0); }; TDoubleVector prediction{1}; prediction(0) = predictions[0]; expectedGradient = gradient(prediction); expectedCurvature = derivative(gradient, prediction, 0); } double actualGradient; double actualCurvature; { maths::common::CFloatStorage storage[]{predictions[0]}; TMemoryMappedFloatVector prediction{storage, 1}; loss.gradient(prediction, actual[0], [&](std::size_t, double gradient) { actualGradient = gradient; }); loss.curvature(prediction, actual[0], [&](std::size_t, double curvature) { actualCurvature = curvature; }); } if (t % 10 == 0) { LOG_DEBUG(<< "actual gradient = " << actualGradient); LOG_DEBUG(<< "expected gradient = " << expectedGradient); LOG_DEBUG(<< "actual curvature = " << actualCurvature); LOG_DEBUG(<< "expected curvature = " << expectedCurvature); } BOOST_REQUIRE_SMALL(std::fabs(expectedGradient - actualGradient), 0.001); BOOST_REQUIRE_SMALL(std::fabs(expectedCurvature - actualCurvature), 0.02); } } BOOST_AUTO_TEST_CASE(testBinomialLogisticLossForUnderflow) { // Test the behaviour of value, gradient and curvature of the logistic loss in // the vicinity the point at which we switch to using Taylor expansion of the // logistic function is as expected. double eps{100.0 * std::numeric_limits<double>::epsilon()}; CBinomialLogisticLoss loss; // Losses should be very nearly linear function of log-odds when they're large. { maths::common::CFloatStorage storage[]{1.0 - std::log(eps), 1.0 + std::log(eps)}; TMemoryMappedFloatVector predictions[]{{&storage[0], 1}, {&storage[1], 1}}; TDoubleVec previousLoss{loss.value(predictions[0], 0.0), loss.value(predictions[1], 1.0)}; for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { storage[0] = scale - std::log(eps); storage[1] = scale + std::log(eps); TDoubleVec currentLoss{loss.value(predictions[0], 0.0), loss.value(predictions[1], 1.0)}; BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, previousLoss[0] - currentLoss[0], 0.005); BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, previousLoss[1] - currentLoss[1], 0.005); previousLoss = currentLoss; } } // The gradient and curvature should be proportional to the exponential of the // log-odds when they're small. 
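// This follows from the asymptotics of the logistic function: sigma(f) ~ exp(f)
// as f -> -inf and 1 - sigma(f) ~ exp(-f) as f -> +inf, so in those regimes the
// gradient and curvature both scale like the exponential of the (signed)
// log-odds. Shifting the prediction by 0.25 should therefore rescale the
// derivatives by a factor of exp(+/-0.25), which is what the loop below checks.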
{ auto readDerivatives = [&](double prediction, TDoubleVec& gradients, TDoubleVec& curvatures) { maths::common::CFloatStorage storage[]{prediction + std::log(eps), prediction - std::log(eps)}; TMemoryMappedFloatVector predictions[]{{&storage[0], 1}, {&storage[1], 1}}; loss.gradient(predictions[0], 0.0, [&](std::size_t, double value) { gradients[0] = value; }); loss.gradient(predictions[1], 1.0, [&](std::size_t, double value) { gradients[1] = value; }); loss.curvature(predictions[0], 0.0, [&](std::size_t, double value) { curvatures[0] = value; }); loss.curvature(predictions[1], 1.0, [&](std::size_t, double value) { curvatures[1] = value; }); }; TDoubleVec previousGradient(2); TDoubleVec previousCurvature(2); readDerivatives(1.0, previousGradient, previousCurvature); for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { TDoubleVec currentGradient(2); TDoubleVec currentCurvature(2); readDerivatives(scale, currentGradient, currentCurvature); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(0.25), previousGradient[0] / currentGradient[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(-0.25), previousGradient[1] / currentGradient[1], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(0.25), previousCurvature[0] / currentCurvature[0], 0.01); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(-0.25), previousCurvature[1] / currentCurvature[1], 0.01); previousGradient = currentGradient; previousCurvature = currentCurvature; } } } BOOST_AUTO_TEST_CASE(testMultinomialLogisticArgminObjectiveFunction) { // Test that the gradient function is close to the numerical derivative // of the objective. maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; for (std::size_t t = 0; t < 10; ++t) { CArgMinMultinomialLogisticLossImpl argmin{3, 0.1 * static_cast<double>(t + 1), rng}; TDoubleVec labels; testRng.generateUniformSamples(0.0, 3.0, 20, labels); TDoubleVec predictions; if (t == 0) { predictions.resize(3 * labels.size(), 0.0); } else { testRng.generateUniformSamples(-1.0, 1.0, 3 * labels.size(), predictions); } do { for (std::size_t i = 0; i < labels.size(); ++i) { maths::common::CFloatStorage storage[]{predictions[3 * i + 0], predictions[3 * i + 1], predictions[3 * i + 2]}; TMemoryMappedFloatVector prediction{storage, 3}; argmin.add(prediction, std::floor(labels[i])); } } while (argmin.nextPass()); auto objective = argmin.objective(); auto objectiveGradient = argmin.objectiveGradient(); double eps{1e-3}; TDoubleVec probes; testRng.generateUniformSamples(-1.0, 1.0, 30, probes); for (std::size_t i = 0; i < probes.size(); i += 3) { TDoubleVector probe{3}; probe(0) = probes[i]; probe(1) = probes[i + 1]; probe(2) = probes[i + 2]; TDoubleVector expectedGradient{3}; for (std::size_t j = 0; j < 3; ++j) { TDoubleVector shift{TDoubleVector::Zero(3)}; shift(j) = eps; expectedGradient(j) = (objective(probe + shift) - objective(probe - shift)) / (2.0 * eps); } TDoubleVector actualGradient{objectiveGradient(probe)}; BOOST_REQUIRE_SMALL((expectedGradient - actualGradient).norm(), eps); } } } BOOST_AUTO_TEST_CASE(testMultinomialLogisticArgminEdgeCases) { maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; // All predictions equal and zero. 
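// With identical predictions the best additive correction should reproduce the
// empirical class frequencies after a softmax. The labels added below are
// {0, 1, 1, 0, 2, 1}, so the expected probabilities are 2/6, 3/6 and 1/6.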
{ CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; maths::common::CFloatStorage storage[]{0.0, 0.0, 0.0}; TMemoryMappedFloatVector prediction{storage, 3}; argmin.add(prediction, 0.0); argmin.add(prediction, 1.0); argmin.add(prediction, 1.0); argmin.add(prediction, 0.0); argmin.add(prediction, 2.0); argmin.add(prediction, 1.0); BOOST_REQUIRE_EQUAL(false, argmin.nextPass()); TDoubleVector expectedProbabilities{3}; expectedProbabilities(0) = 2.0 / 6.0; expectedProbabilities(1) = 3.0 / 6.0; expectedProbabilities(2) = 1.0 / 6.0; TDoubleVector actualProbabilities{argmin.value()}; maths::common::CTools::inplaceSoftmax(actualProbabilities); BOOST_REQUIRE_SMALL((actualProbabilities - expectedProbabilities).norm(), 1e-3); } // All predictions are equal and 0.5. for (std::size_t t = 0; t < 10; ++t) { TDoubleVec labels; testRng.generateUniformSamples(0.0, 2.0, 20, labels); for (auto& label : labels) { label = std::floor(label + 0.3); } CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; std::size_t numberPasses{0}; std::size_t counts[]{0, 0, 0}; maths::common::CFloatStorage storage[]{0.5, 0.5, 0.5}; TMemoryMappedFloatVector prediction{storage, 3}; do { ++numberPasses; for (const auto& label : labels) { argmin.add(prediction, label); ++counts[static_cast<std::size_t>(label)]; } } while (argmin.nextPass()); BOOST_REQUIRE_EQUAL(std::size_t{1}, numberPasses); TDoubleVector expectedProbabilities{3}; expectedProbabilities(0) = static_cast<double>(counts[0]) / 20.0; expectedProbabilities(1) = static_cast<double>(counts[1]) / 20.0; expectedProbabilities(2) = static_cast<double>(counts[2]) / 20.0; TDoubleVector actualProbabilities{prediction + argmin.value()}; maths::common::CTools::inplaceSoftmax(actualProbabilities); BOOST_REQUIRE_SMALL((actualProbabilities - expectedProbabilities).norm(), 0.001); } // Test underflow of probabilities. LOG_DEBUG(<< "Test underflow"); { CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; TDoubleVecVec predictions{{-230.0, -200.0, -200.0}, {-30.0, -10.0, -20.0}, {-15.0, -50.0, -30.0}, {-400.0, -350.0, -300.0}}; TDoubleVec labels{1.0, 1.0, 0.0, 2.0}; do { for (std::size_t i = 0; i < predictions.size(); ++i) { maths::common::CFloatStorage storage[]{ predictions[i][0], predictions[i][1], predictions[i][2]}; TMemoryMappedFloatVector prediction{storage, 3}; argmin.add(prediction, labels[i]); } } while (argmin.nextPass()); TDoubleVector minimizer{argmin.value()}; // Check we're at a minimum. CMultinomialLogisticLoss loss{3}; for (std::size_t i = 0; i < 3; ++i) { TDoubleVec losses; for (double eps : {-30.0, 0.0, 30.0}) { double lossAtEps{0.0}; for (std::size_t j = 0; j < predictions.size(); ++j) { maths::common::CFloatStorage storage[]{ predictions[j][0] + minimizer(0), predictions[j][1] + minimizer(1), predictions[j][2] + minimizer(2)}; storage[i] += eps; TMemoryMappedFloatVector probe{storage, 3}; lossAtEps += loss.value(probe, labels[j]); } losses.push_back(lossAtEps); } BOOST_TEST_REQUIRE(losses[0] >= losses[1]); BOOST_TEST_REQUIRE(losses[2] >= losses[1]); } } // All labels equal. 
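// If every example has the same label the optimum pushes that class's softmax
// probability towards one, so the total loss at the minimizer should be close
// to zero. (Only the softmax of the corrected predictions is identified; the
// raw weights are determined only up to an additive constant.)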
{ CArgMinMultinomialLogisticLossImpl argmin{3, 0.0, rng}; maths::common::CFloatStorage storage[]{0.0, 0.0, 0.0}; TMemoryMappedFloatVector prediction{storage, 3}; TDoubleVec labels{1.0, 1.0, 1.0, 1.0}; do { for (const auto& label : labels) { argmin.add(prediction, label); } } while (argmin.nextPass()); TDoubleVector minimizer{argmin.value()}; double totalLoss{0.0}; CMultinomialLogisticLoss loss{3}; for (const auto& label : labels) { maths::common::CFloatStorage probeStorage[]{ prediction(0) + minimizer(0), prediction(1) + minimizer(1), prediction(2) + minimizer(2)}; TMemoryMappedFloatVector probe{probeStorage, 3}; totalLoss += loss.value(probe, label); } BOOST_REQUIRE_SMALL(totalLoss, 1e-3); } } BOOST_AUTO_TEST_CASE(testMultinomialLogisticArgminRandom) { // Test that we have a good approximation of the additive term for the log-odds // which minimises the exact cross entropy objective. maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; TDoubleVec labels; TDoubleVectorVec predictions; TDoubleVec scales{6.0, 1.0}; TDoubleVec lambdas{0.0, 10.0}; for (auto i : {0, 1}) { double lambda{lambdas[i]}; LOG_DEBUG(<< "lambda = " << lambda); // The exact objective. auto exactObjective = [&](const TDoubleVector& weight) { double loss{0.0}; for (std::size_t j = 0; j < labels.size(); ++j) { TDoubleVector probabilities{predictions[j] + weight}; maths::common::CTools::inplaceSoftmax(probabilities); loss -= maths::common::CTools::fastLog( probabilities(static_cast<int>(labels[j]))); } return loss + lambda * weight.squaredNorm(); }; // This loop is fuzzing the predicted log-odds and testing we get consistently // good estimates of the true minimizer. double sumObjectiveGridSearch{0.0}; double sumObjectiveAtActual{0.0}; for (std::size_t t = 0; t < 10; ++t) { testRng.generateUniformSamples(0.0, 2.0, 500, labels); for (auto& label : labels) { label = std::floor(label + 0.5); } predictions.clear(); for (const auto& label : labels) { TDoubleVec prediction; testRng.generateNormalSamples(0.0, 2.0, 3, prediction); prediction[static_cast<std::size_t>(label)] += 1.0; predictions.push_back(TDoubleVector::fromStdVector(prediction)); } TDoubleVector weight{3}; double objectiveGridSearch{std::numeric_limits<double>::max()}; TDoubleVector argminGridSearch; minimizeGridSearch(exactObjective, scales[i], 0, weight, objectiveGridSearch, argminGridSearch); LOG_DEBUG(<< "argmin grid search = " << argminGridSearch.transpose() << ", min objective grid search = " << objectiveGridSearch); std::size_t numberPasses{0}; maths::common::CFloatStorage storage[]{0.0, 0.0, 0.0}; TMemoryMappedFloatVector prediction{storage, 3}; CArgMinMultinomialLogisticLossImpl argmin{3, lambda, rng}; do { ++numberPasses; for (std::size_t j = 0; j < labels.size(); ++j) { storage[0] = predictions[j](0); storage[1] = predictions[j](1); storage[2] = predictions[j](2); argmin.add(prediction, labels[j]); } } while (argmin.nextPass()); BOOST_REQUIRE_EQUAL(std::size_t{2}, numberPasses); TDoubleVector actual{argmin.value()}; double objectiveAtActual{exactObjective(actual)}; LOG_DEBUG(<< "actual = " << actual.transpose() << ", objective(actual) = " << objectiveAtActual); BOOST_TEST_REQUIRE(objectiveAtActual < 1.01 * objectiveGridSearch); sumObjectiveGridSearch += objectiveGridSearch; sumObjectiveAtActual += objectiveAtActual; } LOG_DEBUG(<< "sum min objective grid search = " << sumObjectiveGridSearch); LOG_DEBUG(<< "sum objective(actual) = " << sumObjectiveAtActual); BOOST_TEST_REQUIRE(sumObjectiveAtActual < 1.01 * sumObjectiveGridSearch); } } 
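// For reference in the next test (standard softmax calculus, not taken from the
// implementation): with probabilities p = softmax(f) and true class y,
//
//   d/df_i       [loss] = p_i - 1{i == y}
//   d^2/df_i df_j [loss] = p_i (1{i == j} - p_j).
//
// The expected curvature below is laid out as the upper triangle of this
// Hessian in row-major order, which for three classes gives 3 * 4 / 2 = 6
// components; the test assumes the loss reports its curvature in the same order.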
BOOST_AUTO_TEST_CASE(testMultinomialLogisticLossGradientAndCurvature) { // Test that the loss gradient and curvature functions are close to the // numerical derivatives of the objective. test::CRandomNumbers rng; double eps{1e-3}; auto derivative = [eps](const std::function<double(TDoubleVector)>& f, TDoubleVector prediction, std::size_t i) { prediction(i) += eps; double result{f(prediction)}; prediction(i) -= 2.0 * eps; result -= f(prediction); result /= 2.0 * eps; return result; }; for (std::size_t t = 0; t < 100; ++t) { CMultinomialLogisticLoss loss{3}; TDoubleVec predictions(3, 0.0); if (t > 0) { rng.generateUniformSamples(-1.0, 1.0, 3, predictions); } TDoubleVec actual; rng.generateUniformSamples(0.0, 3.0, 1, actual); actual[0] = std::floor(actual[0]); TDoubleVector expectedGradient{3}; TDoubleVector expectedCurvature{6}; for (std::size_t i = 0, k = 0; i < 3; ++i) { auto gi = [&](TDoubleVector prediction) { return derivative( [&](TDoubleVector prediction_) { maths::common::CFloatStorage storage[]{ prediction_(0), prediction_(1), prediction_(2)}; return loss.value(TMemoryMappedFloatVector{storage, 3}, actual[0]); }, prediction, i); }; TDoubleVector prediction{3}; prediction(0) = predictions[0]; prediction(1) = predictions[1]; prediction(2) = predictions[2]; expectedGradient(i) = gi(prediction); for (std::size_t j = i; j < 3; ++j, ++k) { expectedCurvature(k) = derivative(gi, prediction, j); } } maths::common::CFloatStorage storage[]{predictions[0], predictions[1], predictions[2]}; TMemoryMappedFloatVector prediction{storage, 3}; TDoubleVector actualGradient{3}; loss.gradient(prediction, actual[0], [&](std::size_t k, double gradient) { actualGradient(k) = gradient; }); TDoubleVector actualCurvature{6}; loss.curvature(prediction, actual[0], [&](std::size_t k, double curvature) { actualCurvature(k) = curvature; }); if (t % 10 == 0) { LOG_DEBUG(<< "actual gradient = " << actualGradient.transpose()); LOG_DEBUG(<< "expected gradient = " << expectedGradient.transpose()); LOG_DEBUG(<< "actual curvature = " << actualCurvature.transpose()); LOG_DEBUG(<< "expected curvature = " << expectedCurvature.transpose()); } BOOST_REQUIRE_SMALL((expectedGradient - actualGradient).norm(), 0.001); BOOST_REQUIRE_SMALL((expectedCurvature - actualCurvature).norm(), 0.02); } } BOOST_AUTO_TEST_CASE(testMultinomialLogisticLossForUnderflow) { // Test the behaviour of value, gradient and Hessian of the logistic loss in // the regime where the probabilities underflow. double eps{100.0 * std::numeric_limits<double>::epsilon()}; auto logits = [](double x, TFloatVec& result) { result.assign({0.0, x}); }; CMultinomialLogisticLoss loss{2}; // Losses should be very nearly linear function of log-odds when they're large. 
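// With two classes and logits (0, x) the loss for class zero is log(1 + exp(x)),
// which is approximately x when x is large and positive; similarly the loss for
// class one is approximately -x when x is large and negative. Stepping the shift
// down by 0.25 should therefore change these losses by roughly -0.25 and +0.25
// respectively, as the loop below verifies.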
{ TFloatVec storage[2]; logits(1.0 - std::log(eps), storage[0]); logits(1.0 + std::log(eps), storage[1]); TMemoryMappedFloatVector predictions[]{{storage[0].data(), 2}, {storage[1].data(), 2}}; TDoubleVec previousLoss{loss.value(predictions[0], 0.0), loss.value(predictions[1], 1.0)}; for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { logits(scale - std::log(eps), storage[0]); logits(scale + std::log(eps), storage[1]); TDoubleVec currentLoss{loss.value(predictions[0], 0.0), loss.value(predictions[1], 1.0)}; BOOST_REQUIRE_CLOSE_ABSOLUTE(0.25, previousLoss[0] - currentLoss[0], 0.005); BOOST_REQUIRE_CLOSE_ABSOLUTE(-0.25, previousLoss[1] - currentLoss[1], 0.005); previousLoss = currentLoss; } } // The gradient and curvature should be proportional to the exponential of the // log-odds when they're small. { auto readDerivatives = [&](double prediction, TDoubleVecVec& gradients, TDoubleVecVec& curvatures) { TFloatVec storage[2]; logits(prediction + std::log(eps), storage[0]); logits(prediction - std::log(eps), storage[1]); TMemoryMappedFloatVector predictions[]{{storage[0].data(), 2}, {storage[1].data(), 2}}; loss.gradient(predictions[0], 0.0, [&](std::size_t i, double value) { gradients[0][i] = value; }); loss.gradient(predictions[1], 1.0, [&](std::size_t i, double value) { gradients[1][i] = value; }); loss.curvature(predictions[0], 0.0, [&](std::size_t i, double value) { curvatures[0][i] = value; }); loss.curvature(predictions[1], 1.0, [&](std::size_t i, double value) { curvatures[1][i] = value; }); }; TDoubleVecVec previousGradient(2, TDoubleVec(2)); TDoubleVecVec previousCurvature(2, TDoubleVec(3)); readDerivatives(1.0, previousGradient, previousCurvature); for (double scale : {0.75, 0.5, 0.25, 0.0, -0.25, -0.5, -0.75, -1.0}) { TDoubleVecVec currentGradient(2, TDoubleVec(2)); TDoubleVecVec currentCurvature(2, TDoubleVec(3)); readDerivatives(scale, currentGradient, currentCurvature); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(0.25), previousGradient[0][1] / currentGradient[0][1], 0.04); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(-0.25), previousGradient[1][1] / currentGradient[1][1], 0.04); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(0.25), previousCurvature[0][2] / currentCurvature[0][2], 0.04); BOOST_REQUIRE_CLOSE_ABSOLUTE( std::exp(-0.25), previousCurvature[1][2] / currentCurvature[1][2], 0.04); previousGradient = currentGradient; previousCurvature = currentCurvature; } } } BOOST_AUTO_TEST_CASE(testSubsetMultinomialLogisticLoss) { using TSizeVec = std::vector<std::size_t>; auto fillRowsAndCols = [](std::size_t dimension, TSizeVec& rows, TSizeVec& cols) { rows.resize(dimension * (dimension + 1) / 2); cols.resize(dimension * (dimension + 1) / 2); for (std::size_t i = 0, k = 0; i < dimension; ++i) { for (std::size_t j = i; j < dimension; ++j, ++k) { rows[k] = i; cols[k] = j; } } }; test::CRandomNumbers rng; TSizeVec subset{1, 3, 4, 7, 9}; TSizeVec complement{0, 2, 5, 6, 8}; TSizeVec rows; TSizeVec cols; TSizeVec subsetRows; TSizeVec subsetCols; fillRowsAndCols(10, rows, cols); fillRowsAndCols(6, subsetRows, subsetCols); CMultinomialLogisticLoss loss{10}; CMultinomialLogisticSubsetLoss subsetLoss{10, subset}; TDoubleVec weights; TDoubleVec actual; TFloatVec storage; TDoubleVec gradient(10); TDoubleVec subsetGradient(6); TDoubleVecVec curvatures(10, TDoubleVec(10)); TDoubleVecVec subsetCurvatures(6, TDoubleVec(6)); for (std::size_t t = 0; t < 1000; ++t) { rng.generateUniformSamples(0.0, 1.0, 10, weights); storage.assign(weights.begin(), weights.end()); TMemoryMappedFloatVector 
prediction{storage.data(), 10}; prediction.array() = (prediction.array() / prediction.sum()).log(); rng.generateUniformSamples(0.0, 10.0, 1, actual); actual[0] = std::floor(actual[0]); loss.gradient(prediction, actual[0], [&](std::size_t i, double value) { gradient[i] = value; }); subsetLoss.gradient(prediction, actual[0], [&](std::size_t i, double value) { subsetGradient[i] = value; }); for (std::size_t i = 0; i < subset.size(); ++i) { BOOST_REQUIRE_CLOSE(gradient[subset[i]], subsetGradient[i], 1e-3); } double pAgg{std::accumulate(complement.begin(), complement.end(), 0.0, [&](auto p, auto i) { return p + std::exp(prediction(i)); }) / static_cast<double>(complement.size())}; BOOST_REQUIRE_CLOSE(std::binary_search(complement.begin(), complement.end(), static_cast<std::size_t>(actual[0])) ? pAgg - 1.0 : pAgg, subsetGradient[subset.size()], 1e-3); loss.curvature(prediction, actual[0], [&](std::size_t i, double value) { curvatures[rows[i]][cols[i]] = value; }); subsetLoss.curvature(prediction, actual[0], [&](std::size_t i, double value) { subsetCurvatures[subsetRows[i]][subsetCols[i]] = value; }); for (std::size_t i = 0; i <= subset.size(); ++i) { for (std::size_t j = i; j <= subset.size(); ++j) { if (i < subset.size() && j < subset.size()) { BOOST_REQUIRE_CLOSE(curvatures[subset[i]][subset[j]], subsetCurvatures[i][j], 1e-3); } else if (i < subset.size()) { BOOST_REQUIRE_CLOSE(-std::exp(prediction(subset[i])) * pAgg, subsetCurvatures[i][j], 1e-3); } else if (j < subset.size()) { BOOST_REQUIRE_CLOSE(-std::exp(prediction(subset[j])) * pAgg, subsetCurvatures[i][j], 1e-3); } } } } } BOOST_AUTO_TEST_CASE(testMsleArgminObjectiveFunction) { // Test that the calculated objective function is close to the correct value. maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; std::size_t numberSamples{10000}; { for (std::size_t t = 0; t < 3; ++t) { double lambda{0.1 * static_cast<double>(t + 1)}; CArgMinMsleImpl argmin{lambda}; TDoubleVec targets; testRng.generateUniformSamples(1000.0, 100000.0, numberSamples, targets); TDoubleVec predictionErrors; predictionErrors.resize(targets.size(), 0.0); testRng.generateUniformSamples(-100.0, 100.0, targets.size(), predictionErrors); do { for (std::size_t i = 0; i < targets.size(); ++i) { maths::common::CFloatStorage storage[]{ std::log(targets[i] + predictionErrors[i])}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, targets[i]); } } while (argmin.nextPass()); auto objective = argmin.objective(); for (double weight = -1.0; weight < 1.0; weight += 0.1) { TMeanAccumulator expectedErrorAccumulator; for (std::size_t i = 0; i < targets.size(); ++i) { double error{ std::log(targets[i] + 1) - std::log(std::exp(weight) * (targets[i] + predictionErrors[i]) + 1)}; expectedErrorAccumulator.add(error * error); } double expectedObjectiveValue{ maths::common::CBasicStatistics::mean(expectedErrorAccumulator) + lambda * maths::common::CTools::pow2(std::exp(weight))}; double estimatedObjectiveValue{objective(weight)}; BOOST_REQUIRE_CLOSE_ABSOLUTE(estimatedObjectiveValue, expectedObjectiveValue, 1e-3); } } } // Constant prediction { for (std::size_t t = 0; t < 3; ++t) { double lambda{0.1 * static_cast<double>(t + 1)}; CArgMinMsleImpl argmin{lambda}; double constantPrediction{1000.0}; TDoubleVec targets; testRng.generateUniformSamples(0.0, 10000.0, numberSamples, targets); do { for (auto target : targets) { maths::common::CFloatStorage storage[]{std::log(constantPrediction)}; TMemoryMappedFloatVector prediction{storage, 1}; 
argmin.add(prediction, target); } } while (argmin.nextPass()); auto objective = argmin.objective(); for (double weight = -1.0; weight < 1.0; weight += 0.1) { TMeanAccumulator expectedErrorAccumulator; for (auto target : targets) { double error{std::log(target + 1) - std::log(constantPrediction * std::exp(weight) + 1)}; expectedErrorAccumulator.add(error * error); } double expectedObjectiveValue{ maths::common::CBasicStatistics::mean(expectedErrorAccumulator) + lambda * maths::common::CTools::pow2(std::exp(weight))}; double estimatedObjectiveValue{objective(weight)}; BOOST_REQUIRE_CLOSE_ABSOLUTE(estimatedObjectiveValue, expectedObjectiveValue, 1e-3); } } } } BOOST_AUTO_TEST_CASE(testMsleArgmin) { // Test on a single data point with known output. { double lambda{0.0}; CArgMinMsleImpl argmin{lambda}; TDoubleVec targets; maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; std::size_t numberSamples{1}; testRng.generateUniformSamples(0.0, 10000.0, numberSamples, targets); TDoubleVec predictions; predictions.resize(targets.size(), 0.0); testRng.generateUniformSamples(0.0, 10000.0, targets.size(), predictions); do { for (std::size_t i = 0; i < targets.size(); ++i) { maths::common::CFloatStorage storage[]{std::log(predictions[i])}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, targets[i]); } } while (argmin.nextPass()); double expectedWeight{std::log(targets[0] / predictions[0])}; double estimatedWeight{argmin.value()[0]}; BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedWeight, estimatedWeight, 1e-3); } // Test against scipy and scikit learn. // // To reproduce run in Python: // from sklearn.metrics import mean_squared_log_error // import numpy as np // from scipy.optimize import minimize // y_true = [3, 5, 2.5, 7] // y_pred = [2.5, 5, 4, 8] // def objective(logWeight): // return mean_squared_log_error(y_true, np.exp(logWeight)*y_pred) // minimize(objective, 0.0) { double lambda{0.0}; CArgMinMsleImpl argmin{lambda}; TDoubleVec targets{3, 5, 2.5, 7}; TDoubleVec predictions{2.5, 5, 4, 8}; do { for (std::size_t i = 0; i < targets.size(); ++i) { maths::common::CFloatStorage storage[]{std::log(predictions[i])}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, targets[i]); } } while (argmin.nextPass()); double optimalWeight{-0.11355011}; double optimalObjective{0.03145382791305494}; double estimatedWeight{argmin.value()[0]}; double estimatedObjective{argmin.objective()(estimatedWeight)}; LOG_DEBUG(<< "Estimated objective " << estimatedObjective << " optimal objective " << optimalObjective); LOG_DEBUG(<< "Estimated weight " << estimatedWeight << " true weight " << optimalWeight); BOOST_REQUIRE_CLOSE_ABSOLUTE(optimalObjective, estimatedObjective, 1e-5); BOOST_REQUIRE_CLOSE_ABSOLUTE(optimalWeight, estimatedWeight, 1e-2); } } BOOST_AUTO_TEST_CASE(testPseudoHuberArgminObjectiveFunction) { // Test that the calculated objective function is close to the correct value. 
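// For reference (standard definition, not taken from the implementation): the
// pseudo-Huber loss with scale delta is
//
//   L_delta(a, p) = delta^2 (sqrt(1 + ((a - p) / delta)^2) - 1),
//
// which behaves like 0.5 (a - p)^2 for small residuals and like delta |a - p|
// for large ones. The expected objective below is its mean over the sample plus
// the ridge penalty lambda * w^2 on the additive shift w.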
maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; std::size_t numberSamples{10000}; { for (std::size_t t = 0; t < 3; ++t) { double lambda{0.1 * static_cast<double>(t + 1)}; double delta{lambda * 10}; // try different delta's without the second loop CArgMinPseudoHuberImpl argmin{lambda, delta}; TDoubleVec targets; targets.resize(numberSamples, 0.0); testRng.generateUniformSamples(0.0, 10000.0, numberSamples, targets); double trueWeight{20.0}; TDoubleVec predictionErrors; predictionErrors.resize(targets.size(), 0.0); testRng.generateNormalSamples(-trueWeight, 10.0, targets.size(), predictionErrors); do { for (std::size_t i = 0; i < targets.size(); ++i) { maths::common::CFloatStorage storage[]{targets[i] + predictionErrors[i]}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, targets[i]); } } while (argmin.nextPass()); auto objective = argmin.objective(); double expectedMin{std::numeric_limits<double>::infinity()}; double expectedArgmin{-100}; double estimatedMin{std::numeric_limits<double>::infinity()}; double estimatedArgmin{100}; for (double weight = -10.0; weight <= 30.0; weight += 0.1) { TMeanAccumulator expectedErrorAccumulator; for (std::size_t i = 0; i < targets.size(); ++i) { double p{targets[i] + predictionErrors[i]}; double error{maths::common::CTools::pow2(delta) * (std::sqrt(1.0 + maths::common::CTools::pow2( (targets[i] - p - weight) / delta)) - 1.0)}; expectedErrorAccumulator.add(error); } double expectedObjectiveValue{ maths::common::CBasicStatistics::mean(expectedErrorAccumulator) + lambda * maths::common::CTools::pow2(weight)}; double estimatedObjectiveValue{objective(weight)}; if (expectedObjectiveValue < expectedMin) { expectedMin = expectedObjectiveValue; expectedArgmin = weight; } if (estimatedObjectiveValue < estimatedMin) { estimatedMin = estimatedObjectiveValue; estimatedArgmin = weight; } } BOOST_REQUIRE_CLOSE_ABSOLUTE(estimatedArgmin, expectedArgmin, 0.11); } } } BOOST_AUTO_TEST_CASE(testPseudoHuberArgmin) { // Test on a single data point with known output. 
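// With a single example and no regularisation the pseudo-Huber objective is
// minimised by shifting the prediction exactly onto the target, so the expected
// weight is simply target - prediction.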
{ double lambda{0.0}; double delta{1.0}; CArgMinPseudoHuberImpl argmin{lambda, delta}; TDoubleVec targets; maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; std::size_t numberSamples{1}; testRng.generateUniformSamples(0.0, 10000.0, numberSamples, targets); TDoubleVec predictions; predictions.resize(targets.size(), 0.0); testRng.generateUniformSamples(0.0, 10000.0, numberSamples, targets); do { for (std::size_t i = 0; i < targets.size(); ++i) { maths::common::CFloatStorage storage[]{predictions[i]}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, targets[i]); } } while (argmin.nextPass()); double expectedWeight{targets[0] - predictions[0]}; double estimatedWeight{argmin.value()[0]}; LOG_DEBUG(<< "Estimate weight " << estimatedWeight << " true weight " << expectedWeight); BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedWeight, estimatedWeight, 1e-3); } // test against scipy and scikit learn // To reproduce run in Python: // from sklearn.metrics import mean_squared_log_error // import numpy as np // from scipy.optimize import minimize // y_true = [3, 5, 2.5, 7] // y_pred = [2.5, 5, 4, 8] // def pseudo_huber(a, p, delta=1.0): // return np.mean(delta**2*(np.sqrt(1+((a-p)/delta)**2)-1)) // def objective(weight): // return pseudo_huber(y_true, y_pred + weight) // minimize(objective, 0.0) { double lambda{0.0}; double delta{1.0}; CArgMinPseudoHuberImpl argmin{lambda, delta}; TDoubleVec targets{3, 5, 2.5, 7}; TDoubleVec predictions{2.5, 5, 4, 8}; do { for (std::size_t i = 0; i < targets.size(); ++i) { maths::common::CFloatStorage storage[]{predictions[i]}; TMemoryMappedFloatVector prediction{storage, 1}; argmin.add(prediction, targets[i]); } } while (argmin.nextPass()); double optimalWeight{-0.5}; double optimalObjective{0.266123775}; double estimatedWeight{argmin.value()[0]}; double estimatedObjective{argmin.objective()(estimatedWeight)}; BOOST_REQUIRE_CLOSE_ABSOLUTE(optimalObjective, estimatedObjective, 1e-5); BOOST_REQUIRE_CLOSE_ABSOLUTE(optimalWeight, estimatedWeight, 1e-2); } } BOOST_AUTO_TEST_CASE(testMseIncrementalArgmin) { // Test that the minimizer finds a local minimum of the adjusted MSE loss // function (it's convex so this is unique). 
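// The adjusted per-row loss being minimised here (see the adjustedLoss lambda
// below) is
//
//   w [ (y - (f + x))^2 + mu (t / eta - x)^2 ],
//
// where f is the current prediction, t the prediction of the tree being
// retrained, eta the learn rate, x the candidate leaf weight and mu the
// multiplier penalising divergence from the old tree. The test evaluates the
// total adjusted loss, plus the lambda * x^2 leaf weight penalty, at each leaf's
// minimizer and at minimizer +/- eps, and checks that perturbing never reduces it.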
double eps{0.01}; std::size_t min{0}; std::size_t minMinusEps{1}; std::size_t minPlusEps{2}; std::size_t rows{200}; std::size_t cols{5}; auto frame = setupLinearRegressionProblem(rows, cols); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); regression->predict(); double lambda{ regression->impl().hyperparameters().leafWeightPenaltyMultiplier().value()}; double eta{regression->impl().hyperparameters().eta().value()}; double mu{0.1}; auto forest = regression->impl().trainedModel(); BOOST_TEST_REQUIRE(forest.size() > 1); const auto& tree = forest[1]; const auto& extraColumns = regression->impl().extraColumns(); const auto& encoder = regression->impl().encoder(); maths::analytics::boosted_tree::CMseIncremental mse{eta, mu, tree}; auto adjustedLoss = [&](const TRowRef& row, double x) { double actual{readActual(row, regression->columnHoldingDependentVariable())}; auto prediction = readPrediction(row, extraColumns, mse.dimensionPrediction()); double treePrediction{root(tree).value(encoder.encode(row), tree)(0)}; double weight{readExampleWeight(row, extraColumns)}; return weight * (maths::common::CTools::pow2(actual - (prediction(0) + x)) + mu * maths::common::CTools::pow2(treePrediction / eta - x)); }; TDoubleVec leafMinimizers(tree.size(), 0.0); { maths::common::CPRNG::CXorOShiro128Plus rng; TArgMinLossVec leafValues(tree.size(), mse.minimizer(lambda, rng)); auto result = frame->readRows( 1, core::bindRetrievableState( [&](TArgMinLossVec& leafValues_, const TRowItr& beginRows, const TRowItr& endRows) { std::size_t dimensionPrediction{mse.dimensionPrediction()}; const auto& rootNode = root(tree); for (auto row_ = beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto prediction = readPrediction(row, extraColumns, dimensionPrediction); double actual{readActual( row, regression->columnHoldingDependentVariable())}; double weight{readExampleWeight(row, extraColumns)}; leafValues_[rootNode.leafIndex(encodedRow, tree)].add( encodedRow, false /*new example*/, prediction, actual, weight); } }, std::move(leafValues))); leafValues = std::move(result.first[0].s_FunctionState); leafMinimizers.reserve(leafValues.size()); for (std::size_t i = 0; i < leafValues.size(); ++i) { if (tree[i].isLeaf()) { leafMinimizers[i] = leafValues[i].value()(0); } } } TDoubleVecVec leafLosses(leafMinimizers.size(), TDoubleVec(3)); for (std::size_t i = 0; i < leafMinimizers.size(); ++i) { leafLosses[i][min] = lambda * maths::common::CTools::pow2(leafMinimizers[i]); leafLosses[i][minMinusEps] = lambda * maths::common::CTools::pow2(leafMinimizers[i] - eps); leafLosses[i][minPlusEps] = lambda * maths::common::CTools::pow2(leafMinimizers[i] + eps); } { auto result = frame->readRows( 1, core::bindRetrievableState( [&](TDoubleVecVec& adjustedLosses_, const TRowItr& beginRows, const TRowItr& endRows) { const auto& rootNode = root(tree); for (auto row_ = beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto i = rootNode.leafIndex(encodedRow, tree); double x{leafMinimizers[i]}; adjustedLosses_[i][min] += adjustedLoss(row, x); adjustedLosses_[i][minMinusEps] += adjustedLoss(row, x - eps); adjustedLosses_[i][minPlusEps] += adjustedLoss(row, x + eps); } }, std::move(leafLosses))); leafLosses = std::move(result.first[0].s_FunctionState); } double decrease{0.0}; for (const auto& leafLoss : leafLosses) { 
BOOST_TEST_REQUIRE(leafLoss[min] <= leafLoss[minMinusEps]); BOOST_TEST_REQUIRE(leafLoss[min] <= leafLoss[minPlusEps]); decrease += leafLoss[minMinusEps] - leafLoss[min]; decrease += leafLoss[minPlusEps] - leafLoss[min]; } LOG_DEBUG(<< "total decrease = " << decrease); BOOST_TEST_REQUIRE(decrease > 0.0); } BOOST_AUTO_TEST_CASE(testMseIncrementalGradientAndCurvature) { // Test that: // 1. The gradient and curvature of the loss match MSE when mu is zero. // 2. The gradient is corrected towards the old prediction, i.e. if the // old prediction for a row is positive (negative) the gradient is // larger (smaller) than the gradient of MSE. std::size_t rows{200}; std::size_t cols{5}; auto frame = setupLinearRegressionProblem(rows, cols); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); regression->predict(); double eta{regression->impl().hyperparameters().eta().value()}; double mu{0.1}; auto forest = regression->impl().trainedModel(); BOOST_TEST_REQUIRE(forest.size() > 1); const auto& tree = forest[1]; const auto& extraColumns = regression->impl().extraColumns(); const auto& encoder = regression->impl().encoder(); maths::analytics::boosted_tree::CMse mse; // Test mu == 0 { maths::analytics::boosted_tree::CMseIncremental mseIncremental{eta, 0.0, tree}; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { std::size_t dimensionPrediction{mse.dimensionPrediction()}; for (auto row_ = beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto prediction = readPrediction(row, extraColumns, dimensionPrediction); double actual{readActual(row, regression->columnHoldingDependentVariable())}; double expectedGradient; double expectedCurvature; double actualGradient; double actualCurvature; mse.gradient(prediction, actual, [&](std::size_t, double gradient) { expectedGradient = gradient; }); mse.curvature(prediction, actual, [&](std::size_t, double curvature) { expectedCurvature = curvature; }); mseIncremental.gradient(encodedRow, false /*new example*/, prediction, actual, [&](std::size_t, double gradient) { actualGradient = gradient; }); mseIncremental.curvature(encodedRow, false /*new example*/, prediction, actual, [&](std::size_t, double curvature) { actualCurvature = curvature; }); BOOST_TEST_REQUIRE(expectedGradient, actualGradient); } }); } // Test mu == 0.1 { maths::analytics::boosted_tree::CMseIncremental mseIncremental{eta, mu, tree}; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { std::size_t dimensionPrediction{mse.dimensionPrediction()}; const auto& rootNode = root(tree); for (auto row_ = beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto prediction = readPrediction(row, extraColumns, dimensionPrediction); double treePrediction{rootNode.value(encodedRow, tree)(0)}; double actual{readActual(row, regression->columnHoldingDependentVariable())}; double mseGradient; double mseIncrementalGradient; mse.gradient(prediction, actual, [&](std::size_t, double gradient) { mseGradient = gradient; }); mseIncremental.gradient(encodedRow, false /*new example*/, prediction, actual, [&](std::size_t, double gradient) { mseIncrementalGradient = gradient; }); LOG_TRACE(<< "tree prediction = " << treePrediction << ", MSE gradient = " << mseGradient << ", incremental MSE gradient = " << mseIncrementalGradient); if (treePrediction < 0.0) { 
BOOST_TEST_REQUIRE(mseIncrementalGradient < mseGradient); } if (treePrediction > 0.0) { BOOST_TEST_REQUIRE(mseIncrementalGradient > mseGradient); } } }); } } BOOST_AUTO_TEST_CASE(testBinomialLogisticIncrementalArgmin) { // Test that the minimizer finds a local minimum of the adjusted binomial // logistic loss function (it's convex so this is unique). double eps{0.01}; std::size_t min{0}; std::size_t minMinusEps{1}; std::size_t minPlusEps{2}; std::size_t rows{200}; std::size_t cols{5}; auto frame = setupLinearBinaryClassificationProblem(rows, cols); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CBinomialLogisticLoss>()) .buildForTrain(*frame, cols - 1); classifier->train(); classifier->predict(); double lambda{ classifier->impl().hyperparameters().leafWeightPenaltyMultiplier().value()}; double eta{classifier->impl().hyperparameters().eta().value()}; double mu{0.1}; auto forest = classifier->impl().trainedModel(); BOOST_TEST_REQUIRE(forest.size() > 1); const auto& tree = forest[1]; const auto& extraColumns = classifier->impl().extraColumns(); const auto& encoder = classifier->impl().encoder(); maths::analytics::boosted_tree::CBinomialLogisticLossIncremental bll{eta, mu, tree}; auto adjustedLoss = [&](const TRowRef& row, double x) { double actual{readActual(row, classifier->columnHoldingDependentVariable())}; auto prediction = readPrediction(row, extraColumns, bll.dimensionPrediction()); double treePrediction{root(tree).value(encoder.encode(row), tree)(0)}; double weight{readExampleWeight(row, extraColumns)}; double po1{maths::common::CTools::logisticFunction(treePrediction / eta)}; double pn1{maths::common::CTools::logisticFunction(x)}; double p1{maths::common::CTools::logisticFunction(prediction(0) + x)}; return -weight * ((1.0 - actual) * std::log(1 - p1) + actual * std::log(p1) + mu * ((1.0 - po1) * std::log(1.0 - pn1) + po1 * std::log(pn1))); }; TDoubleVec leafMinimizers(tree.size(), 0.0); { maths::common::CPRNG::CXorOShiro128Plus rng; TArgMinLossVec leafValues(tree.size(), bll.minimizer(lambda, rng)); for (std::size_t i = 0; i < 2; ++i) { auto result = frame->readRows( 1, core::bindRetrievableState( [&](TArgMinLossVec& leafValues_, const TRowItr& beginRows, const TRowItr& endRows) { std::size_t dimensionPrediction{bll.dimensionPrediction()}; const auto& rootNode = root(tree); for (auto row_ = beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto prediction = readPrediction(row, extraColumns, dimensionPrediction); double actual{readActual( row, classifier->columnHoldingDependentVariable())}; double weight{readExampleWeight(row, extraColumns)}; leafValues_[rootNode.leafIndex(encodedRow, tree)].add( encodedRow, false /*new example*/, prediction, actual, weight); } }, std::move(leafValues))); leafValues = std::move(result.first[0].s_FunctionState); for (auto& leaf : leafValues) { leaf.nextPass(); } } for (std::size_t i = 0; i < leafValues.size(); ++i) { if (tree[i].isLeaf()) { leafMinimizers[i] = leafValues[i].value()(0); } } } TDoubleVecVec leafLosses(leafMinimizers.size(), TDoubleVec(3)); for (std::size_t i = 0; i < leafMinimizers.size(); ++i) { leafLosses[i][min] = lambda * maths::common::CTools::pow2(leafMinimizers[i]); leafLosses[i][minMinusEps] = lambda * maths::common::CTools::pow2(leafMinimizers[i] - eps); leafLosses[i][minPlusEps] = lambda * maths::common::CTools::pow2(leafMinimizers[i] + eps); } { auto result = frame->readRows( 1, 
core::bindRetrievableState( [&](TDoubleVecVec& adjustedLosses_, const TRowItr& beginRows, const TRowItr& endRows) { const auto& rootNode = root(tree); for (auto row_ = beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto i = rootNode.leafIndex(encodedRow, tree); double x{leafMinimizers[i]}; adjustedLosses_[i][min] += adjustedLoss(row, x); adjustedLosses_[i][minMinusEps] += adjustedLoss(row, x - eps); adjustedLosses_[i][minPlusEps] += adjustedLoss(row, x + eps); } }, std::move(leafLosses))); leafLosses = std::move(result.first[0].s_FunctionState); } double decrease{0.0}; for (const auto& leafLoss : leafLosses) { BOOST_TEST_REQUIRE(leafLoss[min] <= leafLoss[minMinusEps]); BOOST_TEST_REQUIRE(leafLoss[min] <= leafLoss[minPlusEps]); decrease += leafLoss[minMinusEps] - leafLoss[min]; decrease += leafLoss[minPlusEps] - leafLoss[min]; } BOOST_TEST_REQUIRE(decrease > 0.0); } BOOST_AUTO_TEST_CASE(testBinomialLogisticLossIncrementalGradientAndCurvature) { // Test that: // 1. The gradient and curvature of the loss match binomial logistic // loss when mu is zero. // 2. The gradient is corrected towards the old prediction, i.e. if the // old tree's prediction for a row is larger (smaller) than the new // prediction the gradient is less (greater) than the gradient of the // binomial logistic loss. std::size_t rows{200}; std::size_t cols{5}; auto frame = setupLinearBinaryClassificationProblem(rows, cols); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CBinomialLogisticLoss>()) .buildForTrain(*frame, cols - 1); classifier->train(); classifier->predict(); double eta{classifier->impl().hyperparameters().eta().value()}; double mu{0.1}; auto forest = classifier->impl().trainedModel(); BOOST_TEST_REQUIRE(forest.size() > 1); const auto& tree = forest[1]; const auto& extraColumns = classifier->impl().extraColumns(); const auto& encoder = classifier->impl().encoder(); maths::analytics::boosted_tree::CBinomialLogisticLoss bll; // Test mu == 0 { maths::analytics::boosted_tree::CBinomialLogisticLossIncremental bllIncremental{ eta, 0.0, tree}; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { std::size_t dimensionPrediction{bll.dimensionPrediction()}; for (auto row_ = beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto prediction = readPrediction(row, extraColumns, dimensionPrediction); double actual{readActual(row, classifier->columnHoldingDependentVariable())}; double expectedGradient; double expectedCurvature; double actualGradient; double actualCurvature; bll.gradient(prediction, actual, [&](std::size_t, double gradient) { expectedGradient = gradient; }); bll.curvature(prediction, actual, [&](std::size_t, double curvature) { expectedCurvature = curvature; }); bllIncremental.gradient(encodedRow, false /*new example*/, prediction, actual, [&](std::size_t, double gradient) { actualGradient = gradient; }); bllIncremental.curvature(encodedRow, false /*new example*/, prediction, actual, [&](std::size_t, double curvature) { actualCurvature = curvature; }); BOOST_TEST_REQUIRE(expectedGradient, actualGradient); } }); } // Test mu == 0.1 { maths::analytics::boosted_tree::CBinomialLogisticLossIncremental bllIncremental{ eta, mu, tree}; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { std::size_t dimensionPrediction{bll.dimensionPrediction()}; const auto& rootNode = root(tree); for (auto row_ = 
beginRows; row_ != endRows; ++row_) { auto row = *row_; auto encodedRow = encoder.encode(row); auto prediction = readPrediction(row, extraColumns, dimensionPrediction); double treePrediction{rootNode.value(encodedRow, tree)(0) / eta}; double actual{readActual(row, classifier->columnHoldingDependentVariable())}; double bllGradient; double bllIncrementalGradient; bll.gradient(prediction, actual, [&](std::size_t, double gradient) { bllGradient = gradient; }); bllIncremental.gradient(encodedRow, false /*new example*/, prediction, actual, [&](std::size_t, double gradient) { bllIncrementalGradient = gradient; }); LOG_TRACE(<< "tree prediction = " << treePrediction << ", MSE gradient = " << bllGradient << ", incremental MSE gradient = " << bllIncrementalGradient); if (treePrediction > prediction(0)) { BOOST_TEST_REQUIRE(bllIncrementalGradient < bllGradient); } if (treePrediction < prediction(0)) { BOOST_TEST_REQUIRE(bllIncrementalGradient > bllGradient); } } }); } } BOOST_AUTO_TEST_SUITE_END()