lib/maths/analytics/unittest/CBoostedTreeUtilsTest.cc (160 lines of code) (raw):

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#include <core/CDataFrame.h>
#include <core/CLogger.h>

#include <maths/analytics/CBoostedTree.h>
#include <maths/analytics/CBoostedTreeFactory.h>
#include <maths/analytics/CBoostedTreeImpl.h>
#include <maths/analytics/CBoostedTreeLoss.h>
#include <maths/analytics/CBoostedTreeUtils.h>

#include <test/CRandomNumbers.h>

#include "BoostedTreeTestData.h"

#include <boost/test/unit_test.hpp>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <utility>
#include <vector>

BOOST_AUTO_TEST_SUITE(CBoostedTreeUtilsTest)

using namespace ml;
using maths::analytics::boosted_tree_detail::CSearchTree;
using TDoubleVec = std::vector<double>;
using TDoubleVecVec = std::vector<TDoubleVec>;
using TSizeVec = std::vector<std::size_t>;

// Trains a regression model on a linear target and then checks invariants of
// the values returned by retrainTreeSelectionProbabilities, both for an empty
// forest and after repeatedly appending rows drawn from a shifted target.
BOOST_AUTO_TEST_CASE(testRetrainTreeSelectionProbabilities) {

    std::size_t rows{100};
    std::size_t cols{4};

    // Per-feature offsets and slopes of the linear target function.
    TDoubleVec m{10.0, 2.0, 5.0};
    TDoubleVec s{1.0, -0.9, 0.8};
    double noiseVariance{1.0};

    // Captures m and s by value so later edits to them do not affect this target.
    TTargetFunc target{[=](const core::CDataFrame::TRowRef& row) {
        double result{0.0};
        for (std::size_t i = 0; i < cols - 1; ++i) {
            result += m[i] + s[i] * row[i];
        }
        return result;
    }};

    test::CRandomNumbers rng;

    auto frame = setupRegressionProblem(rng, target, noiseVariance, rows, cols);

    auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                          1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                          .buildForTrain(*frame, cols - 1);
    regression->train();

    const auto& impl = regression->impl();

    // Edge case: an empty forest must yield no probabilities.
    {
        core::CPackedBitVector allRowsMask{rows, true};
        maths::analytics::CBoostedTree::TNodeVecVec emptyForest;
        auto probabilities = impl.retrainTreeSelectionProbabilities(
            *frame, allRowsMask, emptyForest);
        BOOST_TEST_REQUIRE(probabilities.empty());
    }

    // Test some invariants of the probabilities:
    //   1. That the number of probabilities is equal to the number of trees,
    //   2. That the probabilities are non-negative, and
    //   3. That the probabilities are normalized.

    for (std::size_t test = 0; test < 5; ++test) {

        // Shift the target a little further on every iteration.
        m[0] += 5.0;
        s[0] -= 0.1;
        m[2] += 1.0;
        s[2] += 0.1;
        TTargetFunc deltaTarget{[=](const core::CDataFrame::TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        }};

        // Ten new samples per feature column.
        TDoubleVecVec x(cols - 1);
        for (std::size_t i = 0; i < cols - 1; ++i) {
            rng.generateUniformSamples(0.0, 10.0, 10, x[i]);
        }
        addData(rng, deltaTarget, x, noiseVariance, *frame);

        core::CPackedBitVector allRowsMask{frame->numberRows(), true};

        regression->predict();

        auto probabilities = impl.retrainTreeSelectionProbabilities(
            *frame, allRowsMask, impl.trainedModel());

        BOOST_REQUIRE_EQUAL(impl.trainedModel().size(), probabilities.size());
        // Every probability must be non-negative: checking only the largest
        // one would let negative entries through.
        BOOST_TEST_REQUIRE(std::all_of(probabilities.begin(), probabilities.end(),
                                       [](auto p) { return p >= 0.0; }));
        BOOST_REQUIRE_CLOSE(
            1.0, std::accumulate(probabilities.begin(), probabilities.end(), 0.0), 1e-6);
    }
}

BOOST_AUTO_TEST_CASE(testSearchTree) {

    // Check that the result of CSearchTree::upperBound is identical to std::upper_bound
    // on some edge cases and random data.

    using TFloatVec = std::vector<maths::common::CFloatStorage>;

    TSizeVec size;
    TDoubleVec set;
    TDoubleVec probes;
    TDoubleVec extraProbes;
    TFloatVec fset;

    LOG_DEBUG(<< "Empty");
    {
        CSearchTree tree{{}};
        BOOST_REQUIRE_EQUAL(0, tree.upperBound(0.0F));
        BOOST_REQUIRE_EQUAL(0, tree.upperBound(-1000.0F));
        BOOST_REQUIRE_EQUAL(0, tree.upperBound(10000.0F));
    }

    LOG_DEBUG(<< "Before start");
    {
        CSearchTree tree{{0.0}};
        BOOST_REQUIRE_EQUAL(1, tree.upperBound(0.0));
        BOOST_REQUIRE_EQUAL(0, tree.upperBound(-1.0));
    }

    LOG_DEBUG(<< "Duplicate");
    // fset grows by one duplicate 0.0 per iteration, so it holds i + 1 zeros.
    for (std::size_t i = 0; i < 5; ++i) {
        fset.push_back(0.0);
        CSearchTree tree{fset};
        BOOST_REQUIRE_EQUAL(0, tree.upperBound(-1.0));
        BOOST_REQUIRE_EQUAL(i + 1, tree.upperBound(0.0));
        BOOST_REQUIRE_EQUAL(i + 1, tree.upperBound(1.0));
    }

    LOG_DEBUG(<< "Infinity");
    {
        CSearchTree tree{{0.0}};
        BOOST_REQUIRE_EQUAL(1, tree.upperBound(std::numeric_limits<float>::infinity()));
    }

    LOG_DEBUG(<< "Small");
    // set is {1, 2, ..., i + 1}, so j + 0.5 has exactly j elements not above it.
    for (std::size_t i = 0; i < 5; ++i) {
        set.push_back(static_cast<double>(i + 1));
        CSearchTree tree{{set.begin(), set.end()}};
        for (std::size_t j = 0; j <= i; ++j) {
            BOOST_REQUIRE_EQUAL(j, tree.upperBound(static_cast<double>(j) + 0.5));
        }
    }

    // Random small sets.
    test::CRandomNumbers rng;
    for (std::size_t i = 0; i < 10000; ++i) {
        if (i % 500 == 0) {
            // 10000 iterations in total, so i / 100 is the percentage done.
            LOG_DEBUG(<< static_cast<double>(i) / 100.0 << "%");
        }

        rng.generateUniformSamples(1, 100, 1, size);
        rng.generateUniformSamples(-1000.0, 1000.0, size[0], set);
        rng.generateUniformSamples(-2000.0, 2000.0, 10, extraProbes);

        fset.assign(set.begin(), set.end());
        std::sort(fset.begin(), fset.end());

        // Probe with every member of the set as well as values which can
        // fall outside its range.
        probes.clear();
        probes.insert(probes.end(), set.begin(), set.end());
        probes.insert(probes.end(), extraProbes.begin(), extraProbes.end());

        CSearchTree tree({fset.begin(), fset.end()});

        for (auto probe : probes) {
            maths::common::CFloatStorage fprobe{probe};
            auto expected = std::upper_bound(fset.begin(), fset.end(), fprobe) -
                            fset.begin();
            BOOST_REQUIRE_EQUAL(expected, tree.upperBound(fprobe));
        }
    }

    LOG_DEBUG(<< "Large");

    rng.generateUniformSamples(-1000.0, 1000.0, 100000, set);
    rng.generateUniformSamples(-2000.0, 2000.0, 1000, extraProbes);

    fset.assign(set.begin(), set.end());
    std::sort(fset.begin(), fset.end());

    probes.clear();
    probes.insert(probes.end(), set.begin(), set.end());
    probes.insert(probes.end(), extraProbes.begin(), extraProbes.end());

    CSearchTree tree({fset.begin(), fset.end()});

    for (auto probe : probes) {
        // Query with the same float-rounded value used to compute the
        // expected index so both sides see an identical probe.
        maths::common::CFloatStorage fprobe{probe};
        auto expected = std::upper_bound(fset.begin(), fset.end(), fprobe) -
                        fset.begin();
        BOOST_REQUIRE_EQUAL(expected, tree.upperBound(fprobe));
    }
}

BOOST_AUTO_TEST_SUITE_END()