lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc (846 lines of code) (raw):

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #include <core/CDataFrame.h> #include <maths/analytics/CDataFramePredictiveModel.h> #include <maths/analytics/CTreeShapFeatureImportance.h> #include <maths/common/CBasicStatistics.h> #include <maths/common/CSampling.h> #include <maths/common/CTools.h> #include <maths/common/CToolsDetail.h> #include <api/CDataFrameAnalyzer.h> #include <api/CDataFrameTrainBoostedTreeClassifierRunner.h> #include <api/CDataFrameTrainBoostedTreeRunner.h> #include <api/CInferenceModelMetadata.h> #include <test/CDataFrameAnalysisSpecificationFactory.h> #include <test/CRandomNumbers.h> #include <boost/test/unit_test.hpp> #include <memory> #include <random> #include <utility> BOOST_AUTO_TEST_SUITE(CDataFrameAnalyzerFeatureImportanceTest) using namespace ml; namespace { using TDoubleVec = std::vector<double>; using TVector = maths::common::CDenseVector<double>; using TStrVec = std::vector<std::string>; using TRowItr = core::CDataFrame::TRowItr; using TRowRef = core::CDataFrame::TRowRef; using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator; using TMeanAccumulatorVec = std::vector<TMeanAccumulator>; using TMeanVarAccumulator = maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator; using TMemoryMappedMatrix = maths::common::CMemoryMappedDenseMatrix<double>; using TDocumentStrPr = std::pair<json::value, std::string>; using TDataFrameUPtrTemporaryDirectoryPtrPr = 
test::CDataFrameAnalysisSpecificationFactory::TDataFrameUPtrTemporaryDirectoryPtrPr; void setupLinearRegressionData(const TStrVec& fieldNames, TStrVec& fieldValues, api::CDataFrameAnalyzer& analyzer, const TDoubleVec& weights, const TDoubleVec& values, double noiseVar = 0.0) { test::CRandomNumbers rng; auto target = [&weights, &rng, noiseVar](const TDoubleVec& regressors) { TDoubleVec result(1); rng.generateNormalSamples(0, noiseVar, 1, result); for (std::size_t i = 0; i < weights.size(); ++i) { result[0] += weights[i] * regressors[i]; } return core::CStringUtils::typeToStringPrecise(result[0], core::CIEEE754::E_DoublePrecision); }; for (std::size_t i = 0; i < values.size(); i += weights.size()) { TDoubleVec row(weights.size()); for (std::size_t j = 0; j < weights.size(); ++j) { row[j] = values[i + j]; } fieldValues[0] = target(row); for (std::size_t j = 0; j < row.size(); ++j) { fieldValues[j + 1] = core::CStringUtils::typeToStringPrecise( row[j], core::CIEEE754::E_DoublePrecision); } analyzer.handleRecord(fieldNames, fieldValues); } } void setupRegressionDataWithMissingFeatures(const TStrVec& fieldNames, TStrVec& fieldValues, api::CDataFrameAnalyzer& analyzer, std::size_t rows, std::size_t cols) { test::CRandomNumbers rng; auto target = [](const TDoubleVec& regressors) { double result{0.0}; for (auto regressor : regressors) { result += regressor; } return core::CStringUtils::typeToStringPrecise(result, core::CIEEE754::E_DoublePrecision); }; for (std::size_t i = 0; i < rows; ++i) { TDoubleVec regressors; rng.generateUniformSamples(0.0, 10.0, cols - 1, regressors); fieldValues[0] = target(regressors); for (std::size_t j = 0; j < regressors.size(); ++j) { if (regressors[j] <= 9.0) { fieldValues[j + 1] = core::CStringUtils::typeToStringPrecise( regressors[j], core::CIEEE754::E_DoublePrecision); } } analyzer.handleRecord(fieldNames, fieldValues); } } void setupBinaryClassificationData(const TStrVec& fieldNames, TStrVec& fieldValues, api::CDataFrameAnalyzer& analyzer, 
const TDoubleVec& weights, const TDoubleVec& values) { TStrVec classes{"foo", "bar"}; maths::common::CPRNG::CXorOShiro128Plus rng; std::uniform_real_distribution<double> u01; auto target = [&](const TDoubleVec& regressors) { double logOddsBar{0.0}; for (std::size_t i = 0; i < weights.size(); ++i) { logOddsBar += weights[i] * regressors[i]; } return classes[u01(rng) < maths::common::CTools::logisticFunction(logOddsBar)]; }; for (std::size_t i = 0; i < values.size(); i += weights.size()) { TDoubleVec row(weights.size()); for (std::size_t j = 0; j < weights.size(); ++j) { row[j] = values[i + j]; } fieldValues[0] = target(row); for (std::size_t j = 0; j < row.size(); ++j) { fieldValues[j + 1] = core::CStringUtils::typeToStringPrecise( row[j], core::CIEEE754::E_DoublePrecision); } analyzer.handleRecord(fieldNames, fieldValues); } } void setupMultiClassClassificationData(const TStrVec& fieldNames, TStrVec& fieldValues, api::CDataFrameAnalyzer& analyzer, const TDoubleVec& weights, const TDoubleVec& values) { TStrVec classes{"foo", "bar", "baz"}; maths::common::CPRNG::CXorOShiro128Plus rng; std::uniform_real_distribution<double> u01; int numberFeatures{static_cast<int>(weights.size())}; int numberClasses{static_cast<int>(classes.size())}; TDoubleVec storage(numberClasses * numberFeatures); for (int i = 0; i < numberClasses; ++i) { for (int j = 0; j < numberFeatures; ++j) { storage[j * numberClasses + i] = static_cast<double>(i) * weights[j]; } } auto probability = [&](const TDoubleVec& row) { TMemoryMappedMatrix W(&storage[0], numberClasses, numberFeatures); TVector x(numberFeatures); for (int i = 0; i < numberFeatures; ++i) { x(i) = row[i]; } TVector result{W * x}; maths::common::CTools::inplaceSoftmax(result); return result; }; auto target = [&](const TDoubleVec& row) { TDoubleVec probabilities{probability(row).to<TDoubleVec>()}; return classes[maths::common::CSampling::categoricalSample(rng, probabilities)]; }; for (std::size_t i = 0; i < values.size(); i += 
weights.size()) { TDoubleVec row(weights.size()); for (std::size_t j = 0; j < weights.size(); ++j) { row[j] = values[i + j]; } fieldValues[0] = target(row); for (std::size_t j = 0; j < row.size(); ++j) { fieldValues[j + 1] = core::CStringUtils::typeToStringPrecise( row[j], core::CIEEE754::E_DoublePrecision); } analyzer.handleRecord(fieldNames, fieldValues); } } struct SFixture { TDocumentStrPr runRegression(std::size_t shapValues, const TDoubleVec& weights, double noiseVar = 0.0) { auto outputWriterFactory = [&]() { return std::make_unique<core::CJsonOutputStreamWrapper>(s_Output); }; TDataFrameUPtrTemporaryDirectoryPtrPr frameAndDirectory; auto spec = test::CDataFrameAnalysisSpecificationFactory{} .rows(s_Rows) .memoryLimit(26000000) .earlyStoppingEnabled(false) .predictionCategoricalFieldNames({"c1"}) .predictionAlpha(s_Alpha) .predictionLambda(s_Lambda) .predictionGamma(s_Gamma) .predictionSoftTreeDepthLimit(s_SoftTreeDepthLimit) .predictionSoftTreeDepthTolerance(s_SoftTreeDepthTolerance) .predictionEta(s_Eta) .predictionMaximumNumberTrees(s_MaximumNumberTrees) .predictionFeatureBagFraction(s_FeatureBagFraction) .predictionNumberTopShapValues(shapValues) .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target", &frameAndDirectory); api::CDataFrameAnalyzer analyzer{std::move(spec), std::move(frameAndDirectory), std::move(outputWriterFactory)}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; test::CRandomNumbers rng; TDoubleVec values; rng.generateUniformSamples(-10.0, 10.0, weights.size() * s_Rows, values); // make the first column categorical for (auto it = values.begin(); it < values.end(); it += 4) { *it = (*it < 0) ? 
-10.0 : 10.0; } setupLinearRegressionData(fieldNames, fieldValues, analyzer, weights, values, noiseVar); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); LOG_DEBUG(<< "estimated memory usage = " << core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); LOG_DEBUG(<< "peak memory = " << core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage)); LOG_DEBUG(<< "time to train = " << core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) << "ms"); BOOST_TEST_REQUIRE( core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); json::error_code ec; json::value results = json::parse(s_Output.str(), ec); BOOST_TEST_REQUIRE(ec.failed() == false); BOOST_TEST_REQUIRE(results.is_array()); return std::make_pair(std::move(results), s_Output.str()); } TDocumentStrPr runBinaryClassification(std::size_t shapValues, const TDoubleVec& weights) { auto outputWriterFactory = [&]() { return std::make_unique<core::CJsonOutputStreamWrapper>(s_Output); }; TDataFrameUPtrTemporaryDirectoryPtrPr frameAndDirectory; auto spec = test::CDataFrameAnalysisSpecificationFactory{} .rows(s_Rows) .memoryLimit(26000000) .earlyStoppingEnabled(false) .predictionCategoricalFieldNames({"target"}) .predictionAlpha(s_Alpha) .predictionLambda(s_Lambda) .predictionGamma(s_Gamma) .predictionSoftTreeDepthLimit(s_SoftTreeDepthLimit) .predictionSoftTreeDepthTolerance(s_SoftTreeDepthTolerance) .predictionEta(s_Eta) .predictionMaximumNumberTrees(s_MaximumNumberTrees) .predictionFeatureBagFraction(s_FeatureBagFraction) .predictionNumberTopShapValues(shapValues) .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target", &frameAndDirectory); api::CDataFrameAnalyzer analyzer{std::move(spec), std::move(frameAndDirectory), std::move(outputWriterFactory)}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", 
"", "0", ""}; test::CRandomNumbers rng; TDoubleVec values; rng.generateUniformSamples(-10.0, 10.0, weights.size() * s_Rows, values); setupBinaryClassificationData(fieldNames, fieldValues, analyzer, weights, values); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); LOG_DEBUG(<< "estimated memory usage = " << core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); LOG_DEBUG(<< "peak memory = " << core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage)); LOG_DEBUG(<< "time to train = " << core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) << "ms"); BOOST_TEST_REQUIRE( core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); json::error_code ec; json::value results = json::parse(s_Output.str(), ec); BOOST_TEST_REQUIRE(ec.failed() == false); BOOST_TEST_REQUIRE(results.is_array()); return std::make_pair(std::move(results), s_Output.str()); } TDocumentStrPr runMultiClassClassification(std::size_t shapValues, const TDoubleVec& weights) { auto outputWriterFactory = [&]() { return std::make_unique<core::CJsonOutputStreamWrapper>(s_Output); }; TDataFrameUPtrTemporaryDirectoryPtrPr frameAndDirectory; auto spec = test::CDataFrameAnalysisSpecificationFactory{} .rows(s_Rows) .memoryLimit(26000000) .earlyStoppingEnabled(false) .predictionCategoricalFieldNames({"target"}) .predictionAlpha(s_Alpha) .predictionLambda(s_Lambda) .predictionGamma(s_Gamma) .predictionSoftTreeDepthLimit(s_SoftTreeDepthLimit) .predictionSoftTreeDepthTolerance(s_SoftTreeDepthTolerance) .predictionEta(s_Eta) .predictionMaximumNumberTrees(s_MaximumNumberTrees) .predictionFeatureBagFraction(s_FeatureBagFraction) .predictionNumberTopShapValues(shapValues) .numberClasses(3) .numberTopClasses(3) .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::classification(), "target", &frameAndDirectory); api::CDataFrameAnalyzer analyzer{std::move(spec), 
std::move(frameAndDirectory), std::move(outputWriterFactory)}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; test::CRandomNumbers rng; TDoubleVec values; rng.generateUniformSamples(-10.0, 10.0, weights.size() * s_Rows, values); setupMultiClassClassificationData(fieldNames, fieldValues, analyzer, weights, values); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); LOG_DEBUG(<< "estimated memory usage = " << core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); LOG_DEBUG(<< "peak memory = " << core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage)); LOG_DEBUG(<< "time to train = " << core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) << "ms"); BOOST_TEST_REQUIRE( core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); json::error_code ec; json::value results = json::parse(s_Output.str(), ec); BOOST_TEST_REQUIRE(ec.failed() == false); return std::make_pair(std::move(results), s_Output.str()); } json::value runRegressionWithMissingFeatures(std::size_t shapValues) { auto outputWriterFactory = [&]() { return std::make_unique<core::CJsonOutputStreamWrapper>(s_Output); }; TDataFrameUPtrTemporaryDirectoryPtrPr frameAndDirectory; auto spec = test::CDataFrameAnalysisSpecificationFactory{} .rows(s_Rows) .memoryLimit(26000000) .earlyStoppingEnabled(false) .predictionAlpha(s_Alpha) .predictionLambda(s_Lambda) .predictionGamma(s_Gamma) .predictionSoftTreeDepthLimit(s_SoftTreeDepthLimit) .predictionSoftTreeDepthTolerance(s_SoftTreeDepthTolerance) .predictionEta(s_Eta) .predictionMaximumNumberTrees(s_MaximumNumberTrees) .predictionFeatureBagFraction(s_FeatureBagFraction) .predictionNumberTopShapValues(shapValues) .predictionSpec(test::CDataFrameAnalysisSpecificationFactory::regression(), "target", &frameAndDirectory); api::CDataFrameAnalyzer 
analyzer{std::move(spec), std::move(frameAndDirectory), std::move(outputWriterFactory)}; TStrVec fieldNames{"target", "c1", "c2", "c3", "c4", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; setupRegressionDataWithMissingFeatures(fieldNames, fieldValues, analyzer, s_Rows, 5); analyzer.handleRecord(fieldNames, {"", "", "", "", "", "", "$"}); LOG_DEBUG(<< "estimated memory usage = " << core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); LOG_DEBUG(<< "peak memory = " << core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage)); LOG_DEBUG(<< "time to train = " << core::CProgramCounters::counter(counter_t::E_DFTPMTimeToTrain) << "ms"); BOOST_TEST_REQUIRE( core::CProgramCounters::counter(counter_t::E_DFTPMPeakMemoryUsage) < core::CProgramCounters::counter(counter_t::E_DFTPMEstimatedPeakMemoryUsage)); json::error_code ec; json::value results = json::parse(s_Output.str(), ec); BOOST_TEST_REQUIRE(ec.failed() == false); return results; } double s_Alpha{2.0}; double s_Lambda{1.0}; double s_Gamma{10.0}; double s_SoftTreeDepthLimit{5.0}; double s_SoftTreeDepthTolerance{0.1}; double s_Eta{0.9}; std::size_t s_MaximumNumberTrees{2}; double s_FeatureBagFraction{1.0}; int s_Rows{2000}; std::stringstream s_Output; }; template<typename RESULTS> double readShapValue(const RESULTS& results_, std::string shapField) { BOOST_TEST_REQUIRE(results_.at_pointer("/row_results/results/ml").is_object()); if (results_.at_pointer("/row_results/results/ml") .as_object() .contains(api::CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME)) { for (const auto& shapResult : results_ .at_pointer("/row_results/results/ml/" + api::CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME) .as_array()) { BOOST_TEST_REQUIRE(shapResult.is_object()); if (shapResult.as_object() .at(api::CDataFrameTrainBoostedTreeRunner::FEATURE_NAME_FIELD_NAME) .as_string() == shapField) { json::value jv = shapResult.as_object().at( 
api::CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME); BOOST_TEST_REQUIRE(jv.is_number()); return jv.to_number<double>(); } } } return 0.0; } template<typename RESULTS> double readClassProbability(const RESULTS& results, std::string className) { BOOST_TEST_REQUIRE(results.at_pointer("/row_results/results/ml").is_object()); if (results.at_pointer("/row_results/results/ml").as_object().contains("top_classes")) { for (const auto& classResult : results.at_pointer("/row_results/results/ml/top_classes").as_array()) { if (classResult.at("class_name").as_string() == className) { json::value jv = classResult.at("class_probability"); BOOST_TEST_REQUIRE(jv.is_double()); return jv.to_number<double>(); } } } LOG_DEBUG(<< "Class probability not found"); return 0.0; } template<typename RESULTS> double readShapValue(const RESULTS& results, std::string shapField, std::string className) { BOOST_TEST_REQUIRE(results.at_pointer("/row_results/results/ml").is_object()); if (results.at_pointer("/row_results/results/ml").as_object().contains(api::CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME)) { for (const auto& shapResult_ : results .at_pointer("/row_results/results/ml/" + api::CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME) .as_array()) { BOOST_TEST_REQUIRE(shapResult_.is_object()); if (shapResult_ .at_pointer("/" + api::CDataFrameTrainBoostedTreeRunner::FEATURE_NAME_FIELD_NAME) .as_string() == shapField) { for (const auto& item : shapResult_ .at_pointer("/" + api::CDataFrameTrainBoostedTreeClassifierRunner::CLASSES_FIELD_NAME) .as_array()) { BOOST_TEST_REQUIRE(item.is_object()); if (item.as_object() .at(api::CDataFrameTrainBoostedTreeClassifierRunner::CLASS_NAME_FIELD_NAME) .as_string() == className) { json::value jv = item.as_object().at( api::CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME); BOOST_TEST_REQUIRE(jv.is_double()); return jv.to_number<double>(); } } } } } return 0.0; } template<typename RESULTS> double 
readTotalShapValue(const RESULTS& results, std::string shapField) { using TModelMetadata = api::CInferenceModelMetadata; BOOST_TEST_REQUIRE(results.at(TModelMetadata::JSON_MODEL_METADATA_TAG).is_object()); if (results.at(TModelMetadata::JSON_MODEL_METADATA_TAG).as_object().contains(TModelMetadata::JSON_TOTAL_FEATURE_IMPORTANCE_TAG)) { for (const auto& shapResult : results .at_pointer("/" + TModelMetadata::JSON_MODEL_METADATA_TAG + "/" + TModelMetadata::JSON_TOTAL_FEATURE_IMPORTANCE_TAG) .as_array()) { if (shapResult .at_pointer("/" + TModelMetadata::JSON_FEATURE_NAME_TAG) .as_string() == shapField) { json::value jv = shapResult.at_pointer( "/" + TModelMetadata::JSON_IMPORTANCE_TAG + "/" + TModelMetadata::JSON_MEAN_MAGNITUDE_TAG); BOOST_TEST_REQUIRE(jv.is_double()); return jv.to_number<double>(); } } } return 0.0; } template<typename RESULTS> double readTotalShapValue(const RESULTS& results, std::string shapField, std::string className) { using TModelMetadata = api::CInferenceModelMetadata; BOOST_TEST_REQUIRE(results.at(TModelMetadata::JSON_MODEL_METADATA_TAG).is_object()); if (results.at(TModelMetadata::JSON_MODEL_METADATA_TAG).as_object().contains(TModelMetadata::JSON_TOTAL_FEATURE_IMPORTANCE_TAG)) { for (const auto& shapResult_ : results .at_pointer("/" + TModelMetadata::JSON_MODEL_METADATA_TAG + "/" + TModelMetadata::JSON_TOTAL_FEATURE_IMPORTANCE_TAG) .as_array()) { if (shapResult_ .at_pointer("/" + TModelMetadata::JSON_FEATURE_NAME_TAG) .as_string() == shapField) { for (const auto& item : shapResult_ .at_pointer("/" + TModelMetadata::JSON_CLASSES_TAG) .as_array()) { if (item.at(TModelMetadata::JSON_CLASS_NAME_TAG).as_string() == className) { json::value jv = item.at_pointer( "/" + TModelMetadata::JSON_IMPORTANCE_TAG + "/" + TModelMetadata::JSON_MEAN_MAGNITUDE_TAG); BOOST_TEST_REQUIRE(jv.is_double()); return jv.to_number<double>(); } } } } } return 0.0; } template<typename RESULTS> double readBaselineValue(const RESULTS& results) { using TModelMetadata = 
api::CInferenceModelMetadata; for (const auto& result_ : results.as_array()) { BOOST_TEST_REQUIRE(result_.is_object()); const json::object& result = result_.as_object(); if (result.contains(TModelMetadata::JSON_MODEL_METADATA_TAG)) { BOOST_TEST_REQUIRE( result.at(TModelMetadata::JSON_MODEL_METADATA_TAG).is_object()); if (result.at(TModelMetadata::JSON_MODEL_METADATA_TAG) .as_object() .contains(TModelMetadata::JSON_FEATURE_IMPORTANCE_BASELINE_TAG)) { json::value jv = result_.at_pointer( "/" + TModelMetadata::JSON_MODEL_METADATA_TAG + "/" + TModelMetadata::JSON_FEATURE_IMPORTANCE_BASELINE_TAG + "/" + TModelMetadata::JSON_BASELINE_TAG); BOOST_TEST_REQUIRE(jv.is_double()); return jv.to_number<double>(); } } } return 0.0; } template<typename RESULTS> double readBaselineValue(const RESULTS& results, std::string className) { using TModelMetadata = api::CInferenceModelMetadata; for (const auto& result_ : results.as_array()) { BOOST_TEST_REQUIRE(result_.is_object()); const json::object& result = result_.as_object(); if (result.contains(TModelMetadata::JSON_MODEL_METADATA_TAG)) { BOOST_TEST_REQUIRE( result.at(TModelMetadata::JSON_MODEL_METADATA_TAG).is_object()); if (result.at(TModelMetadata::JSON_MODEL_METADATA_TAG) .as_object() .contains(TModelMetadata::JSON_FEATURE_IMPORTANCE_BASELINE_TAG)) { for (const auto& item : result_ .at_pointer("/" + TModelMetadata::JSON_MODEL_METADATA_TAG + "/" + TModelMetadata::JSON_FEATURE_IMPORTANCE_BASELINE_TAG + "/" + TModelMetadata::JSON_CLASSES_TAG) .as_array()) { BOOST_TEST_REQUIRE(item.is_object()); if (item.as_object().at(TModelMetadata::JSON_CLASS_NAME_TAG).as_string() == className) { json::value jv = item.as_object().at(TModelMetadata::JSON_BASELINE_TAG); BOOST_TEST_REQUIRE(jv.is_double()); return jv.to_number<double>(); } } } } } return 0.0; } } BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) { // Test that feature importance statistically correctly recognize the impact of regressors // in a linear model. 
In particular, that the ordering is as expected and for IID features // the significance is proportional to the multiplier. Also make sure that the SHAP values // are indeed a local approximation of the prediction. std::size_t topShapValues{5}; //Note, number of requested shap values is larger than the number of regressors TDoubleVec weights{50, 150, 50, -50}; auto resultsPair{runRegression(topShapValues, weights)}; // NOTE: Do not use brace initialization here as that will // result in "results" being created as a nested array on linux auto results = std::move(resultsPair.first); TMeanAccumulator baselineAccumulator; TMeanAccumulator c1TotalShapExpected; TMeanAccumulator c2TotalShapExpected; TMeanAccumulator c3TotalShapExpected; TMeanAccumulator c4TotalShapExpected; double c1Sum{0.0}; double c2Sum{0.0}; double c3Sum{0.0}; double c4Sum{0.0}; double c1TotalShapActual{0.0}; double c2TotalShapActual{0.0}; double c3TotalShapActual{0.0}; double c4TotalShapActual{0.0}; bool hasTotalFeatureImportance{false}; double baseline{readBaselineValue(results)}; for (const auto& result : results.as_array()) { BOOST_TEST_REQUIRE(result.is_object()); if (result.as_object().contains("row_results")) { double c1{readShapValue(result, "c1")}; double c2{readShapValue(result, "c2")}; double c3{readShapValue(result, "c3")}; double c4{readShapValue(result, "c4")}; json::value jv = result.at_pointer("/row_results/results/ml/target_prediction"); BOOST_TEST_REQUIRE(jv.is_double()); double prediction{jv.to_number<double>()}; // make sure that the local approximation differs from a prediction by a numeric error BOOST_REQUIRE_SMALL(prediction - (baseline + c1 + c2 + c3 + c4), 1e-3); c1Sum += std::fabs(c1); c2Sum += std::fabs(c2); c3Sum += std::fabs(c3); c4Sum += std::fabs(c4); c1TotalShapExpected.add(std::fabs(c1)); c2TotalShapExpected.add(std::fabs(c2)); c3TotalShapExpected.add(std::fabs(c3)); c4TotalShapExpected.add(std::fabs(c4)); // assert that no SHAP value for the dependent variable is 
returned BOOST_REQUIRE_EQUAL(readShapValue(result, "target"), 0.0); } else if (result.as_object().contains("model_metadata")) { BOOST_TEST_REQUIRE(result.at("model_metadata").is_object()); if (result.as_object().at("model_metadata").as_object().contains("total_feature_importance")) { hasTotalFeatureImportance = true; c1TotalShapActual = readTotalShapValue(result, "c1"); c2TotalShapActual = readTotalShapValue(result, "c2"); c3TotalShapActual = readTotalShapValue(result, "c3"); c4TotalShapActual = readTotalShapValue(result, "c4"); } // TODO check that the total feature importance is calculated correctly } } // Since the target is generated using the linear model // // 50 c1 + 150 c2 + 50 c3 - 50 c4, with c1 categorical {-10,10} // // we expect c2 > c1 > c3 \approx c4. LOG_DEBUG(<< "c1Sum = " << c1Sum << ", c2Sum = " << c2Sum << ", c3Sum = " << c3Sum << ", c4Sum = " << c4Sum); BOOST_TEST_REQUIRE(c2Sum > c1Sum); // since c1 is categorical -10 or 10, it's influence is generally higher than that of c3 and c4 which are sampled // randomly on [-10, 10). 
BOOST_TEST_REQUIRE(c1Sum > c3Sum); BOOST_TEST_REQUIRE(c1Sum > c4Sum); BOOST_REQUIRE_CLOSE(weights[1] / weights[2], c2Sum / c3Sum, 10.0); // ratio within 10% of ratio of coefficients BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 5.0); // c3 and c4 within 5% of each other BOOST_TEST_REQUIRE(hasTotalFeatureImportance); if (c1TotalShapActual == 0 || c2TotalShapActual == 0 || c3TotalShapActual == 0 || c4TotalShapActual == 0) { LOG_INFO(<< "Incorrect results, missing total shap values: " << resultsPair.second); } BOOST_REQUIRE_CLOSE(c1TotalShapActual, maths::common::CBasicStatistics::mean(c1TotalShapExpected), 1.0); BOOST_REQUIRE_CLOSE(c2TotalShapActual, maths::common::CBasicStatistics::mean(c2TotalShapExpected), 1.0); BOOST_REQUIRE_CLOSE(c3TotalShapActual, maths::common::CBasicStatistics::mean(c3TotalShapExpected), 1.0); BOOST_REQUIRE_CLOSE(c4TotalShapActual, maths::common::CBasicStatistics::mean(c4TotalShapExpected), 1.0); } BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceNoImportance, SFixture) { // Test that feature importance calculates low SHAP values if regressors have no weight. // We also add high noise variance. std::size_t topShapValues{4}; auto resultsPair{runRegression(topShapValues, {10.0, 0.0, 0.0, 0.0}, 10.0)}; // NOTE: Do not use brace initialization here as that will // result in "results" being created as a nested array on linux auto results = std::move(resultsPair.first); TMeanAccumulator cNoImportanceMean; for (const auto& result : results.as_array()) { BOOST_TEST_REQUIRE(result.is_object()); if (result.as_object().contains("row_results")) { double c1{readShapValue(result, "c1")}; json::value jv = result.at_pointer("/row_results/results/ml/target_prediction"); BOOST_TEST_REQUIRE(jv.is_double()); double prediction{jv.to_number<double>()}; // c1 explains 89-92% of the prediction value depending on platform, // i.e. the difference from the prediction is less than 11%. 
BOOST_REQUIRE_CLOSE(c1, prediction, 11.0); for (const auto& feature : {"c2", "c3", "c4"}) { double c = readShapValue(result, feature); BOOST_REQUIRE_SMALL(c, 3.5); cNoImportanceMean.add(std::fabs(c)); } } } BOOST_REQUIRE_SMALL(maths::common::CBasicStatistics::mean(cNoImportanceMean), 0.1); } BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) { // Test that feature importance works correctly for classification. In particular, test that // feature importance statistically correctly recognizes the impact of regressors if the // log-odds of the classes are generated by a linear model. Also make sure that the SHAP // values are indeed a local approximation of the predicted log-odds. std::size_t topShapValues{4}; auto resultsPair{runBinaryClassification(topShapValues, {0.5, -0.7, 0.2, -0.2})}; // NOTE: Do not use brace initialization here as that will // result in "results" being created as a nested array on linux auto results = std::move(resultsPair.first); TMeanAccumulator c1TotalShapExpected; TMeanAccumulator c2TotalShapExpected; TMeanAccumulator c3TotalShapExpected; TMeanAccumulator c4TotalShapExpected; double c1Sum{0.0}; double c2Sum{0.0}; double c3Sum{0.0}; double c4Sum{0.0}; double c1TotalShapActual[2]; double c2TotalShapActual[2]; double c3TotalShapActual[2]; double c4TotalShapActual[2]; bool hasTotalFeatureImportance{false}; double baselineFoo{readBaselineValue(results, "foo")}; double baselineBar{readBaselineValue(results, "bar")}; BOOST_TEST_REQUIRE(baselineFoo == -baselineBar); TStrVec classes{"foo", "bar"}; for (const auto& result : results.as_array()) { BOOST_TEST_REQUIRE(result.is_object()); if (result.as_object().contains("row_results")) { std::string targetPrediction{ result.at_pointer("/row_results/results/ml/target_prediction").as_string()}; json::value jv = result.at_pointer("/row_results/results/ml/prediction_probability"); BOOST_TEST_REQUIRE(jv.is_double()); double predictionProbability{jv.to_number<double>()}; for (const 
auto& className : classes) { double c1{readShapValue(result, "c1", className)}; double c2{readShapValue(result, "c2", className)}; double c3{readShapValue(result, "c3", className)}; double c4{readShapValue(result, "c4", className)}; double logOdds{std::log(predictionProbability / (1.0 - predictionProbability + 1e-10))}; logOdds = (className == targetPrediction) ? logOdds : -logOdds; double cSum{c1 + c2 + c3 + c4}; double baseline{(className == "bar") ? baselineBar : baselineFoo}; BOOST_REQUIRE_SMALL(logOdds - (baseline + cSum), 1e-3); if (className == targetPrediction) { c1Sum += std::fabs(c1); c2Sum += std::fabs(c2); c3Sum += std::fabs(c3); c4Sum += std::fabs(c4); c1TotalShapExpected.add(std::fabs(c1)); c2TotalShapExpected.add(std::fabs(c2)); c3TotalShapExpected.add(std::fabs(c3)); c4TotalShapExpected.add(std::fabs(c4)); } } } else if (result.as_object().contains("model_metadata")) { BOOST_TEST_REQUIRE(result.at("model_metadata").is_object()); if (result.as_object().at("model_metadata").as_object().contains("total_feature_importance")) { hasTotalFeatureImportance = true; } for (std::size_t i = 0; i < classes.size(); ++i) { c1TotalShapActual[i] = readTotalShapValue(result, "c1", classes[i]); c2TotalShapActual[i] = readTotalShapValue(result, "c2", classes[i]); c3TotalShapActual[i] = readTotalShapValue(result, "c3", classes[i]); c4TotalShapActual[i] = readTotalShapValue(result, "c4", classes[i]); } } } // Since the target is using the linear model // // 0.5 c1 - 0.7 c2 + 0.2 c3 - 0.2 c4 // // to generate the log odds we expect c2 > c1 > c3 \approx c4. 
LOG_DEBUG(<< "c1Sum = " << c1Sum << ", c2Sum = " << c2Sum << ", c3Sum = " << c3Sum << ", c4Sum = " << c4Sum); BOOST_TEST_REQUIRE(c2Sum > c1Sum); BOOST_TEST_REQUIRE(c1Sum > c3Sum); BOOST_TEST_REQUIRE(c1Sum > c4Sum); BOOST_TEST_REQUIRE(hasTotalFeatureImportance); for (std::size_t i = 0; i < classes.size(); ++i) { if (c1TotalShapActual[i] == 0 || c2TotalShapActual[i] == 0 || c3TotalShapActual[i] == 0 || c4TotalShapActual[i] == 0) { LOG_INFO(<< "Incorrect results, missing total shap values: " << resultsPair.second); } BOOST_REQUIRE_CLOSE(c1TotalShapActual[i], maths::common::CBasicStatistics::mean(c1TotalShapExpected), 1.0); BOOST_REQUIRE_CLOSE(c2TotalShapActual[i], maths::common::CBasicStatistics::mean(c2TotalShapExpected), 1.0); BOOST_REQUIRE_CLOSE(c3TotalShapActual[i], maths::common::CBasicStatistics::mean(c3TotalShapExpected), 1.0); BOOST_REQUIRE_CLOSE(c4TotalShapActual[i], maths::common::CBasicStatistics::mean(c4TotalShapExpected), 1.0); } } BOOST_FIXTURE_TEST_CASE(testMultiClassClassificationFeatureImportanceAllShap, SFixture) { std::size_t topShapValues{4}; auto resultsPair{runMultiClassClassification(topShapValues, {0.5, -0.7, 0.2, -0.2})}; // NOTE: Do not use brace initialization here as that will // result in "results" being created as a nested array on linux auto results = std::move(resultsPair.first); TMeanAccumulatorVec c1TotalShapExpected(3); TMeanAccumulatorVec c2TotalShapExpected(3); TMeanAccumulatorVec c3TotalShapExpected(3); TMeanAccumulatorVec c4TotalShapExpected(3); double c1TotalShapActual[3]; double c2TotalShapActual[3]; double c3TotalShapActual[3]; double c4TotalShapActual[3]; bool hasTotalFeatureImportance{false}; TStrVec classes{"foo", "bar", "baz"}; TDoubleVec baselines; baselines.reserve(3); // get baselines for (const auto& className : classes) { baselines.push_back(readBaselineValue(results, className)); } double localApproximations[3]; double classProbabilities[3]; for (const auto& result : results.as_array()) { 
BOOST_TEST_REQUIRE(result.is_object());
if (result.as_object().contains("row_results")) {
    double c1{0.0};
    double c2{0.0};
    double c3{0.0};
    double c4{0.0};
    double denominator{0.0};
    for (std::size_t i = 0; i < classes.size(); ++i) {
        // class shap values should sum(abs()) to the overall feature importance
        double c1ClassName{readShapValue(result, "c1", classes[i])};
        c1 += std::abs(c1ClassName);
        c1TotalShapExpected[i].add(std::abs(c1ClassName));
        double c2ClassName{readShapValue(result, "c2", classes[i])};
        c2 += std::abs(c2ClassName);
        c2TotalShapExpected[i].add(std::abs(c2ClassName));
        double c3ClassName{readShapValue(result, "c3", classes[i])};
        c3 += std::abs(c3ClassName);
        c3TotalShapExpected[i].add(std::abs(c3ClassName));
        double c4ClassName{readShapValue(result, "c4", classes[i])};
        c4 += std::abs(c4ClassName);
        c4TotalShapExpected[i].add(std::abs(c4ClassName));
        double classProbability{readClassProbability(result, classes[i])};
        // The local approximation for a class is its baseline plus its
        // (signed) per-feature SHAP values.
        double localApproximation{baselines[i] + c1ClassName + c2ClassName +
                                  c3ClassName + c4ClassName};
        localApproximations[i] = localApproximation;
        classProbabilities[i] = classProbability;
        // Accumulate the softmax normalizer over all classes.
        denominator += std::exp(localApproximation);
    }
    // Test that sum of feature importances is a local approximations of
    // prediction probabilities for all classes
    for (std::size_t i = 0; i < classes.size(); ++i) {
        BOOST_REQUIRE_CLOSE(classProbabilities[i],
                            std::exp(localApproximations[i]) / denominator, 1.0);
    }
    // We should have at least one feature that is important
    BOOST_TEST_REQUIRE((c1 > 0.0 || c2 > 0.0 || c3 > 0.0 || c4 > 0.0));
} else if (result.as_object().contains("model_metadata")) {
    BOOST_TEST_REQUIRE(result.at("model_metadata").is_object());
    if (result.as_object().at("model_metadata").as_object().contains("total_feature_importance")) {
        hasTotalFeatureImportance = true;
    }
    // Read back the aggregated (total) SHAP values from the model metadata.
    for (std::size_t i = 0; i < classes.size(); ++i) {
        c1TotalShapActual[i] = readTotalShapValue(result, "c1", classes[i]);
        c2TotalShapActual[i] = readTotalShapValue(result, "c2", classes[i]);
c3TotalShapActual[i] = readTotalShapValue(result, "c3", classes[i]);
c4TotalShapActual[i] = readTotalShapValue(result, "c4", classes[i]);
}
}
}
BOOST_TEST_REQUIRE(hasTotalFeatureImportance);
// The metadata totals must match the per-row, per-class means (within 1%).
for (std::size_t i = 0; i < classes.size(); ++i) {
    // Log a diagnostic dump of the raw results if any total is missing,
    // so a failing BOOST_REQUIRE_CLOSE below can be investigated.
    if (c1TotalShapActual[i] == 0 || c2TotalShapActual[i] == 0 ||
        c3TotalShapActual[i] == 0 || c4TotalShapActual[i] == 0) {
        LOG_INFO(<< "Incorrect results, missing total shap values: " << resultsPair.second);
    }
    BOOST_REQUIRE_CLOSE(
        c1TotalShapActual[i],
        maths::common::CBasicStatistics::mean(c1TotalShapExpected[i]), 1.0);
    BOOST_REQUIRE_CLOSE(
        c2TotalShapActual[i],
        maths::common::CBasicStatistics::mean(c2TotalShapExpected[i]), 1.0);
    BOOST_REQUIRE_CLOSE(
        c3TotalShapActual[i],
        maths::common::CBasicStatistics::mean(c3TotalShapExpected[i]), 1.0);
    BOOST_REQUIRE_CLOSE(
        c4TotalShapActual[i],
        maths::common::CBasicStatistics::mean(c4TotalShapExpected[i]), 1.0);
}
}

BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceNoShap, SFixture) {
    // Test that if topShapValue is set to 0, no feature importance values are returned.
    std::size_t topShapValues{0};
    auto resultsPair{runRegression(topShapValues, {50.0, 150.0, 50.0, -50.0})};
    // NOTE: Do not use brace initialization here as that will
    // result in "results" being created as a nested array on linux
    auto results = std::move(resultsPair.first);
    for (const auto& result : results.as_array()) {
        BOOST_TEST_REQUIRE(result.is_object());
        if (result.as_object().contains("row_results")) {
            BOOST_TEST_REQUIRE(result.at_pointer("/row_results/results/ml").is_object());
            // No feature importance field may be present in any row result.
            BOOST_TEST_REQUIRE(
                result.at_pointer("/row_results/results/ml")
                    .as_object()
                    .contains(api::CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME) == false);
        }
    }
}

BOOST_FIXTURE_TEST_CASE(testMissingFeatures, SFixture) {
    // Test that feature importance behaves correctly when some features are missing:
    // We randomly omit 10% of all data in a simple additive model target=c1+c2+c3+c4.
// Hence, calculated feature importances should be very similar and the bias
// should be close to 0.
std::size_t topShapValues{4};
auto results = runRegressionWithMissingFeatures(topShapValues);
TMeanVarAccumulator bias;
double c1Sum{0.0};
double c2Sum{0.0};
double c3Sum{0.0};
double c4Sum{0.0};
for (const auto& result : results.as_array()) {
    BOOST_TEST_REQUIRE(result.is_object());
    if (result.as_object().contains("row_results")) {
        // Per-row SHAP values for the four features (regression: no class name).
        double c1{readShapValue(result, "c1")};
        double c2{readShapValue(result, "c2")};
        double c3{readShapValue(result, "c3")};
        double c4{readShapValue(result, "c4")};
        json::value jv = result.at_pointer("/row_results/results/ml/target_prediction");
        BOOST_TEST_REQUIRE(jv.is_double());
        double prediction{jv.to_number<double>()};
        // The difference between the prediction and the sum of all SHAP values
        // constitutes bias.
        bias.add(prediction - (c1 + c2 + c3 + c4));
        c1Sum += std::fabs(c1);
        c2Sum += std::fabs(c2);
        c3Sum += std::fabs(c3);
        c4Sum += std::fabs(c4);
    }
}
LOG_DEBUG(<< "c1Sum = " << c1Sum << ", c2Sum = " << c2Sum
          << ", c3Sum = " << c3Sum << ", c4Sum = " << c4Sum);
BOOST_REQUIRE_CLOSE(c1Sum, c2Sum, 20.0); // c1 and c2 within 20% of each other
BOOST_REQUIRE_CLOSE(c1Sum, c3Sum, 20.0); // c1 and c3 within 20% of each other
BOOST_REQUIRE_CLOSE(c1Sum, c4Sum, 20.0); // c1 and c4 within 20% of each other
// Make sure the local approximation differs from the prediction always by the same bias
// (up to a numeric error).
BOOST_REQUIRE_SMALL(maths::common::CBasicStatistics::variance(bias), 1e-6);
}

BOOST_AUTO_TEST_SUITE_END()