lib/api/unittest/CDataFrameAnalysisRunnerTest.cc (144 lines of code) (raw):

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#include <core/CLogger.h>
#include <core/CRegex.h>

#include <api/CDataFrameAnalysisSpecification.h>
#include <api/CDataFrameAnalysisSpecificationJsonWriter.h>
#include <api/CDataFrameOutliersRunner.h>
#include <api/CMemoryUsageEstimationResultJsonWriter.h>

#include <test/CDataFrameAnalysisSpecificationFactory.h>
#include <test/CTestTmpDir.h>

#include <boost/test/unit_test.hpp>

#include <cstddef>
#include <cstdint>
#include <mutex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisRunnerTest)

using namespace ml;

using TBoolVec = std::vector<bool>;
using TStrVec = std::vector<std::string>;

// Creates an outlier detection runner for every combination of the row and
// column counts below and checks that the execution strategy it reports
// (in main memory or not, number of partitions, maximum rows per partition)
// is self-consistent.
BOOST_AUTO_TEST_CASE(testComputeExecutionStrategyForOutliers) {

    using TSizeVec = std::vector<std::size_t>;

    TSizeVec numbersRows{100, 100000, 1000000};
    TSizeVec numbersCols{3, 10, 50};

    for (auto numberRows : numbersRows) {
        for (auto numberCols : numbersCols) {
            LOG_DEBUG(<< "# rows = " << numberRows << ", # cols = " << numberCols);

            test::CDataFrameAnalysisSpecificationFactory specFactory;
            auto spec = specFactory.rows(numberRows)
                            .columns(numberCols)
                            .memoryLimit(100000000)
                            .outlierComputeInfluence(true)
                            .outlierSpec();

            api::CDataFrameOutliersRunnerFactory factory;
            auto runner = factory.make(*spec);

            LOG_DEBUG(<< "  Use main memory = " << runner->storeDataFrameInMainMemory());
            LOG_DEBUG(<< "  # partitions = " << runner->numberPartitions());
            LOG_DEBUG(<< "  # rows per partition = "
                      << runner->maximumNumberRowsPerPartition());

            // Check some invariants:
            //   1. strategy is in main memory iff the number of partitions is one,
            //   2. number partitions x maximum number rows >= number rows,
            //   3. (number partitions - 1) x maximum number rows <= number rows.

            bool inMainMemory{runner->storeDataFrameInMainMemory()};
            std::size_t numberPartitions{runner->numberPartitions()};
            std::size_t maxRowsPerPartition{runner->maximumNumberRowsPerPartition()};

            BOOST_REQUIRE_EQUAL(numberPartitions == 1, inMainMemory);
            BOOST_TEST_REQUIRE(numberPartitions * maxRowsPerPartition >= numberRows);
            BOOST_TEST_REQUIRE((numberPartitions - 1) * maxRowsPerPartition <= numberRows);
        }
    }

    // TODO test running memory is in acceptable range.
}

// Builds outlier specifications with different sizes and disk usage settings
// and requires that no error reaches the handler installed through
// core::CLogger::CScopeSetFatalErrorHandler while each one is built.
BOOST_AUTO_TEST_CASE(testComputeAndSaveExecutionStrategyDiskUsageFlag) {

    TStrVec errors;
    std::mutex errorsMutex;
    // The handler locks before appending to the shared error list.
    auto errorHandler = [&errors, &errorsMutex](std::string error) {
        std::lock_guard<std::mutex> lock{errorsMutex};
        errors.push_back(std::move(error));
    };

    core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler};

    // In each scenario below, building the specification is the only action
    // taken between clearing the captured errors and checking them.

    // Test large memory requirement without disk usage
    {
        errors.clear();
        test::CDataFrameAnalysisSpecificationFactory specFactory;
        auto spec = specFactory.rows(1000)
                        .columns(100)
                        .memoryLimit(500000)
                        .outlierComputeInfluence(true)
                        .diskUsageAllowed(false)
                        .outlierSpec();

        // no error should be registered
        BOOST_REQUIRE_EQUAL(0, static_cast<int>(errors.size()));
    }

    // Test large memory requirement with disk usage
    {
        errors.clear();
        test::CDataFrameAnalysisSpecificationFactory specFactory;
        auto spec = specFactory.rows(1000)
                        .columns(100)
                        .memoryLimit(500000)
                        .outlierComputeInfluence(true)
                        .diskUsageAllowed(true)
                        .outlierSpec();

        // no error should be registered
        BOOST_REQUIRE_EQUAL(0, static_cast<int>(errors.size()));
    }

    // Test low memory requirement without disk usage
    {
        errors.clear();
        test::CDataFrameAnalysisSpecificationFactory specFactory;
        auto spec = specFactory.rows(10)
                        .columns(10)
                        .memoryLimit(500000)
                        .outlierComputeInfluence(true)
                        .diskUsageAllowed(false)
                        .outlierSpec();

        // no error should be registered
        BOOST_REQUIRE_EQUAL(0, static_cast<int>(errors.size()));
    }
}

namespace {
// Estimates the memory usage of an outlier specification with numberRows rows,
// parses the JSON the estimate writer produced and checks it.
//
// numberRows: number of rows passed to the specification factory.
// expectedExpectedMemoryWithoutDisk: required value of the
//     "expected_memory_without_disk" field of the single result object.
// expectedExpectedMemoryWithDisk: required value of the
//     "expected_memory_with_disk" field of the single result object.
// expectedNumberErrors: required number of errors captured by the handler
//     installed through core::CLogger::CScopeSetFatalErrorHandler.
void testEstimateMemoryUsage(std::int64_t numberRows,
                             const std::string& expectedExpectedMemoryWithoutDisk,
                             const std::string& expectedExpectedMemoryWithDisk,
                             int expectedNumberErrors) {

    std::ostringstream sstream;
    TStrVec errors;
    std::mutex errorsMutex;
    // The handler locks before appending to the shared error list.
    auto errorHandler = [&errors, &errorsMutex](std::string error) {
        std::lock_guard<std::mutex> lock{errorsMutex};
        errors.push_back(std::move(error));
    };

    core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler};

    // The output writer won't close the JSON structures until it is destroyed.
    {
        test::CDataFrameAnalysisSpecificationFactory specFactory;
        auto spec = specFactory.rows(numberRows)
                        .memoryLimit(100000000)
                        .outlierComputeInfluence(true)
                        .outlierSpec();

        core::CJsonOutputStreamWrapper wrappedOutStream(sstream);
        api::CMemoryUsageEstimationResultJsonWriter writer(wrappedOutStream);

        spec->estimateMemoryUsage(writer);
    }

    json::error_code ec;
    json::value arrayDoc_ = json::parse(sstream.str(), ec);
    BOOST_TEST_REQUIRE(ec.failed() == false);
    BOOST_TEST_REQUIRE(arrayDoc_.is_array());

    // The output must be an array holding exactly one result object. Compare
    // against an unsigned literal to avoid a signed/unsigned comparison.
    const json::array& arrayDoc = arrayDoc_.as_array();
    BOOST_REQUIRE_EQUAL(std::size_t{1}, arrayDoc.size());

    const json::value& result_{arrayDoc[0]};
    BOOST_TEST_REQUIRE(result_.is_object());
    const json::object& result = result_.as_object();

    BOOST_TEST_REQUIRE(result.contains("expected_memory_without_disk"));
    BOOST_REQUIRE_EQUAL(expectedExpectedMemoryWithoutDisk,
                        result.at("expected_memory_without_disk").as_string());
    BOOST_TEST_REQUIRE(result.contains("expected_memory_with_disk"));
    BOOST_REQUIRE_EQUAL(expectedExpectedMemoryWithDisk,
                        result.at("expected_memory_with_disk").as_string());

    BOOST_REQUIRE_EQUAL(expectedNumberErrors, static_cast<int>(errors.size()));
}
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor0Rows) {
    testEstimateMemoryUsage(0, "0mb", "0mb", 1);
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1Row) {
    testEstimateMemoryUsage(1, "1mb", "1mb", 0);
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000Rows) {
    testEstimateMemoryUsage(10000, "5mb", "2mb", 0);
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) {
    testEstimateMemoryUsage(100000, "48mb", "12mb", 0);
}

BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000000Rows) {
    testEstimateMemoryUsage(10000000, "6440mb", "147mb", 0);
}

BOOST_AUTO_TEST_SUITE_END()