lib/api/unittest/CDataFrameAnalysisRunnerTest.cc (144 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <core/CLogger.h>
#include <core/CRegex.h>

#include <api/CDataFrameAnalysisSpecification.h>
#include <api/CDataFrameAnalysisSpecificationJsonWriter.h>
#include <api/CDataFrameOutliersRunner.h>
#include <api/CMemoryUsageEstimationResultJsonWriter.h>

#include <test/CDataFrameAnalysisSpecificationFactory.h>
#include <test/CTestTmpDir.h>

#include <boost/test/unit_test.hpp>

#include <cstddef>
#include <cstdint>
#include <mutex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
BOOST_AUTO_TEST_SUITE(CDataFrameAnalysisRunnerTest)
// Pull in the ml namespace so the tests can refer to core::, api:: and test::
// entities without the ml:: prefix (presumably these are nested in ml).
using namespace ml;
// Shorthand container aliases. TStrVec is used below to collect the messages
// passed to the fatal error handler.
using TBoolVec = std::vector<bool>;
using TStrVec = std::vector<std::string>;
// Checks that the execution strategy reported by the outlier detection runner
// is self-consistent for a grid of data frame shapes (rows x columns) under a
// fixed memory limit.
BOOST_AUTO_TEST_CASE(testComputeExecutionStrategyForOutliers) {

    using TSizeVec = std::vector<std::size_t>;

    const TSizeVec numbersRows{100, 100000, 1000000};
    const TSizeVec numbersCols{3, 10, 50};

    for (auto numberRows : numbersRows) {
        for (auto numberCols : numbersCols) {
            LOG_DEBUG(<< "# rows = " << numberRows << ", # cols = " << numberCols);

            test::CDataFrameAnalysisSpecificationFactory specFactory;
            auto spec = specFactory.rows(numberRows)
                            .columns(numberCols)
                            .memoryLimit(100000000)
                            .outlierComputeInfluence(true)
                            .outlierSpec();

            api::CDataFrameOutliersRunnerFactory factory;
            auto runner = factory.make(*spec);

            // Fail the test cleanly, rather than crash on the dereferences
            // below, if no runner was created.
            BOOST_REQUIRE(runner != nullptr);

            // Query the runner once so the values which are logged are exactly
            // the values which are asserted on.
            const bool inMainMemory{runner->storeDataFrameInMainMemory()};
            const std::size_t numberPartitions{runner->numberPartitions()};
            const std::size_t maxRowsPerPartition{runner->maximumNumberRowsPerPartition()};

            LOG_DEBUG(<< " Use main memory = " << inMainMemory);
            LOG_DEBUG(<< " # partitions = " << numberPartitions);
            LOG_DEBUG(<< " # rows per partition = " << maxRowsPerPartition);

            // Check some invariants:
            //   1. strategy is in main memory iff the number of partitions is one,
            //   2. number partitions x maximum number rows >= number rows,
            //   3. (number partitions - 1) x maximum number rows <= number rows.
            BOOST_REQUIRE_EQUAL(numberPartitions == 1, inMainMemory);
            BOOST_TEST_REQUIRE(numberPartitions * maxRowsPerPartition >= numberRows);
            BOOST_TEST_REQUIRE((numberPartitions - 1) * maxRowsPerPartition <= numberRows);
        }
    }

    // TODO test running memory is in acceptable range.
}
// Checks that creating an outlier detection specification with a fixed memory
// limit reports no fatal error for several combinations of data frame size
// and the "disk usage allowed" flag.
BOOST_AUTO_TEST_CASE(testComputeAndSaveExecutionStrategyDiskUsageFlag) {

    TStrVec errors;
    std::mutex errorsMutex;
    // Record every message passed to the fatal error handler. The mutex
    // serialises concurrent invocations of the handler.
    auto errorHandler = [&errors, &errorsMutex](std::string error) {
        std::lock_guard<std::mutex> lock{errorsMutex};
        errors.push_back(std::move(error));
    };

    // Presumably installs the handler for the lifetime of this object and
    // restores the previous one on destruction - TODO confirm.
    core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler};

    // Builds the outlier spec for the supplied shape and disk usage flag and
    // requires that no error was recorded while doing so.
    auto requireNoErrors = [&errors](std::size_t numberRows, std::size_t numberCols,
                                     bool diskUsageAllowed) {
        errors.clear();
        test::CDataFrameAnalysisSpecificationFactory specFactory;
        auto spec = specFactory.rows(numberRows)
                        .columns(numberCols)
                        .memoryLimit(500000)
                        .outlierComputeInfluence(true)
                        .diskUsageAllowed(diskUsageAllowed)
                        .outlierSpec();

        // no error should be registered
        BOOST_REQUIRE_EQUAL(0, static_cast<int>(errors.size()));
    };

    // Test large memory requirement without disk usage
    requireNoErrors(1000, 100, false);

    // Test large memory requirement with disk usage
    requireNoErrors(1000, 100, true);

    // Test low memory requirement without disk usage
    requireNoErrors(10, 10, false);
}
namespace {
void testEstimateMemoryUsage(std::int64_t numberRows,
const std::string& expectedExpectedMemoryWithoutDisk,
const std::string& expectedExpectedMemoryWithDisk,
int expectedNumberErrors) {
std::ostringstream sstream;
TStrVec errors;
std::mutex errorsMutex;
auto errorHandler = [&errors, &errorsMutex](std::string error) {
std::lock_guard<std::mutex> lock{errorsMutex};
errors.push_back(error);
};
core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler};
// The output writer won't close the JSON structures until is is destroyed.
{
test::CDataFrameAnalysisSpecificationFactory specFactory;
auto spec = specFactory.rows(numberRows)
.memoryLimit(100000000)
.outlierComputeInfluence(true)
.outlierSpec();
core::CJsonOutputStreamWrapper wrappedOutStream(sstream);
api::CMemoryUsageEstimationResultJsonWriter writer(wrappedOutStream);
spec->estimateMemoryUsage(writer);
}
json::error_code ec;
json::value arrayDoc_ = json::parse(sstream.str(), ec);
BOOST_TEST_REQUIRE(ec.failed() == false);
BOOST_TEST_REQUIRE(arrayDoc_.is_array());
const json::array& arrayDoc = arrayDoc_.as_array();
BOOST_REQUIRE_EQUAL(1, arrayDoc.size());
const json::value& result_{arrayDoc[0]};
BOOST_TEST_REQUIRE(result_.is_object());
const json::object& result = result_.as_object();
BOOST_TEST_REQUIRE(result.contains("expected_memory_without_disk"));
BOOST_REQUIRE_EQUAL(expectedExpectedMemoryWithoutDisk,
result.at("expected_memory_without_disk").as_string());
BOOST_TEST_REQUIRE(result.contains("expected_memory_with_disk"));
BOOST_REQUIRE_EQUAL(expectedExpectedMemoryWithDisk,
result.at("expected_memory_with_disk").as_string());
BOOST_REQUIRE_EQUAL(expectedNumberErrors, static_cast<int>(errors.size()));
}
}
// 0 rows: both estimates are required to be "0mb" and exactly one fatal error
// is required to have been reported.
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor0Rows) {
    const std::int64_t numberRows{0};
    const std::string expectedWithoutDisk{"0mb"};
    const std::string expectedWithDisk{"0mb"};
    const int expectedNumberErrors{1};
    testEstimateMemoryUsage(numberRows, expectedWithoutDisk, expectedWithDisk,
                            expectedNumberErrors);
}
// 1 row: both estimates are required to be "1mb" and no fatal error may have
// been reported.
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor1Row) {
    const std::int64_t numberRows{1};
    const std::string expectedWithoutDisk{"1mb"};
    const std::string expectedWithDisk{"1mb"};
    const int expectedNumberErrors{0};
    testEstimateMemoryUsage(numberRows, expectedWithoutDisk, expectedWithDisk,
                            expectedNumberErrors);
}
// 10000 rows: the estimates are required to be "5mb" without disk and "2mb"
// with disk, and no fatal error may have been reported.
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000Rows) {
    const std::int64_t numberRows{10000};
    const std::string expectedWithoutDisk{"5mb"};
    const std::string expectedWithDisk{"2mb"};
    const int expectedNumberErrors{0};
    testEstimateMemoryUsage(numberRows, expectedWithoutDisk, expectedWithDisk,
                            expectedNumberErrors);
}
// 100000 rows: the estimates are required to be "48mb" without disk and
// "12mb" with disk, and no fatal error may have been reported.
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor100000Rows) {
    const std::int64_t numberRows{100000};
    const std::string expectedWithoutDisk{"48mb"};
    const std::string expectedWithDisk{"12mb"};
    const int expectedNumberErrors{0};
    testEstimateMemoryUsage(numberRows, expectedWithoutDisk, expectedWithDisk,
                            expectedNumberErrors);
}
// 10000000 rows: the estimates are required to be "6440mb" without disk and
// "147mb" with disk, and no fatal error may have been reported.
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsageFor10000000Rows) {
    const std::int64_t numberRows{10000000};
    const std::string expectedWithoutDisk{"6440mb"};
    const std::string expectedWithDisk{"147mb"};
    const int expectedNumberErrors{0};
    testEstimateMemoryUsage(numberRows, expectedWithoutDisk, expectedWithDisk,
                            expectedNumberErrors);
}
BOOST_AUTO_TEST_SUITE_END()