// lib/maths/analytics/unittest/CDataFrameUtilsTest.cc
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <core/CContainerPrinter.h>
#include <core/CPackedBitVector.h>
#include <maths/analytics/CDataFrameCategoryEncoder.h>
#include <maths/analytics/CDataFrameUtils.h>
#include <maths/analytics/CMic.h>
#include <maths/common/CBasicStatistics.h>
#include <maths/common/CLinearAlgebraEigen.h>
#include <maths/common/COrderings.h>
#include <maths/common/CPRNG.h>
#include <maths/common/CQuantileSketch.h>
#include <maths/common/CTools.h>
#include <maths/common/CToolsDetail.h>
#include <test/BoostTestCloseAbsolute.h>
#include <test/CRandomNumbers.h>
#include <test/CTestTmpDir.h>
#include <boost/test/unit_test.hpp>
#include <boost/unordered_map.hpp>
#include <functional>
#include <limits>
#include <numeric>
#include <vector>
BOOST_AUTO_TEST_SUITE(CDataFrameUtilsTest)
using namespace ml;
namespace {
using TBoolVec = std::vector<bool>;
using TDoubleVec = std::vector<double>;
using TDoubleVecVec = std::vector<TDoubleVec>;
using TSizeVec = std::vector<std::size_t>;
using TDoubleDoubleUMap = boost::unordered_map<double, double>;
using TFactoryFunc = std::function<std::unique_ptr<core::CDataFrame>()>;
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanAccumulatorVec = std::vector<TMeanAccumulator>;
using TMeanAccumulatorVecVec = std::vector<TMeanAccumulatorVec>;
using TMeanVarAccumulator = maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TMeanVarAccumulatorVec = std::vector<TMeanVarAccumulator>;
using TQuantileSketchVec = std::vector<maths::common::CFastQuantileSketch>;
//! Generate \p cols columns of \p rows categorical values whose per-column
//! category frequencies are Dirichlet samples around \p expectedFrequencies.
//! Returns the sampled frequencies and the shuffled per-column values.
auto generateCategoricalData(test::CRandomNumbers& rng,
                             std::size_t rows,
                             std::size_t cols,
                             TDoubleVec expectedFrequencies) {

    TDoubleVecVec frequencies;
    rng.generateDirichletSamples(expectedFrequencies, cols, frequencies);

    TDoubleVecVec values(cols);
    for (std::size_t column = 0; column < frequencies.size(); ++column) {
        // Emit round(rows * frequency) copies of each category label.
        for (std::size_t category = 0; category < frequencies[column].size(); ++category) {
            std::size_t count{static_cast<std::size_t>(
                static_cast<double>(rows) * frequencies[column][category] + 0.5)};
            values[column].resize(values[column].size() + count,
                                  static_cast<double>(category));
        }
        // Rounding may leave the column short of rows: pad with the last category.
        values[column].resize(rows, values[column].back());
        rng.random_shuffle(values[column].begin(), values[column].end());
        rng.discard(1000000); // Make sure the categories are not correlated
    }

    return std::make_pair(frequencies, values);
}
//! Build a row mask selecting all \p rows rows.
core::CPackedBitVector maskAll(std::size_t rows) {
    return core::CPackedBitVector{rows, true};
}
//! Select a random subset of distinct rows in [0, numberRows) and encode it
//! as a packed bit vector of length \p numberRows.
core::CPackedBitVector generateRandomRowMask(test::CRandomNumbers& rng, std::size_t numberRows) {

    // Draw a random number of samples (with repeats) then deduplicate, so the
    // resulting mask density varies from trial to trial.
    TSizeVec numberSamples;
    rng.generateUniformSamples(numberRows / 2, 3 * numberRows / 2, 1, numberSamples);
    TSizeVec rows;
    rng.generateUniformSamples(0, numberRows, numberSamples[0], rows);
    std::sort(rows.begin(), rows.end());
    rows.erase(std::unique(rows.begin(), rows.end()), rows.end());

    // Write runs of zeros between the selected (sorted, distinct) rows.
    core::CPackedBitVector result;
    for (auto row : rows) {
        result.extend(false, row - result.size());
        result.extend(true);
    }
    result.extend(false, numberRows - result.size());

    return result;
}
}
BOOST_AUTO_TEST_CASE(testColumnDataTypes) {

    // Check that columnDataTypes reports, per column, whether the values are
    // integral together with the observed minimum and maximum, for both disk
    // and in-memory frames and for one and two threads. Also round trip the
    // type summaries through their string representation.

    test::CRandomNumbers rng;

    std::size_t rows{2000};
    std::size_t cols{4};

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols).first; }};

    TSizeVec columnMask(cols);
    std::iota(columnMask.begin(), columnMask.end(), 0);

    core::stopDefaultAsyncExecutor();

    for (auto threads : {1, 2}) {
        for (const auto& factory : {makeOnDisk, makeMainMemory}) {
            auto frame = factory();

            double min{0.0};
            double max{10.0};
            // Seed the expected types with min/max swapped so the first
            // written value initializes them; column 0 is floored and so
            // should be detected as integral.
            maths::analytics::CDataFrameUtils::TDataTypeVec expectedTypes{
                {true, max, min}, {false, max, min}, {false, max, min}, {false, max, min}};

            for (std::size_t i = 0; i < rows; ++i) {
                frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                    TDoubleVec values;
                    rng.generateUniformSamples(min, max, cols, values);
                    *(column++) = std::floor(values[0]);
                    expectedTypes[0].s_Min =
                        std::min(expectedTypes[0].s_Min, std::floor(values[0]));
                    expectedTypes[0].s_Max =
                        std::max(expectedTypes[0].s_Max, std::floor(values[0]));
                    for (std::size_t j = 1; j < cols; ++j, ++column) {
                        *column = values[j];
                        // Track the expected extrema at CFloatStorage precision
                        // since that is how the data frame stores values.
                        expectedTypes[j].s_Min = std::min(
                            maths::common::CFloatStorage{expectedTypes[j].s_Min},
                            maths::common::CFloatStorage{values[j]});
                        expectedTypes[j].s_Max = std::max(
                            maths::common::CFloatStorage{expectedTypes[j].s_Max},
                            maths::common::CFloatStorage{values[j]});
                    }
                });
            }
            frame->finishWritingRows();

            maths::analytics::CDataFrameUtils::TDataTypeVec actualTypes(
                maths::analytics::CDataFrameUtils::columnDataTypes(
                    threads, *frame, maskAll(rows), columnMask));

            // Round trip the expected types to a string to check persistence.

            maths::analytics::CDataFrameUtils::TDataTypeVec restoredTypes;
            std::string delimitedCollection{core::CPersistUtils::toString(
                expectedTypes,
                [](const auto& type) { return type.toDelimited(); },
                maths::analytics::CDataFrameUtils::SDataType::EXTERNAL_DELIMITER)};
            LOG_DEBUG(<< "delimited = " << delimitedCollection);
            BOOST_TEST_REQUIRE(core::CPersistUtils::fromString(
                delimitedCollection,
                [](const std::string& delimited, auto& type) {
                    return type.fromDelimited(delimited);
                },
                restoredTypes, maths::analytics::CDataFrameUtils::SDataType::EXTERNAL_DELIMITER));

            // Both the computed and the restored types should match the
            // expected ones to within floating point rounding.
            BOOST_REQUIRE_EQUAL(expectedTypes.size(), actualTypes.size());
            for (std::size_t i = 0; i < expectedTypes.size(); ++i) {
                double eps{100.0 * std::numeric_limits<double>::epsilon()};
                BOOST_REQUIRE_EQUAL(expectedTypes[i].s_IsInteger, actualTypes[i].s_IsInteger);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedTypes[i].s_Min,
                                             actualTypes[i].s_Min,
                                             eps * expectedTypes[i].s_Min);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedTypes[i].s_Max,
                                             actualTypes[i].s_Max,
                                             eps * expectedTypes[i].s_Max);
                BOOST_REQUIRE_EQUAL(expectedTypes[i].s_IsInteger,
                                    restoredTypes[i].s_IsInteger);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedTypes[i].s_Min,
                                             restoredTypes[i].s_Min,
                                             eps * expectedTypes[i].s_Min);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedTypes[i].s_Max,
                                             restoredTypes[i].s_Max,
                                             eps * expectedTypes[i].s_Max);
            }
        }

        core::startDefaultAsyncExecutor();
    }

    core::stopDefaultAsyncExecutor();
}
BOOST_AUTO_TEST_CASE(testStandardizeColumns) {

    // Check that after standardizeColumns every value equals (x - mean) / sd
    // of the raw column and that each column ends up with zero mean and unit
    // variance, for disk and in-memory frames and one and four threads.

    test::CRandomNumbers rng;

    std::size_t rows{2000};
    std::size_t cols{4};
    std::size_t capacity{500};

    TDoubleVecVec values(4);
    TMeanVarAccumulatorVec moments(4);
    {
        std::size_t i = 0;
        // Four uniform columns with different offsets and scales.
        for (auto a : {-10.0, 0.0}) {
            for (auto b : {5.0, 30.0}) {
                rng.generateUniformSamples(a, b, rows, values[i++]);
            }
        }
        for (i = 0; i < cols; ++i) {
            moments[i].add(values[i]);
        }
    }

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};

    core::stopDefaultAsyncExecutor();

    for (auto threads : {1, 4}) {
        for (const auto& factory : {makeOnDisk, makeMainMemory}) {
            auto frame = factory();
            for (std::size_t i = 0; i < rows; ++i) {
                frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column,
                                                   std::int32_t&) {
                    for (std::size_t j = 0; j < cols; ++j, ++column) {
                        *column = values[j][i];
                    }
                });
            }
            frame->finishWritingRows();

            BOOST_TEST_REQUIRE(maths::analytics::CDataFrameUtils::standardizeColumns(
                threads, *frame));

            // Check the column values are what we expect given the data we generated.

            bool passed{true};
            frame->readRows(1, [&](const core::CDataFrame::TRowItr& beginRows,
                                   const core::CDataFrame::TRowItr& endRows) {
                for (auto row = beginRows; row != endRows; ++row) {
                    for (std::size_t j = 0; j < row->numberColumns(); ++j) {
                        double mean{maths::common::CBasicStatistics::mean(moments[j])};
                        double sd{std::sqrt(
                            maths::common::CBasicStatistics::variance(moments[j]))};
                        double expected{(values[j][row->index()] - mean) / sd};
                        if (std::fabs((*row)[j] - expected) > 1e-6) {
                            LOG_ERROR(<< "Expected " << expected << " got " << (*row)[j]);
                            passed = false;
                        }
                    }
                }
            });

            BOOST_TEST_REQUIRE(passed);

            // Check that the mean and variance of the columns are zero and one,
            // respectively.

            TMeanVarAccumulatorVec columnsMoments(cols);
            frame->readRows(1, [&](const core::CDataFrame::TRowItr& beginRows,
                                   const core::CDataFrame::TRowItr& endRows) {
                for (auto row = beginRows; row != endRows; ++row) {
                    for (std::size_t j = 0; j < row->numberColumns(); ++j) {
                        columnsMoments[j].add((*row)[j]);
                    }
                }
            });

            for (const auto& columnMoments : columnsMoments) {
                double mean{maths::common::CBasicStatistics::mean(columnMoments)};
                double variance{maths::common::CBasicStatistics::variance(columnMoments)};
                LOG_DEBUG(<< "mean = " << mean << ", variance = " << variance);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, mean, 1e-6);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(1.0, variance, 1e-6);
            }
        }

        core::startDefaultAsyncExecutor();
    }

    core::stopDefaultAsyncExecutor();
}
BOOST_AUTO_TEST_CASE(testColumnQuantiles) {

    // Check that columnQuantiles' sketches agree, to within sketch accuracy,
    // with quantile sketches built directly from the raw column values, for
    // disk and in-memory frames and for one and four threads.

    test::CRandomNumbers rng;

    std::size_t rows{2000};
    std::size_t cols{4};
    std::size_t capacity{500};

    TDoubleVecVec values(4);
    // The reference sketches are exact since they are sized to hold all rows.
    TQuantileSketchVec expectedQuantiles(4, maths::common::CFastQuantileSketch{rows});
    {
        std::size_t i = 0;
        // Four uniform columns with different offsets and scales.
        for (auto a : {-10.0, 0.0}) {
            for (auto b : {5.0, 30.0}) {
                rng.generateUniformSamples(a, b, rows, values[i++]);
            }
        }
        for (i = 0; i < cols; ++i) {
            for (auto x : values[i]) {
                expectedQuantiles[i].add(x);
            }
        }
    }

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};

    TSizeVec columnMask(cols);
    std::iota(columnMask.begin(), columnMask.end(), 0);

    core::stopDefaultAsyncExecutor();

    for (auto threads : {1, 4}) {
        for (const auto& factory : {makeOnDisk, makeMainMemory}) {
            auto frame = factory();
            for (std::size_t i = 0; i < rows; ++i) {
                frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column,
                                                   std::int32_t&) {
                    for (std::size_t j = 0; j < cols; ++j, ++column) {
                        *column = values[j][i];
                    }
                });
            }
            frame->finishWritingRows();

            auto[actualQuantiles, successful] = maths::analytics::CDataFrameUtils::columnQuantiles(
                threads, *frame, maskAll(rows), columnMask,
                maths::common::CFastQuantileSketch{100});
            BOOST_TEST_REQUIRE(successful);

            // Check the quantile sketches match.

            TMeanAccumulatorVec columnsMae(4);
            // Compare every 5th percentile; the computed sketch is lossy so
            // allow a small absolute error and also bound the mean absolute
            // error per column and overall.
            for (std::size_t i = 5; i < 100; i += 5) {
                for (std::size_t feature = 0; feature < columnMask.size(); ++feature) {
                    double x{static_cast<double>(i)};
                    double qa;
                    double qe;
                    BOOST_TEST_REQUIRE(expectedQuantiles[feature].quantile(x, qe));
                    BOOST_TEST_REQUIRE(actualQuantiles[feature].quantile(x, qa));
                    BOOST_REQUIRE_CLOSE_ABSOLUTE(
                        qe, qa, 0.08 * std::max(std::fabs(qe), 2.5));
                    columnsMae[feature].add(std::fabs(qa - qe));
                }
            }
            TMeanAccumulator mae;
            for (auto& columnMae : columnsMae) {
                LOG_DEBUG(<< "Column MAE = "
                          << maths::common::CBasicStatistics::mean(columnMae));
                BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(columnMae) < 0.07);
                mae += columnMae;
            }
            LOG_DEBUG(<< "MAE = " << maths::common::CBasicStatistics::mean(mae));
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(mae) < 0.04);
        }

        core::startDefaultAsyncExecutor();
    }

    core::stopDefaultAsyncExecutor();
}
BOOST_AUTO_TEST_CASE(testColumnQuantilesWithEncoding) {

    // Check that columnQuantiles computed through a category encoder exactly
    // match sketches built by encoding each row by hand.

    test::CRandomNumbers rng;

    std::size_t rows{5000};
    std::size_t cols{6};
    std::size_t capacity{500};

    // Column 0 is the regression target. Of the five features, the first and
    // the last are categorical (floored uniforms) and the rest are numeric.
    TDoubleVecVec features(cols - 1);
    rng.generateUniformSamples(0.96, 5.01, rows, features[0]);
    std::for_each(features[0].begin(), features[0].end(),
                  [](double& category) { category = std::floor(category); });
    for (std::size_t i = 1; i + 1 < features.size(); ++i) {
        rng.generateNormalSamples(0.0, 9.0, rows, features[i]);
    }
    rng.generateUniformSamples(0.97, 5.03, rows, features[cols - 2]);
    std::for_each(features[cols - 2].begin(), features[cols - 2].end(),
                  [](double& category) { category = std::floor(category); });

    // The target is a random positive linear combination of the features.
    TDoubleVec weights;
    rng.generateUniformSamples(1.0, 10.0, cols - 1, weights);
    auto target = [&weights](const TDoubleVec& rowFeatures) {
        double result{0.0};
        for (std::size_t i = 0; i < weights.size(); ++i) {
            result += weights[i] * rowFeatures[i];
        }
        return result;
    };

    auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
    frame->categoricalColumns(TBoolVec{false, true, false, false, false, true});
    for (std::size_t i = 0; i < rows; ++i) {
        // rowFeatures is an init-captured scratch buffer reused across calls.
        frame->writeRow([&features, target, i, rowFeatures = TDoubleVec{} ](
            core::CDataFrame::TFloatVecItr column, std::int32_t&) mutable {
            rowFeatures.resize(features.size());
            for (std::size_t j = 0; j < features.size(); ++j) {
                rowFeatures[j] = features[j][i];
            }
            *column++ = target(rowFeatures);
            for (std::size_t j = 0; j < rowFeatures.size(); ++j, ++column) {
                *column = rowFeatures[j];
            }
        });
    }
    frame->finishWritingRows();

    // Train an encoder against the target in column 0 and sketch quantiles
    // of every encoded column.
    maths::analytics::CDataFrameCategoryEncoder encoder{{1, *frame, 0}};

    TSizeVec columnMask(encoder.numberEncodedColumns());
    std::iota(columnMask.begin(), columnMask.end(), 0);

    // Build the reference sketches by encoding each row explicitly.
    TQuantileSketchVec expectedQuantiles{columnMask.size(),
                                         maths::common::CFastQuantileSketch{100}};
    frame->readRows(1, [&](const core::CDataFrame::TRowItr& beginRows,
                           const core::CDataFrame::TRowItr& endRows) {
        for (auto row = beginRows; row != endRows; ++row) {
            maths::analytics::CEncodedDataFrameRowRef encodedRow{encoder.encode(*row)};
            for (std::size_t i = 0; i < columnMask.size(); ++i) {
                expectedQuantiles[i].add(encodedRow[columnMask[i]]);
            }
        }
    });

    auto[actualQuantiles, successful] = maths::analytics::CDataFrameUtils::columnQuantiles(
        1, *frame, maskAll(rows), columnMask,
        maths::common::CFastQuantileSketch{100}, &encoder);
    BOOST_TEST_REQUIRE(successful);

    // The quantiles should match exactly at every checked percentile.
    for (std::size_t i = 5; i < 100; i += 5) {
        for (std::size_t feature = 0; feature < columnMask.size(); ++feature) {
            double x{static_cast<double>(i)};
            double qa;
            double qe;
            BOOST_TEST_REQUIRE(expectedQuantiles[feature].quantile(x, qe));
            BOOST_TEST_REQUIRE(actualQuantiles[feature].quantile(x, qa));
            BOOST_REQUIRE_EQUAL(qe, qa);
        }
    }
}
BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasks) {

    // Check some invariants of the test and train masks:
    //   1) The folds are approximately the same size,
    //   2) The test masks are disjoint for each fold,
    //   3) The train and test masks are disjoint for a given fold,
    //   4) They're all subsets of the initial mask supplied,
    //   5) The number of examples in each category per fold is proportional
    //      to their overall frequency.
    //   6) Test we get the correct size masks if we are using more or less
    //      training data than implied by k-fold cross-validation.

    test::CRandomNumbers testRng;
    maths::common::CPRNG::CXorOShiro128Plus rng;

    std::size_t numberRows{2000};
    std::size_t numberCols{1};
    std::size_t numberBins{10};

    // First scenario: a single categorical target column. Checks invariants
    // 1) - 5) over ten random trials.
    for (std::size_t trial = 0; trial < 10; ++trial) {
        TDoubleVec categories;
        testRng.generateNormalSamples(0.0, 3.0, numberRows, categories);

        TSizeVec numberFolds;
        testRng.generateUniformSamples(2, 6, 1, numberFolds);

        auto frame = core::makeMainStorageDataFrame(numberCols).first;
        frame->categoricalColumns(TBoolVec{true});
        for (std::size_t i = 0; i < numberRows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                *column = std::floor(std::fabs(categories[i]));
            });
        }
        frame->finishWritingRows();

        core::CPackedBitVector allTrainingRowMask{generateRandomRowMask(testRng, numberRows)};

        // Tally the per-category counts over the rows available for training.
        TDoubleDoubleUMap categoryCounts;
        for (auto i = allTrainingRowMask.beginOneBits();
             i != allTrainingRowMask.endOneBits(); ++i) {
            categoryCounts[std::floor(std::fabs(categories[*i]))] += 1.0;
        }

        maths::analytics::CDataFrameUtils::TPackedBitVectorVec trainingRowMasks;
        maths::analytics::CDataFrameUtils::TPackedBitVectorVec testingRowMasks;
        std::tie(trainingRowMasks, testingRowMasks, std::ignore) =
            maths::analytics::CDataFrameUtils::stratifiedCrossValidationRowMasks(
                1, *frame, 0, rng, numberFolds[0],
                1.0 - 1.0 / static_cast<double>(numberFolds[0]), numberBins,
                allTrainingRowMask);

        BOOST_REQUIRE_EQUAL(numberFolds[0], trainingRowMasks.size());
        BOOST_REQUIRE_EQUAL(numberFolds[0], testingRowMasks.size());

        core::CPackedBitVector allTestingRowMask{numberRows, false};
        for (std::size_t fold = 0; fold < numberFolds[0]; ++fold) {
            // Count should be very nearly the expected value.
            double expectedTestRowCount{allTrainingRowMask.manhattan() /
                                        static_cast<double>(numberFolds[0])};
            BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedTestRowCount,
                                         testingRowMasks[fold].manhattan(), 10.0);
            // Disjointness and subset invariants expressed via inner products:
            // a zero inner product means disjoint, an inner product equal to
            // the mask's own count means subset.
            BOOST_REQUIRE_EQUAL(0.0, testingRowMasks[fold].inner(allTestingRowMask));
            BOOST_REQUIRE_EQUAL(0.0, trainingRowMasks[fold].inner(testingRowMasks[fold]));
            BOOST_REQUIRE_EQUAL(trainingRowMasks[fold].manhattan(),
                                trainingRowMasks[fold].inner(allTrainingRowMask));
            BOOST_REQUIRE_EQUAL(testingRowMasks[fold].manhattan(),
                                testingRowMasks[fold].inner(allTrainingRowMask));
            allTestingRowMask |= testingRowMasks[fold];

            // Each fold's test set should contain close to 1 / numberFolds of
            // each category's examples.
            TDoubleDoubleUMap testingCategoryCounts;
            frame->readRows(1, 0, frame->numberRows(),
                            [&](const core::CDataFrame::TRowItr& beginRows,
                                const core::CDataFrame::TRowItr& endRows) {
                                for (auto row = beginRows; row != endRows; ++row) {
                                    testingCategoryCounts[(*row)[0]] += 1.0;
                                }
                            },
                            &testingRowMasks[fold]);
            for (const auto& count : categoryCounts) {
                BOOST_REQUIRE_CLOSE_ABSOLUTE(
                    count.second / static_cast<double>(numberFolds[0]),
                    testingCategoryCounts[count.first], 5.0);
            }
        }
    }

    // Second scenario: a numeric target column. Checks the fold test sets
    // have similar target distributions via the variance of their deciles.
    for (std::size_t trial = 0; trial < 10; ++trial) {
        TDoubleVec value;
        testRng.generateNormalSamples(0.0, 3.0, numberRows, value);

        TSizeVec numberFolds;
        testRng.generateUniformSamples(2, 6, 1, numberFolds);

        auto frame = core::makeMainStorageDataFrame(numberCols).first;
        frame->categoricalColumns(TBoolVec{false});
        for (std::size_t i = 0; i < numberRows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column,
                                std::int32_t&) { *column = value[i]; });
        }
        frame->finishWritingRows();

        core::CPackedBitVector allTrainingRowMask{generateRandomRowMask(testRng, numberRows)};

        maths::analytics::CDataFrameUtils::TPackedBitVectorVec testingRowMasks;
        std::tie(std::ignore, testingRowMasks, std::ignore) =
            maths::analytics::CDataFrameUtils::stratifiedCrossValidationRowMasks(
                1, *frame, 0, rng, numberFolds[0],
                1.0 - 1.0 / static_cast<double>(numberFolds[0]), numberBins,
                allTrainingRowMask);

        TDoubleVecVec targetDecile(numberFolds[0], TDoubleVec(numberBins));
        core::CPackedBitVector allTestingRowMask{numberRows, false};
        for (std::size_t fold = 0; fold < numberFolds[0]; ++fold) {
            // Count should be very nearly the expected value.
            double expectedTestRowCount{allTrainingRowMask.manhattan() /
                                        static_cast<double>(numberFolds[0])};
            BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedTestRowCount,
                                         testingRowMasks[fold].manhattan(), 10.0);
            BOOST_REQUIRE_EQUAL(0.0, testingRowMasks[fold].inner(allTestingRowMask));
            BOOST_REQUIRE_EQUAL(testingRowMasks[fold].manhattan(),
                                testingRowMasks[fold].inner(allTrainingRowMask));
            allTestingRowMask |= testingRowMasks[fold];

            // Compute this fold's empirical target deciles.
            TDoubleVec values;
            frame->readRows(1, 0, frame->numberRows(),
                            [&](const core::CDataFrame::TRowItr& beginRows,
                                const core::CDataFrame::TRowItr& endRows) {
                                for (auto row = beginRows; row != endRows; ++row) {
                                    values.push_back((*row)[0]);
                                }
                            },
                            &testingRowMasks[fold]);
            std::sort(values.begin(), values.end());
            for (std::size_t i = 1; i < numberBins; ++i) {
                targetDecile[fold][i] = values[(i * values.size()) / numberBins];
            }
        }

        // Stratification should make each decile nearly constant across folds.
        for (std::size_t i = 1; i < numberBins; ++i) {
            TMeanVarAccumulator testTargetDecileMoments;
            for (std::size_t fold = 0; fold < numberFolds[0]; ++fold) {
                testTargetDecileMoments.add(targetDecile[fold][i]);
            }
            LOG_DEBUG(<< "variance in test set target percentile = "
                      << maths::common::CBasicStatistics::variance(testTargetDecileMoments));
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::variance(
                                   testTargetDecileMoments) < 0.02);
        }
    }

    // Third scenario: explicit train fractions smaller than implied by k-fold
    // cross-validation. Checks invariant 6).
    for (auto fraction : {0.1, 0.4}) {
        TDoubleVec categories;
        testRng.generateNormalSamples(0.0, 3.0, numberRows, categories);

        auto frame = core::makeMainStorageDataFrame(numberCols).first;
        frame->categoricalColumns(TBoolVec{true});
        for (std::size_t i = 0; i < numberRows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                *column = std::floor(std::fabs(categories[i]));
            });
        }
        frame->finishWritingRows();

        core::CPackedBitVector allTrainingRowMask{numberRows, true};

        maths::analytics::CDataFrameUtils::TPackedBitVectorVec trainingRowMasks;
        maths::analytics::CDataFrameUtils::TPackedBitVectorVec testingRowMasks;
        std::tie(trainingRowMasks, testingRowMasks, std::ignore) =
            maths::analytics::CDataFrameUtils::stratifiedCrossValidationRowMasks(
                1, *frame, 0, rng, 3, fraction, numberBins, allTrainingRowMask);

        BOOST_REQUIRE_EQUAL(trainingRowMasks.size(), testingRowMasks.size());
        for (std::size_t i = 0; i < trainingRowMasks.size(); ++i) {
            // Every row is in exactly one of train or test and the train set
            // has exactly the requested fraction of the rows.
            BOOST_REQUIRE_EQUAL(
                numberRows, static_cast<std::size_t>(
                                (trainingRowMasks[i] | testingRowMasks[i]).manhattan()));
            BOOST_REQUIRE_EQUAL(fraction, trainingRowMasks[i].manhattan() /
                                              static_cast<double>(numberRows));
        }
    }
}
BOOST_AUTO_TEST_CASE(testStratifiedCrossValidationRowMasksRareCategories) {

    // Here we test a case that the desired sample size for a specific class
    // is zero. In this case we should reassess the class frequencies for
    // the unsampled set and still get 5 splits with all classes represented
    // in at least one fold.

    std::size_t numberFolds{5};
    std::size_t numberBins{10};
    // 25 rows over 7 classes, several of which are rarer than one per fold.
    TDoubleVec categories{0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3,
                          3, 3, 3, 3, 3, 4, 5, 5, 6, 6, 6, 6};

    auto frame = core::makeMainStorageDataFrame(1).first;
    frame->categoricalColumns(TBoolVec{true});
    for (auto category : categories) {
        frame->writeRow([&](core::CDataFrame::TFloatVecItr column,
                            std::int32_t&) { *column = category; });
    }
    frame->finishWritingRows();

    maths::common::CPRNG::CXorOShiro128Plus rng;
    maths::analytics::CDataFrameUtils::TPackedBitVectorVec testingRowMasks;
    std::tie(std::ignore, testingRowMasks, std::ignore) =
        maths::analytics::CDataFrameUtils::stratifiedCrossValidationRowMasks(
            1, *frame, 0, rng, numberFolds, 1.0 - 1.0 / static_cast<double>(numberFolds),
            numberBins, core::CPackedBitVector{categories.size(), true});

    core::CPackedBitVector allTestingRowMask(categories.size(), false);
    for (const auto& testingRowMask : testingRowMasks) {
        allTestingRowMask ^= testingRowMask;
        // Each of the five folds should test exactly 25 / 5 = 5 rows. Note
        // this was previously written BOOST_TEST_REQUIRE(5.0, ...), which
        // treats the second argument as assertion context rather than a value
        // to compare, so the check was vacuous.
        BOOST_REQUIRE_EQUAL(5.0, testingRowMask.manhattan());
    }
    // The test masks are disjoint so their symmetric difference accumulates
    // every tested row: all 25 rows should be covered exactly once.
    BOOST_REQUIRE_EQUAL(25.0, allTestingRowMask.manhattan());
}
BOOST_AUTO_TEST_CASE(testStratifiedSamplingRowMasks) {

    // Check that stratifiedSamplingRowMask draws approximately the requested
    // number of rows and preserves the target distribution: per-category
    // proportions for categorical targets and quantiles for numeric targets.

    test::CRandomNumbers testRng;
    maths::common::CPRNG::CXorOShiro128Plus rng;

    std::size_t numberRows{2000};
    std::size_t numberCols{1};
    std::size_t numberBins{10};
    TSizeVec desiredNumberSamples(1);

    // Test categorical targets.
    for (std::size_t trial = 0; trial < 10; ++trial) {
        TDoubleVec categories;
        testRng.generateNormalSamples(0.0, 3.0, numberRows, categories);
        testRng.generateUniformSamples(200, 500, 1, desiredNumberSamples);
        double desiredSamplesFraction{static_cast<double>(desiredNumberSamples[0]) / numberRows};

        auto frame = core::makeMainStorageDataFrame(numberCols).first;
        frame->categoricalColumns(TBoolVec{true});
        for (std::size_t i = 0; i < numberRows; ++i) {
            frame->writeRow([&](auto column, std::int32_t&) {
                *column = std::floor(std::fabs(categories[i]));
            });
        }
        frame->finishWritingRows();

        core::CPackedBitVector allTrainingRowMask{numberRows, true};

        // Tally the full-data per-category counts for comparison.
        TDoubleDoubleUMap categoryCounts;
        for (auto i = allTrainingRowMask.beginOneBits();
             i != allTrainingRowMask.endOneBits(); ++i) {
            categoryCounts[std::floor(std::fabs(categories[*i]))] += 1.0;
        }

        core::CPackedBitVector samplingRowMask;
        samplingRowMask = maths::analytics::CDataFrameUtils::stratifiedSamplingRowMask(
            1, *frame, 0, rng, desiredNumberSamples[0], numberBins, allTrainingRowMask);

        // Count should be very nearly the expected value.
        BOOST_REQUIRE_CLOSE(static_cast<double>(desiredNumberSamples[0]),
                            samplingRowMask.manhattan(), 5.0);

        // Each category should be sampled in proportion to its frequency.
        core::CPackedBitVector allTestingRowMask{numberRows, false};
        TDoubleDoubleUMap testingCategoryCounts;
        frame->readRows(1, 0, frame->numberRows(),
                        [&](const auto& beginRows, const auto& endRows) {
                            for (auto row = beginRows; row != endRows; ++row) {
                                testingCategoryCounts[(*row)[0]] += 1.0;
                            }
                        },
                        &samplingRowMask);
        for (const auto& count : categoryCounts) {
            BOOST_REQUIRE_CLOSE_ABSOLUTE(count.second * desiredSamplesFraction,
                                         testingCategoryCounts[count.first], 2.0);
        }
    }

    // Test numerical targets.
    for (std::size_t trial = 0; trial < 10; ++trial) {
        TDoubleVec value;
        testRng.generateNormalSamples(0.0, 3.0, numberRows, value);
        testRng.generateUniformSamples(500, 750, 1, desiredNumberSamples);

        auto frame = core::makeMainStorageDataFrame(numberCols).first;
        frame->categoricalColumns(TBoolVec{false});
        for (std::size_t i = 0; i < numberRows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column,
                                std::int32_t&) { *column = value[i]; });
        }
        frame->finishWritingRows();

        core::CPackedBitVector allTrainingRowMask{numberRows, true};

        core::CPackedBitVector samplingRowMask;
        samplingRowMask = maths::analytics::CDataFrameUtils::stratifiedSamplingRowMask(
            1, *frame, 0, rng, desiredNumberSamples[0], numberBins, allTrainingRowMask);

        // Count should be very nearly the expected value.
        BOOST_REQUIRE_CLOSE(static_cast<double>(desiredNumberSamples[0]),
                            samplingRowMask.manhattan(), 1.0);

        // Ensure that the target's distribution is similar.
        maths::common::CQuantileSketch expectedQuantiles{numberRows};
        maths::common::CQuantileSketch actualQuantiles{numberRows};
        for (std::size_t i = 0; i < numberRows; ++i) {
            expectedQuantiles.add(value[i]);
            if (samplingRowMask[i]) {
                actualQuantiles.add(value[i]);
            }
        }

        // Compare the sampled and full quantiles at each decile boundary.
        double percentageStep{1.0 / numberBins * 100.0};
        double expected;
        double actual;
        for (double percentage = percentageStep; percentage < 100.0;
             percentage += percentageStep) {
            expectedQuantiles.quantile(percentage, expected);
            actualQuantiles.quantile(percentage, actual);
            BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.05);
        }
    }
}
BOOST_AUTO_TEST_CASE(testDistributionPreservingSamplingRowMasks) {

    // Check that distributionPreservingSamplingRowMask samples from the full
    // row set a subset whose target distribution matches that of the supplied
    // distribution source rows.

    test::CRandomNumbers testRng;
    maths::common::CPRNG::CXorOShiro128Plus rng;
    TSizeVec numberDistributionSourceRows(1);
    std::size_t numberAdditionalRows{1000};
    std::size_t numberCols{1};

    // Test that for categorical data the count of individual classes remains the same.
    for (std::size_t trial = 0; trial < 10; ++trial) {
        testRng.generateUniformSamples(500, 750, 1, numberDistributionSourceRows);
        TDoubleVec categories;
        testRng.generateNormalSamples(0.0, 3.0, numberDistributionSourceRows[0], categories);

        // The frame holds the source rows followed by additional rows which
        // are all category 0.
        auto frame = core::makeMainStorageDataFrame(numberCols).first;
        frame->categoricalColumns(TBoolVec{true});
        for (std::size_t i = 0; i < numberDistributionSourceRows[0]; ++i) {
            frame->writeRow([&](auto column, std::int32_t&) {
                *column = std::floor(std::fabs(categories[i]));
            });
        }
        for (std::size_t i = 0; i < numberAdditionalRows; ++i) {
            frame->writeRow([&](auto column, std::int32_t&) { *column = 0; });
        }
        frame->finishWritingRows();

        core::CPackedBitVector distributionSourceRowMask{
            numberDistributionSourceRows[0], true};
        distributionSourceRowMask.extend(false, numberAdditionalRows);
        core::CPackedBitVector allRowsMask{distributionSourceRowMask.size(), true};

        // Tally the per-category counts over the distribution source rows.
        TDoubleDoubleUMap expectedCategoryCounts;
        for (auto i = distributionSourceRowMask.beginOneBits();
             i != distributionSourceRowMask.endOneBits(); ++i) {
            expectedCategoryCounts[std::floor(std::fabs(categories[*i]))] += 1.0;
        }

        auto sampledRowMask = maths::analytics::CDataFrameUtils::distributionPreservingSamplingRowMask(
            1, *frame, 0, rng, numberDistributionSourceRows[0], 10,
            distributionSourceRowMask, allRowsMask);

        BOOST_REQUIRE_EQUAL(sampledRowMask.size(), allRowsMask.size());
        BOOST_REQUIRE_CLOSE(sampledRowMask.manhattan(),
                            numberDistributionSourceRows[0], 1);

        auto actualCategoryCounts = maths::analytics::CDataFrameUtils::categoryCounts(
            1, *frame, sampledRowMask, {0})[0];
        LOG_TRACE(<< "Expected category count " << expectedCategoryCounts);
        LOG_TRACE(<< "Actual category count " << actualCategoryCounts);
        BOOST_REQUIRE_EQUAL(actualCategoryCounts.size(), expectedCategoryCounts.size());
        // NOTE(review): expectedCategoryCounts is keyed by category value and
        // is indexed here with the loop counter converted to double — this
        // presumably relies on the categories being the contiguous values
        // 0, 1, 2, ...; verify that assumption holds for this data.
        for (std::size_t i = 0; i < expectedCategoryCounts.size(); ++i) {
            BOOST_REQUIRE_EQUAL(actualCategoryCounts[i], expectedCategoryCounts[i]);
        }
    }

    // Test for regression data that quantiles fit to the new data. This part of the test
    // resembles the test for testStratifiedSamplingRowMasks. This is on purpose, since the
    // behaviour should be similar.
    for (std::size_t trial = 0; trial < 10; ++trial) {
        testRng.generateUniformSamples(500, 750, 1, numberDistributionSourceRows);
        TDoubleVec value;
        auto numberRows = numberDistributionSourceRows[0] + numberAdditionalRows;
        std::size_t numberBins{10};
        testRng.generateNormalSamples(0.0, 3.0, numberRows, value);

        auto frame = core::makeMainStorageDataFrame(numberCols).first;
        frame->categoricalColumns(TBoolVec{false});
        for (std::size_t i = 0; i < numberRows; ++i) {
            frame->writeRow(
                [&](auto column, std::int32_t&) { *column = value[i]; });
        }
        frame->finishWritingRows();

        core::CPackedBitVector distributionSourceRowMask{
            numberDistributionSourceRows[0], true};
        distributionSourceRowMask.extend(false, numberAdditionalRows);
        core::CPackedBitVector allRowsMask{distributionSourceRowMask.size(), true};

        auto sampledRowMask = maths::analytics::CDataFrameUtils::distributionPreservingSamplingRowMask(
            1, *frame, 0, rng, numberDistributionSourceRows[0], numberBins,
            distributionSourceRowMask, allRowsMask);

        // Count should be very nearly the expected value.
        BOOST_REQUIRE_EQUAL(sampledRowMask.size(), allRowsMask.size());
        BOOST_REQUIRE_CLOSE(static_cast<double>(numberDistributionSourceRows[0]),
                            sampledRowMask.manhattan(), 1.0);

        // Ensure that the target's distribution is similar.
        maths::common::CQuantileSketch expectedQuantiles{numberRows};
        maths::common::CQuantileSketch actualQuantiles{numberRows};
        for (std::size_t i = 0; i < numberRows; ++i) {
            if (distributionSourceRowMask[i]) {
                expectedQuantiles.add(value[i]);
            }
            if (sampledRowMask[i]) {
                actualQuantiles.add(value[i]);
            }
        }

        // Compare the sampled and source quantiles at each decile boundary.
        double percentageStep{1.0 / numberBins * 100.0};
        double expected;
        double actual;
        for (double percentage = percentageStep; percentage < 100.0;
             percentage += percentageStep) {
            expectedQuantiles.quantile(percentage, expected);
            actualQuantiles.quantile(percentage, actual);
            BOOST_REQUIRE_CLOSE_ABSOLUTE(expected, actual, 0.05);
        }
    }
}
BOOST_AUTO_TEST_CASE(testMicWithColumn) {

    // Test we get the exact MICe value when the number of rows is less than
    // the target sample size.

    test::CRandomNumbers rng;

    std::size_t capacity{500};
    std::size_t numberRows{2000};
    std::size_t numberCols{4};

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(),
                                              numberCols, numberRows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{[=] {
        return core::makeMainStorageDataFrame(numberCols, capacity).first;
    }};

    for (const auto& makeFrame : {makeOnDisk, makeMainMemory}) {
        auto frame = makeFrame();

        // Generate rows whose last column is a linear function of the first
        // three and remember them so we can compute the reference MICe.
        TDoubleVecVec samples;
        for (std::size_t i = 0; i < numberRows; ++i) {
            TDoubleVec sample;
            rng.generateUniformSamples(-5.0, 5.0, 4, sample);
            sample[3] = 2.0 * sample[0] - 1.5 * sample[1] + 4.0 * sample[2];
            samples.push_back(sample);
            frame->writeRow([&sample, numberCols](core::CDataFrame::TFloatVecItr column,
                                                  std::int32_t&) {
                for (std::size_t j = 0; j < numberCols; ++j, ++column) {
                    *column = sample[j];
                }
            });
        }
        frame->finishWritingRows();

        // Compute the reference MICe of each regressor with the target column.
        TDoubleVec expected(4, 0.0);
        for (std::size_t j : {0, 1, 2}) {
            maths::analytics::CMic mic;
            for (const auto& sample : samples) {
                mic.add(sample[j], sample[3]);
            }
            expected[j] = mic.compute();
        }

        TDoubleVec actual(maths::analytics::CDataFrameUtils::metricMicWithColumn(
            maths::analytics::CDataFrameUtils::CMetricColumnValue{3}, *frame,
            maskAll(numberRows), {0, 1, 2}));
        LOG_DEBUG(<< "expected = " << expected);
        LOG_DEBUG(<< "actual = " << actual);
        BOOST_REQUIRE_EQUAL(core::CContainerPrinter::print(expected),
                            core::CContainerPrinter::print(actual));
    }
}
BOOST_AUTO_TEST_CASE(testMicWithColumnWithMissing) {

    // Test we get the exact MICe value with missing values when the number
    // of rows is less than the target sample size.

    test::CRandomNumbers rng;

    std::size_t capacity{500};
    std::size_t numberRows{2000};
    std::size_t numberCols{4};

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(),
                                              numberCols, numberRows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{[=] {
        return core::makeMainStorageDataFrame(numberCols, capacity).first;
    }};

    for (const auto& factory : {makeOnDisk, makeMainMemory}) {
        auto frame = factory();

        TDoubleVecVec rows;
        TSizeVec missing(4, 0);
        for (std::size_t i = 0; i < numberRows; ++i) {
            TDoubleVec row;
            rng.generateUniformSamples(-5.0, 5.0, 4, row);
            row[3] = 2.0 * row[0] - 1.5 * row[1] + 4.0 * row[2];
            // Independently replace each value with "missing" with
            // probability 1%, counting the replacements per column.
            for (std::size_t j = 0; j < row.size(); ++j) {
                TDoubleVec u01;
                rng.generateUniformSamples(0.0, 1.0, 1, u01);
                if (u01[0] < 0.01) {
                    row[j] = core::CDataFrame::valueOfMissing();
                    ++missing[j];
                }
            }
            rows.push_back(row);
            frame->writeRow([&row, numberCols](core::CDataFrame::TFloatVecItr column,
                                               std::int32_t&) {
                for (std::size_t j = 0; j < numberCols; ++j, ++column) {
                    *column = row[j];
                }
            });
        }
        frame->finishWritingRows();

        // Compute the reference values: the MICe over the rows where both the
        // regressor and the target are present, scaled down by the fraction
        // of rows in which the regressor is missing.
        TDoubleVec expected(4, 0.0);
        for (std::size_t j : {0, 1, 2}) {
            maths::analytics::CMic mic;
            for (const auto& row : rows) {
                if (maths::analytics::CDataFrameUtils::isMissing(row[j]) == false &&
                    maths::analytics::CDataFrameUtils::isMissing(row[3]) == false) {
                    mic.add(row[j], row[3]);
                }
            }
            expected[j] = (1.0 - static_cast<double>(missing[j]) /
                                     static_cast<double>(rows.size())) *
                          mic.compute();
        }

        TDoubleVec actual(maths::analytics::CDataFrameUtils::metricMicWithColumn(
            maths::analytics::CDataFrameUtils::CMetricColumnValue{3}, *frame,
            maskAll(numberRows), {0, 1, 2}));
        LOG_DEBUG(<< "expected = " << expected);
        LOG_DEBUG(<< "actual = " << actual);
        BOOST_REQUIRE_EQUAL(core::CContainerPrinter::print(expected),
                            core::CContainerPrinter::print(actual));
    }
}
BOOST_AUTO_TEST_CASE(testCategoryFrequencies) {
    // Test we get the correct frequencies for each category.
    //
    // Categorical data are generated for all four columns but only columns
    // 0 and 2 are declared categorical in the frame: frequencies should be
    // computed for those and left empty for the metric columns 1 and 3. The
    // test is repeated single and multi-threaded and for both the on-disk
    // and in-main-memory frame implementations.

    std::size_t rows{5000};
    std::size_t cols{4};
    std::size_t capacity{500};

    test::CRandomNumbers rng;

    TDoubleVecVec expectedFrequencies;
    TDoubleVecVec values;
    std::tie(expectedFrequencies, values) = generateCategoricalData(
        rng, rows, cols, {10.0, 30.0, 1.0, 5.0, 15.0, 9.0, 20.0, 10.0});

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};

    core::stopDefaultAsyncExecutor();

    for (auto threads : {1, 4}) {
        for (const auto& factory : {makeOnDisk, makeMainMemory}) {
            auto frame = factory();
            frame->categoricalColumns(TBoolVec{true, false, true, false});
            for (std::size_t i = 0; i < rows; ++i) {
                frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column,
                                                   std::int32_t&) {
                    for (std::size_t j = 0; j < cols; ++j, ++column) {
                        *column = values[j][i];
                    }
                });
            }
            frame->finishWritingRows();

            TDoubleVecVec actualFrequencies{maths::analytics::CDataFrameUtils::categoryFrequencies(
                threads, *frame, maskAll(rows), {0, 1, 2, 3})};

            BOOST_REQUIRE_EQUAL(std::size_t{4}, actualFrequencies.size());
            for (std::size_t i : {0, 2}) {
                // Compare this column's per-category counts, not the outer
                // vector sizes (those are already checked above and comparing
                // them here would let a short actual vector pass unnoticed).
                BOOST_REQUIRE_EQUAL(actualFrequencies[i].size(),
                                    expectedFrequencies[i].size());
                for (std::size_t j = 0; j < actualFrequencies[i].size(); ++j) {
                    BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedFrequencies[i][j],
                                                 actualFrequencies[i][j],
                                                 1.0 / static_cast<double>(rows));
                }
            }
            for (std::size_t i : {1, 3}) {
                BOOST_TEST_REQUIRE(actualFrequencies[i].empty());
            }
        }
        core::startDefaultAsyncExecutor();
    }
    core::stopDefaultAsyncExecutor();
}
BOOST_AUTO_TEST_CASE(testCategoryFrequenciesWithMissing) {
    // Test we get the correct frequencies for each category with missing values.
    //
    // Each value is independently replaced by the missing marker with
    // probability 0.01, so the observed frequencies may deviate from the
    // expected ones by a few multiples of the standard deviation of the
    // (binomial) count of missing values.

    std::size_t rows{5000};
    std::size_t cols{4};
    std::size_t capacity{500};
    double probabilityMissing{0.01};
    // Standard deviation of the number of missing values per column.
    double missingStandardDeviation{
        std::sqrt(probabilityMissing * static_cast<double>(rows))};

    test::CRandomNumbers rng;

    TDoubleVecVec expectedFrequencies;
    TDoubleVecVec values;
    std::tie(expectedFrequencies, values) = generateCategoricalData(
        rng, rows, cols, {10.0, 30.0, 1.0, 5.0, 15.0, 9.0, 20.0, 10.0});

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};

    for (const auto& factory : {makeOnDisk, makeMainMemory}) {
        auto frame = factory();
        frame->categoricalColumns(TBoolVec{true, false, true, false});
        TDoubleVec u01;
        for (std::size_t i = 0; i < rows; ++i) {
            // All captures are by reference so no "mutable" is needed here.
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                for (std::size_t j = 0; j < cols; ++j, ++column) {
                    rng.generateUniformSamples(0.0, 1.0, 1, u01);
                    if (u01[0] < probabilityMissing) {
                        *column = core::CDataFrame::valueOfMissing();
                    } else {
                        *column = values[j][i];
                    }
                }
            });
        }
        frame->finishWritingRows();

        TDoubleVecVec actualFrequencies{maths::analytics::CDataFrameUtils::categoryFrequencies(
            1, *frame, maskAll(rows), {0, 1, 2, 3})};

        BOOST_REQUIRE_EQUAL(std::size_t{4}, actualFrequencies.size());
        for (std::size_t i : {0, 2}) {
            // Compare this column's per-category counts, not the outer vector
            // sizes (those are already checked above).
            BOOST_REQUIRE_EQUAL(actualFrequencies[i].size(),
                                expectedFrequencies[i].size());
            for (std::size_t j = 0; j < actualFrequencies[i].size(); ++j) {
                BOOST_REQUIRE_CLOSE_ABSOLUTE(
                    expectedFrequencies[i][j], actualFrequencies[i][j],
                    3.0 * missingStandardDeviation / static_cast<double>(rows));
            }
        }
        for (std::size_t i : {1, 3}) {
            BOOST_TEST_REQUIRE(actualFrequencies[i].empty());
        }
    }
}
BOOST_AUTO_TEST_CASE(testMeanValueOfTargetForCategories) {
    // Test we get the correct mean values for each category.
    //
    // The target (last column) is the sum of the three generated columns so
    // the expected mean target per category can be accumulated directly.
    // Only columns 0 and 2 are declared categorical in the frame: means are
    // computed for those and left empty for the metric columns 1 and 3.

    std::size_t rows{2000};
    std::size_t cols{4};
    std::size_t capacity{500};

    test::CRandomNumbers rng;

    TDoubleVecVec frequencies;
    TDoubleVecVec values;
    std::tie(frequencies, values) = generateCategoricalData(
        rng, rows, cols - 1, {10.0, 30.0, 1.0, 5.0, 15.0, 9.0, 20.0, 10.0});
    values.resize(cols);
    values[cols - 1].resize(rows, 0.0);

    // Accumulate the expected mean target for each of the 8 categories of
    // every column.
    TMeanAccumulatorVecVec expectedMeans(cols, TMeanAccumulatorVec(8));
    for (std::size_t i = 0; i < rows; ++i) {
        for (std::size_t j = 0; j + 1 < cols; ++j) {
            values[cols - 1][i] += values[j][i];
        }
        for (std::size_t j = 0; j + 1 < cols; ++j) {
            expectedMeans[j][static_cast<std::size_t>(values[j][i])].add(
                values[cols - 1][i]);
        }
    }

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};

    core::stopDefaultAsyncExecutor();

    for (auto threads : {1, 4}) {
        for (const auto& factory : {makeOnDisk, makeMainMemory}) {
            auto frame = factory();
            frame->categoricalColumns(TBoolVec{true, false, true, false});
            for (std::size_t i = 0; i < rows; ++i) {
                frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column,
                                                   std::int32_t&) {
                    for (std::size_t j = 0; j < cols; ++j, ++column) {
                        *column = values[j][i];
                    }
                });
            }
            frame->finishWritingRows();

            TDoubleVecVec actualMeans(maths::analytics::CDataFrameUtils::meanValueOfTargetForCategories(
                maths::analytics::CDataFrameUtils::CMetricColumnValue{3},
                threads, *frame, maskAll(rows), {0, 1, 2}));

            BOOST_REQUIRE_EQUAL(std::size_t{4}, actualMeans.size());
            for (std::size_t i : {0, 2}) {
                // Compare this column's per-category counts, not the outer
                // vector sizes (those are already checked above).
                BOOST_REQUIRE_EQUAL(actualMeans[i].size(), expectedMeans[i].size());
                for (std::size_t j = 0; j < actualMeans[i].size(); ++j) {
                    // Allow only float rounding error: both sides average the
                    // same (non-negative) values.
                    BOOST_REQUIRE_CLOSE_ABSOLUTE(
                        maths::common::CBasicStatistics::mean(expectedMeans[i][j]),
                        actualMeans[i][j],
                        static_cast<double>(std::numeric_limits<float>::epsilon()) *
                            maths::common::CBasicStatistics::mean(expectedMeans[i][j]));
                }
            }
            for (std::size_t i : {1, 3}) {
                BOOST_TEST_REQUIRE(actualMeans[i].empty());
            }
        }
        core::startDefaultAsyncExecutor();
    }
    core::stopDefaultAsyncExecutor();
}
BOOST_AUTO_TEST_CASE(testMeanValueOfTargetForCategoriesWithMissing) {
    // Test that rows missing the target variable are ignored.
    //
    // Roughly 1% of the categorical values and 10% of the target values are
    // replaced by the missing marker. The expected per-category means are
    // accumulated only from rows where both the category and the target are
    // present, which is what we require of meanValueOfTargetForCategories.

    std::size_t rows{2000};
    std::size_t cols{4};
    std::size_t capacity{500};

    test::CRandomNumbers rng;

    TDoubleVecVec frequencies;
    TDoubleVecVec values;
    std::tie(frequencies, values) = generateCategoricalData(
        rng, rows, cols - 1, {10.0, 30.0, 1.0, 5.0, 15.0, 9.0, 20.0, 10.0});
    values.resize(cols);
    values[cols - 1].resize(rows, 0.0);

    TMeanAccumulatorVecVec expectedMeans(cols, TMeanAccumulatorVec(8));
    TDoubleVec u01;
    for (std::size_t i = 0; i < rows; ++i) {
        // Knock out each categorical value with probability 0.01.
        for (std::size_t j = 0; j + 1 < cols; ++j) {
            rng.generateUniformSamples(0.0, 1.0, 1, u01);
            if (u01[0] < 0.01) {
                values[j][i] = core::CDataFrame::valueOfMissing();
            }
        }
        // Knock out the target value with probability 0.1; rows without a
        // target contribute to no category mean.
        rng.generateUniformSamples(0.0, 1.0, 1, u01);
        if (u01[0] < 0.9) {
            for (std::size_t j = 0; j + 1 < cols; ++j) {
                if (maths::analytics::CDataFrameUtils::isMissing(values[j][i]) == false) {
                    values[cols - 1][i] += values[j][i];
                }
            }
            for (std::size_t j = 0; j + 1 < cols; ++j) {
                if (maths::analytics::CDataFrameUtils::isMissing(values[j][i]) == false) {
                    expectedMeans[j][static_cast<std::size_t>(values[j][i])].add(
                        values[cols - 1][i]);
                }
            }
        } else {
            values[cols - 1][i] = core::CDataFrame::valueOfMissing();
        }
    }

    auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
    frame->categoricalColumns(TBoolVec{true, false, true, false});
    for (std::size_t i = 0; i < rows; ++i) {
        frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
            for (std::size_t j = 0; j < cols; ++j, ++column) {
                *column = values[j][i];
            }
        });
    }
    frame->finishWritingRows();

    TDoubleVecVec actualMeans(maths::analytics::CDataFrameUtils::meanValueOfTargetForCategories(
        maths::analytics::CDataFrameUtils::CMetricColumnValue{3}, 1, *frame,
        core::CPackedBitVector{rows, true}, {0, 1, 2}));

    BOOST_REQUIRE_EQUAL(std::size_t{4}, actualMeans.size());
    for (std::size_t i : {0, 2}) {
        // Compare this column's per-category counts, not the outer vector
        // sizes (those are already checked above). Exact agreement of the
        // means is expected here since the computation is single threaded
        // and accumulates the same values.
        BOOST_REQUIRE_EQUAL(actualMeans[i].size(), expectedMeans[i].size());
        for (std::size_t j = 0; j < actualMeans[i].size(); ++j) {
            BOOST_REQUIRE_EQUAL(maths::common::CBasicStatistics::mean(expectedMeans[i][j]),
                                actualMeans[i][j]);
        }
    }
}
BOOST_AUTO_TEST_CASE(testCategoryMicWithColumn) {
    // Test one correlated and one uncorrelated categorical field MICe.
    //
    // The target (column 3) is twice the category of column 2 plus unit
    // normal noise, so one-hot indicators of column 2's categories should
    // carry significant MICe whereas column 0's should not.

    std::size_t rows{5000};
    std::size_t cols{4};
    std::size_t capacity{2000};

    test::CRandomNumbers rng;

    TDoubleVecVec frequencies;
    TDoubleVecVec values;
    std::tie(frequencies, values) =
        generateCategoricalData(rng, rows, cols - 1, {20.0, 60.0, 5.0, 15.0, 1.0});
    values.resize(cols);
    rng.generateNormalSamples(0.0, 1.0, rows, values[cols - 1]);
    for (std::size_t i = 0; i < rows; ++i) {
        values[cols - 1][i] += 2.0 * values[2][i];
    }

    // Exercise both the on-disk and in-main-memory frame implementations.
    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};

    core::stopDefaultAsyncExecutor();

    for (auto threads : {1, 4}) {
        for (const auto& factory : {makeOnDisk, makeMainMemory}) {
            auto frame = factory();
            frame->categoricalColumns(TBoolVec{true, false, true, false});
            for (std::size_t i = 0; i < rows; ++i) {
                frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column,
                                                   std::int32_t&) {
                    for (std::size_t j = 0; j < cols; ++j, ++column) {
                        *column = values[j][i];
                    }
                });
            }
            frame->finishWritingRows();

            // Compute MICe for the one-hot encoding of each category whose
            // frequency is at least 1%.
            auto mics = maths::analytics::CDataFrameUtils::categoricalMicWithColumn(
                maths::analytics::CDataFrameUtils::CMetricColumnValue{3},
                threads, *frame, maskAll(rows), {0, 1, 2},
                {{[](std::size_t, std::size_t sampleColumn, std::size_t category) {
                      return std::make_unique<maths::analytics::CDataFrameUtils::COneHotCategoricalColumnValue>(
                          sampleColumn, category);
                  },
                  0.01}})[0];

            LOG_DEBUG(<< "mics[0] = " << mics[0]);
            LOG_DEBUG(<< "mics[2] = " << mics[2]);

            BOOST_REQUIRE_EQUAL(std::size_t{4}, mics.size());
            // Results per column are sorted by decreasing MICe, ties broken
            // by increasing category.
            for (const auto& mic : mics) {
                BOOST_TEST_REQUIRE(std::is_sorted(
                    mic.begin(), mic.end(), [](const auto& lhs, const auto& rhs) {
                        return maths::common::COrderings::lexicographicalCompare(
                            -lhs.second, lhs.first, -rhs.second, rhs.first);
                    }));
            }
            for (std::size_t i : {0, 2}) {
                BOOST_REQUIRE_EQUAL(std::size_t{5}, mics[i].size());
            }
            // Metric columns 1 and 3 get no category MICe.
            for (std::size_t i : {1, 3}) {
                BOOST_TEST_REQUIRE(mics[i].empty());
            }
            // Column 0 is independent of the target, column 2 drives it.
            BOOST_TEST_REQUIRE(mics[0][0].second < 0.05);
            BOOST_TEST_REQUIRE(mics[2][0].second > 0.50);
            // The expected order is a function of both the category frequency
            // and its order since the target value is order + noise so the
            // larger the order the smaller the noise, relatively.
            TSizeVec categoryOrder;
            for (const auto& category : mics[2]) {
                categoryOrder.push_back(category.first);
            }
            BOOST_REQUIRE_EQUAL(std::string{"[1, 3, 0, 4, 2]"},
                                core::CContainerPrinter::print(categoryOrder));
        }
        core::startDefaultAsyncExecutor();
    }
    core::stopDefaultAsyncExecutor();
}
BOOST_AUTO_TEST_CASE(testCategoryMicWithColumnWithMissing) {
    // As testCategoryMicWithColumn but with roughly 1% of the categorical
    // values replaced by the missing marker: the relative ordering of the
    // category MICe values and the correlated/uncorrelated split should be
    // unaffected, with only slightly relaxed thresholds.

    std::size_t rows{5000};
    std::size_t cols{4};
    std::size_t capacity{2000};

    test::CRandomNumbers rng;

    TDoubleVecVec frequencies;
    TDoubleVecVec values;
    std::tie(frequencies, values) =
        generateCategoricalData(rng, rows, cols - 1, {20.0, 60.0, 5.0, 15.0, 1.0});
    values.resize(cols);
    rng.generateNormalSamples(0.0, 1.0, rows, values[cols - 1]);
    TDoubleVec u01;
    for (std::size_t i = 0; i < rows; ++i) {
        // The target is twice the category of column 2 plus unit normal noise.
        values[cols - 1][i] += 2.0 * values[2][i];
        // Knock out each categorical value with probability 0.01.
        for (std::size_t j = 0; j < cols - 1; ++j) {
            rng.generateUniformSamples(0.0, 1.0, 1, u01);
            if (u01[0] < 0.01) {
                values[j][i] = core::CDataFrame::valueOfMissing();
            }
        }
    }

    // Exercise both the on-disk and in-main-memory frame implementations.
    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};

    for (const auto& factory : {makeOnDisk, makeMainMemory}) {
        auto frame = factory();
        frame->categoricalColumns(TBoolVec{true, false, true, false});
        for (std::size_t i = 0; i < rows; ++i) {
            frame->writeRow([&values, i, cols](core::CDataFrame::TFloatVecItr column,
                                               std::int32_t&) {
                for (std::size_t j = 0; j < cols; ++j, ++column) {
                    *column = values[j][i];
                }
            });
        }
        frame->finishWritingRows();

        // Compute MICe for the one-hot encoding of each category whose
        // frequency is at least 1%.
        auto mics = maths::analytics::CDataFrameUtils::categoricalMicWithColumn(
            maths::analytics::CDataFrameUtils::CMetricColumnValue{3}, 1, *frame,
            maskAll(rows), {0, 1, 2},
            {{[](std::size_t, std::size_t sampleColumn, std::size_t category) {
                  return std::make_unique<maths::analytics::CDataFrameUtils::COneHotCategoricalColumnValue>(
                      sampleColumn, category);
              },
              0.01}})[0];

        LOG_DEBUG(<< "mics[0] = " << mics[0]);
        LOG_DEBUG(<< "mics[2] = " << mics[2]);

        BOOST_REQUIRE_EQUAL(std::size_t{4}, mics.size());
        // Results per column are sorted by decreasing MICe, ties broken by
        // increasing category.
        for (const auto& mic : mics) {
            BOOST_TEST_REQUIRE(std::is_sorted(
                mic.begin(), mic.end(), [](const auto& lhs, const auto& rhs) {
                    return maths::common::COrderings::lexicographicalCompare(
                        -lhs.second, lhs.first, -rhs.second, rhs.first);
                }));
        }
        for (std::size_t i : {0, 2}) {
            BOOST_REQUIRE_EQUAL(std::size_t{5}, mics[i].size());
        }
        // Metric columns 1 and 3 get no category MICe.
        for (std::size_t i : {1, 3}) {
            BOOST_TEST_REQUIRE(mics[i].empty());
        }
        // Column 0 is independent of the target, column 2 drives it.
        BOOST_TEST_REQUIRE(mics[0][0].second < 0.04);
        BOOST_TEST_REQUIRE(mics[2][0].second > 0.49);
        // The expected order is a function of both the category frequency
        // and its order since the target value is order + noise so the
        // larger the order the smaller the noise, relatively.
        TSizeVec categoryOrder;
        for (const auto& category : mics[2]) {
            categoryOrder.push_back(category.first);
        }
        BOOST_REQUIRE_EQUAL(std::string{"[1, 3, 0, 4, 2]"},
                            core::CContainerPrinter::print(categoryOrder));
    }
}
BOOST_AUTO_TEST_CASE(testMaximumMinimumRecallClassWeights) {
    // Test we reliably increase the minimum class recall for predictions with uneven accuracy.
    //
    // For each row the true class's synthetic score is boosted by (class + 1)
    // so higher classes are predicted more accurately. We then check that
    // weighting the softmax predictions by maximumMinimumRecallClassWeights
    // (i) agrees between the single and multi-threaded implementations,
    // (ii) improves the minimum class recall over plain argmax prediction and
    // (iii) roughly equalises the minimum and maximum class recalls, i.e. we
    // are near the maximin solution.

    using TDoubleVector = maths::common::CDenseVector<double>;
    using TMemoryMappedFloatVector =
        maths::common::CMemoryMappedDenseVector<maths::common::CFloatStorage>;

    std::size_t rows{5000};
    std::size_t capacity{2000};

    test::CRandomNumbers rng;

    for (std::size_t numberClasses : {2, 3}) {

        std::size_t cols{numberClasses + 1};

        // Reads the first numberClasses columns of a row as the class scores.
        auto readPrediction = [&](const core::CDataFrame::TRowRef& row) {
            return TMemoryMappedFloatVector{row.data(), static_cast<int>(numberClasses)};
        };

        // The last column holds the actual class label.
        TBoolVec categoricalColumns(cols, false);
        categoricalColumns[numberClasses] = true;

        for (std::size_t t = 0; t < 5; ++t) {
            core::stopDefaultAsyncExecutor();

            TDoubleVec predictions;
            TSizeVec category;

            auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
            frame->categoricalColumns(categoricalColumns);
            for (std::size_t i = 0; i < rows; ++i) {
                frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                    rng.generateUniformSamples(0, numberClasses, 1, category);
                    rng.generateNormalSamples(0.0, 1.0, numberClasses, predictions);
                    // NOTE(review): "+=" here assumes writeRow hands out zero
                    // initialised columns — TODO confirm; the intended score
                    // is normal noise plus the true class boost below.
                    for (std::size_t j = 0; j < numberClasses; ++j) {
                        column[j] += predictions[j];
                    }
                    column[category[0]] += static_cast<double>(category[0] + 1);
                    column[numberClasses] = static_cast<double>(category[0]);
                });
            }
            frame->finishWritingRows();

            // Indexed [thread configuration][weighted (0) / unweighted (1)].
            TDoubleVecVec minRecalls(2, TDoubleVec(2));
            TDoubleVecVec maxRecalls(2, TDoubleVec(2));

            std::size_t i{0};
            for (auto numberThreads : {1, 4}) {

                auto weights = maths::analytics::CDataFrameUtils::maximumMinimumRecallClassWeights(
                    numberThreads, *frame, maskAll(rows), numberClasses,
                    numberClasses, readPrediction);

                // Measure the per-class recall of the weighted and unweighted
                // argmax predictions.
                TDoubleVector prediction;
                TDoubleVector correct[2]{TDoubleVector::Zero(numberClasses),
                                         TDoubleVector::Zero(numberClasses)};
                TDoubleVector counts{TDoubleVector::Zero(numberClasses)};
                frame->readRows(1, [&](const core::CDataFrame::TRowItr& beginRows,
                                       const core::CDataFrame::TRowItr& endRows) {
                    for (auto row = beginRows; row != endRows; ++row) {
                        prediction = readPrediction(*row);
                        maths::common::CTools::inplaceSoftmax(prediction);
                        std::size_t weightedPredictedClass;
                        weights.cwiseProduct(prediction).maxCoeff(&weightedPredictedClass);
                        std::size_t actualClass{
                            static_cast<std::size_t>((*row)[numberClasses])};
                        if (weightedPredictedClass == actualClass) {
                            correct[0](actualClass) += 1.0;
                        }
                        std::size_t unweightedPredictedClass;
                        prediction.maxCoeff(&unweightedPredictedClass);
                        if (unweightedPredictedClass == actualClass) {
                            correct[1](actualClass) += 1.0;
                        }
                        counts(actualClass) += 1.0;
                    }
                });

                LOG_TRACE(<< "weighted class recalls = "
                          << correct[0].cwiseQuotient(counts).transpose());
                LOG_TRACE(<< "unweighted class recalls = "
                          << correct[1].cwiseQuotient(counts).transpose());

                minRecalls[i][0] = correct[0].cwiseQuotient(counts).minCoeff();
                maxRecalls[i][0] = correct[0].cwiseQuotient(counts).maxCoeff();
                minRecalls[i][1] = correct[1].cwiseQuotient(counts).minCoeff();
                maxRecalls[i][1] = correct[1].cwiseQuotient(counts).maxCoeff();

                ++i;
                core::startDefaultAsyncExecutor();
            }

            LOG_DEBUG(<< "min recalls = " << minRecalls);
            LOG_DEBUG(<< "max recalls = " << maxRecalls);

            // Threaded and non-threaded results are close.
            BOOST_REQUIRE_CLOSE(minRecalls[0][0], minRecalls[1][0], 1.5); // 1.5 %
            // We improved the minimum class recall by at least 10%.
            BOOST_TEST_REQUIRE(minRecalls[0][0] > 1.1 * minRecalls[0][1]);
            // The minimum and maximum class recalls are close: we're at the global maximum.
            BOOST_TEST_REQUIRE(1.06 * minRecalls[0][0] > maxRecalls[0][0]);
            BOOST_TEST_REQUIRE(1.06 * minRecalls[1][0] > maxRecalls[1][0]);
        }
    }
    core::stopDefaultAsyncExecutor();
}
BOOST_AUTO_TEST_CASE(testMaximumMinimumRecallClassWeightsBadInputs) {
    // Check that we successfully handle NaN and infinite inputs.
    //
    // The setup mirrors testMaximumMinimumRecallClassWeights but every 500th
    // row gets a NaN score and every 600th an infinite score for the true
    // class. The weight computation must not propagate these values and the
    // weighted predictions must still improve the minimum class recall.

    using TDoubleVector = maths::common::CDenseVector<double>;
    using TMemoryMappedFloatVector =
        maths::common::CMemoryMappedDenseVector<maths::common::CFloatStorage>;

    std::size_t rows{5000};
    std::size_t capacity{2000};

    test::CRandomNumbers rng;

    for (std::size_t numberClasses : {2, 3}) {

        std::size_t cols{numberClasses + 1};

        // Reads the first numberClasses columns of a row as the class scores.
        auto readPrediction = [&](const core::CDataFrame::TRowRef& row) {
            return TMemoryMappedFloatVector{row.data(), static_cast<int>(numberClasses)};
        };

        // The last column holds the actual class label.
        TBoolVec categoricalColumns(cols, false);
        categoricalColumns[numberClasses] = true;

        TDoubleVec predictions;
        TSizeVec category;

        auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
        frame->categoricalColumns(categoricalColumns);
        for (std::size_t i = 0; i < rows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                rng.generateUniformSamples(0, numberClasses, 1, category);
                rng.generateNormalSamples(0.0, 1.0, numberClasses, predictions);
                for (std::size_t j = 0; j < numberClasses; ++j) {
                    column[j] += predictions[j];
                }
                column[category[0]] += static_cast<double>(category[0] + 1);
                // Inject the bad values this test is about.
                if (i % 500 == 0) {
                    column[category[0]] = std::numeric_limits<double>::quiet_NaN();
                }
                if (i % 600 == 0) {
                    column[category[0]] = std::numeric_limits<double>::infinity();
                }
                column[numberClasses] = static_cast<double>(category[0]);
            });
        }
        frame->finishWritingRows();

        // Indexed [weighted (0) / unweighted (1)].
        TDoubleVec minRecalls(2);
        TDoubleVec maxRecalls(2);

        auto weights = maths::analytics::CDataFrameUtils::maximumMinimumRecallClassWeights(
            1, *frame, maskAll(rows), numberClasses, numberClasses, readPrediction);

        // Measure the per-class recall of the weighted and unweighted argmax
        // predictions.
        TDoubleVector prediction;
        TDoubleVector correct[2]{TDoubleVector::Zero(numberClasses),
                                 TDoubleVector::Zero(numberClasses)};
        TDoubleVector counts{TDoubleVector::Zero(numberClasses)};
        frame->readRows(1, [&](const core::CDataFrame::TRowItr& beginRows,
                               const core::CDataFrame::TRowItr& endRows) {
            for (auto row = beginRows; row != endRows; ++row) {
                prediction = readPrediction(*row);
                maths::common::CTools::inplaceSoftmax(prediction);
                std::size_t weightedPredictedClass;
                weights.cwiseProduct(prediction).maxCoeff(&weightedPredictedClass);
                std::size_t actualClass{static_cast<std::size_t>((*row)[numberClasses])};
                if (weightedPredictedClass == actualClass) {
                    correct[0](actualClass) += 1.0;
                }
                std::size_t unweightedPredictedClass;
                prediction.maxCoeff(&unweightedPredictedClass);
                if (unweightedPredictedClass == actualClass) {
                    correct[1](actualClass) += 1.0;
                }
                counts(actualClass) += 1.0;
            }
        });

        minRecalls[0] = correct[0].cwiseQuotient(counts).minCoeff();
        maxRecalls[0] = correct[0].cwiseQuotient(counts).maxCoeff();
        minRecalls[1] = correct[1].cwiseQuotient(counts).minCoeff();
        maxRecalls[1] = correct[1].cwiseQuotient(counts).maxCoeff();

        LOG_DEBUG(<< "min recalls = " << minRecalls);
        LOG_DEBUG(<< "max recalls = " << maxRecalls);

        // We improved the minimum class recall by at least 10%.
        BOOST_TEST_REQUIRE(minRecalls[0] > 1.1 * minRecalls[1]);
        // The minimum and maximum class recalls are close: we're at the global maximum.
        BOOST_TEST_REQUIRE(1.06 * minRecalls[0] > maxRecalls[0]);
    }
}
BOOST_AUTO_TEST_SUITE_END()