lib/maths/analytics/unittest/BoostedTreeTestData.cc (116 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include "BoostedTreeTestData.h"
#include <core/CContainerPrinter.h>
#include <core/CPackedBitVector.h>
#include <maths/common/CTools.h>
#include <test/CRandomNumbers.h>
#include <vector>
using namespace ml;
using TBoolVec = std::vector<bool>;
using TDoubleVec = std::vector<double>;
using TDoubleVecVec = std::vector<TDoubleVec>;
using TRowItr = core::CDataFrame::TRowItr;
using TRowRef = core::CDataFrame::TRowRef;
void addData(test::CRandomNumbers& rng,
const TTargetFunc& target,
const TDoubleVecVec& x,
double noiseVariance,
core::CDataFrame& frame) {
if (x.empty()) {
return;
}
std::size_t rows{x[0].size()};
std::size_t cols{x.size() + 1};
std::size_t offset{frame.numberRows()};
core::CPackedBitVector mask{frame.numberRows(), false};
for (std::size_t i = 0; i < rows; ++i) {
mask.extend(true);
}
TDoubleVec noise{[&] {
TDoubleVec result(rows, 0.0);
if (noiseVariance > 0.0) {
rng.generateNormalSamples(0.0, noiseVariance, rows, result);
}
return result;
}()};
for (std::size_t i = 0; i < rows; ++i) {
frame.writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
for (std::size_t j = 0; j < cols - 1; ++j, ++column) {
*column = x[j][i];
}
});
}
frame.finishWritingRows();
frame.writeColumns(1, 0, frame.numberRows(),
[&](const TRowItr& beginRows, const TRowItr& endRows) {
for (auto row = beginRows; row != endRows; ++row) {
double targetValue{target(*row) + noise[row->index() - offset]};
row->writeColumn(cols - 1, targetValue);
}
},
&mask);
}
std::unique_ptr<core::CDataFrame> setupRegressionProblem(test::CRandomNumbers& rng,
const TTargetFunc& target,
double noiseVariance,
std::size_t rows,
std::size_t cols) {
auto frame = core::makeMainStorageDataFrame(cols, rows).first;
TDoubleVecVec x(cols - 1);
for (std::size_t i = 0; i < cols - 1; ++i) {
rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
}
frame->categoricalColumns(TBoolVec(cols, false));
addData(rng, target, x, noiseVariance, *frame);
return frame;
}
TTargetFunc linearRegression(test::CRandomNumbers& rng, std::size_t cols) {
TDoubleVec m;
TDoubleVec s;
rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
return [=](const TRowRef& row) {
double result{0.0};
for (std::size_t i = 0; i < cols - 1; ++i) {
result += m[i] + s[i] * row[i];
}
return result;
};
}
std::unique_ptr<core::CDataFrame> setupLinearRegressionProblem(std::size_t rows,
std::size_t cols) {
test::CRandomNumbers rng;
double noiseVariance{100.0};
auto target = linearRegression(rng, cols);
return setupRegressionProblem(rng, target, noiseVariance, rows, cols);
}
std::unique_ptr<core::CDataFrame> setupClassificationProblem(test::CRandomNumbers& rng,
const TTargetFunc& target,
std::size_t rows,
std::size_t cols) {
auto frame = core::makeMainStorageDataFrame(cols, rows).first;
TDoubleVecVec x(cols - 1);
for (std::size_t i = 0; i < cols - 1; ++i) {
rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
}
TBoolVec categoricalColumns(cols, false);
categoricalColumns[cols - 1] = true;
frame->categoricalColumns(categoricalColumns);
addData(rng, target, x, 0.0, *frame);
return frame;
}
std::unique_ptr<core::CDataFrame>
setupLinearBinaryClassificationProblem(std::size_t rows, std::size_t cols) {
test::CRandomNumbers rng;
auto target = linearRegression(rng, cols);
return setupClassificationProblem(
rng,
[&](const TRowRef& row) {
TDoubleVec noise;
rng.generateNormalSamples(0.0, 0.25, 1, noise);
return maths::common::CTools::logisticFunction(0.025 * target(row) +
noise[0]) < 0.5
? 0.0
: 1.0;
},
rows, cols);
}