in lib/maths/common/CKMostCorrelated.cc [381:576]
void CKMostCorrelated::mostCorrelated(TCorrelationVec& result) const {
using TMaxDoubleAccumulator =
CBasicStatistics::COrderStatisticsStack<double, 2, std::greater<double>>;
using TMaxCorrelationAccumulator = CBasicStatistics::COrderStatisticsHeap<SCorrelation>;
using TPointRTree = bgi::rtree<TPointSizePr, bgi::quadratic<16>>;
result.clear();
std::size_t N = m_MostCorrelated.size();
std::size_t V = m_Projected.size();
std::size_t desired = 2 * m_K;
LOG_TRACE(<< "N = " << N << ", V = " << V << ", desired = " << desired);
if (V == 1) {
return;
}
TSizeSizePrUSet lookup;
for (std::size_t i = 0; i < m_MostCorrelated.size(); ++i) {
std::size_t X = m_MostCorrelated[i].s_X;
std::size_t Y = m_MostCorrelated[i].s_Y;
lookup.insert(std::make_pair(std::min(X, Y), std::max(X, Y)));
}
std::size_t replace = std::max(
static_cast<std::size_t>(REPLACE_FRACTION * static_cast<double>(desired) + 0.5),
std::max(desired - N, std::size_t(1)));
LOG_TRACE(<< "replace = " << replace);
TMaxCorrelationAccumulator mostCorrelated(replace);
if (10 * replace > V * (V - 1)) {
LOG_TRACE(<< "Exhaustive search");
for (auto x = m_Projected.begin(); x != m_Projected.end(); ++x) {
std::size_t X = x->first;
auto y = x;
while (++y != m_Projected.end()) {
std::size_t Y = y->first;
if (lookup.count(std::make_pair(std::min(X, Y), std::max(X, Y))) == 0) {
SCorrelation cxy(X, x->second.first, x->second.second, Y,
y->second.first, y->second.second);
mostCorrelated.add(cxy);
}
}
}
} else {
LOG_TRACE(<< "Nearest neighbour search");
// 1) Build an r-tree,
// 2) Lookup up V / replace nearest neighbours of each point
// and its negative to initialise search,
// 3) Create a predicate with separation corresponding to the
// smallest correlation,
// 4) Search for neighbours of each point and its negative for
// points in range updating the predicate in the loop with
// the new least correlated variable.
// Bound the correlation based on the sparsity of the metric.
TMaxDoubleAccumulator fmax;
double dimension = 0.0;
for (auto i = m_Projected.begin(); i != m_Projected.end(); ++i) {
const core::CPackedBitVector& ix = i->second.second;
dimension = static_cast<double>(ix.dimension());
fmax.add(ix.manhattan() / dimension);
}
fmax.sort();
if (fmax[1] <= MINIMUM_FREQUENCY) {
return;
}
double amax = fmax[1] * dimension;
TPointSizePrVec points;
points.reserve(m_Projected.size());
for (auto i = m_Projected.begin(); i != m_Projected.end(); ++i) {
points.emplace_back(i->second.first.to<double>().toArray(), i->first);
}
LOG_TRACE(<< "# points = " << points.size());
unsigned int k = static_cast<unsigned int>(replace / V + 1);
LOG_TRACE(<< "k = " << k);
// The nearest neighbour search is very slow compared with
// the search over the smallest correlation box predicate
// so we use a small number of seed variables if V is large
// compared to the number to replace.
TSizeVec seeds;
if (2 * replace < V) {
CSampling::uniformSample(m_Rng, 0, V, 2 * replace, seeds);
std::sort(seeds.begin(), seeds.end());
seeds.erase(std::unique(seeds.begin(), seeds.end()), seeds.end());
} else {
seeds.reserve(V);
seeds.assign(boost::counting_iterator<std::size_t>(0),
boost::counting_iterator<std::size_t>(V));
}
try {
TPointRTree rtree(points);
TPointSizePrVec nearest;
for (std::size_t i = 0; i < seeds.size(); ++i) {
std::size_t X = points[seeds[i]].second;
const TVectorPackedBitVectorPr& px = m_Projected.at(X);
nearest.clear();
bgi::query(rtree,
bgi::satisfies(CNotEqual(X)) &&
bgi::satisfies(CPairNotIn(lookup, X)) &&
bgi::nearest((px.first.to<double>()).toArray(), k),
std::back_inserter(nearest));
bgi::query(rtree,
bgi::satisfies(CNotEqual(X)) &&
bgi::satisfies(CPairNotIn(lookup, X)) &&
bgi::nearest((-px.first.to<double>()).toArray(), k),
std::back_inserter(nearest));
for (std::size_t j = 0; j < nearest.size(); ++j) {
std::size_t n = mostCorrelated.count();
std::size_t S = n == desired ? mostCorrelated.biggest().s_X : 0;
std::size_t T = n == desired ? mostCorrelated.biggest().s_Y : 0;
std::size_t Y = nearest[j].second;
const TVectorPackedBitVectorPr& py = m_Projected.at(Y);
SCorrelation cxy(X, px.first, px.second, Y, py.first, py.second);
if (lookup.count(std::make_pair(cxy.s_X, cxy.s_Y)) > 0) {
continue;
}
if (mostCorrelated.add(cxy)) {
if (n == desired) {
lookup.erase(std::make_pair(S, T));
}
lookup.insert(std::make_pair(cxy.s_X, cxy.s_Y));
}
}
}
LOG_TRACE(<< "# seeds = " << mostCorrelated.count());
LOG_TRACE(<< "seed most correlated = " << mostCorrelated);
for (std::size_t i = 0; i < points.size(); ++i) {
const SCorrelation& biggest = mostCorrelated.biggest();
double threshold = biggest.distance(amax);
LOG_TRACE(<< "threshold = " << threshold);
std::size_t X = points[i].second;
const TVectorPackedBitVectorPr& px = m_Projected.at(X);
TVector width(std::sqrt(threshold));
nearest.clear();
{
bgm::box<TPoint> box((px.first - width).to<double>().toArray(),
(px.first + width).to<double>().toArray());
bgi::query(rtree,
bgi::within(box) && bgi::satisfies(CNotEqual(X)) &&
bgi::satisfies(CCloserThan(
threshold, px.first.to<double>().toArray())) &&
bgi::satisfies(CPairNotIn(lookup, X)),
std::back_inserter(nearest));
}
{
bgm::box<TPoint> box((-px.first - width).to<double>().toArray(),
(-px.first + width).to<double>().toArray());
bgi::query(rtree,
bgi::within(box) && bgi::satisfies(CNotEqual(X)) &&
bgi::satisfies(CCloserThan(
threshold, (-px.first).to<double>().toArray())) &&
bgi::satisfies(CPairNotIn(lookup, X)),
std::back_inserter(nearest));
}
LOG_TRACE(<< "# candidates = " << nearest.size());
for (std::size_t j = 0; j < nearest.size(); ++j) {
std::size_t n = mostCorrelated.count();
std::size_t S = n == desired ? mostCorrelated.biggest().s_X : 0;
std::size_t T = n == desired ? mostCorrelated.biggest().s_Y : 0;
std::size_t Y = nearest[j].second;
const TVectorPackedBitVectorPr& py = m_Projected.at(Y);
SCorrelation cxy(X, px.first, px.second, Y, py.first, py.second);
if (lookup.count(std::make_pair(cxy.s_X, cxy.s_Y)) > 0) {
continue;
}
if (mostCorrelated.add(cxy)) {
if (n == desired) {
lookup.erase(std::make_pair(S, T));
}
lookup.insert(std::make_pair(cxy.s_X, cxy.s_Y));
}
}
}
} catch (const std::exception& e) {
LOG_ERROR(<< "Failed to compute most correlated " << e.what());
return;
}
}
mostCorrelated.sort();
result.assign(mostCorrelated.begin(), mostCorrelated.end());
LOG_TRACE(<< "most correlated " << result);
}