void CKMostCorrelated::mostCorrelated()

in lib/maths/common/CKMostCorrelated.cc [381:576]


void CKMostCorrelated::mostCorrelated(TCorrelationVec& result) const {
    using TMaxDoubleAccumulator =
        CBasicStatistics::COrderStatisticsStack<double, 2, std::greater<double>>;
    using TMaxCorrelationAccumulator = CBasicStatistics::COrderStatisticsHeap<SCorrelation>;
    using TPointRTree = bgi::rtree<TPointSizePr, bgi::quadratic<16>>;

    result.clear();

    std::size_t N = m_MostCorrelated.size();
    std::size_t V = m_Projected.size();
    std::size_t desired = 2 * m_K;
    LOG_TRACE(<< "N = " << N << ", V = " << V << ", desired = " << desired);
    if (V == 1) {
        return;
    }

    TSizeSizePrUSet lookup;
    for (std::size_t i = 0; i < m_MostCorrelated.size(); ++i) {
        std::size_t X = m_MostCorrelated[i].s_X;
        std::size_t Y = m_MostCorrelated[i].s_Y;
        lookup.insert(std::make_pair(std::min(X, Y), std::max(X, Y)));
    }

    std::size_t replace = std::max(
        static_cast<std::size_t>(REPLACE_FRACTION * static_cast<double>(desired) + 0.5),
        std::max(desired - N, std::size_t(1)));
    LOG_TRACE(<< "replace = " << replace);

    TMaxCorrelationAccumulator mostCorrelated(replace);

    if (10 * replace > V * (V - 1)) {
        LOG_TRACE(<< "Exhaustive search");

        for (auto x = m_Projected.begin(); x != m_Projected.end(); ++x) {
            std::size_t X = x->first;
            auto y = x;
            while (++y != m_Projected.end()) {
                std::size_t Y = y->first;
                if (lookup.count(std::make_pair(std::min(X, Y), std::max(X, Y))) == 0) {
                    SCorrelation cxy(X, x->second.first, x->second.second, Y,
                                     y->second.first, y->second.second);
                    mostCorrelated.add(cxy);
                }
            }
        }
    } else {
        LOG_TRACE(<< "Nearest neighbour search");

        // 1) Build an r-tree,
        // 2) Lookup up V / replace nearest neighbours of each point
        //    and its negative to initialise search,
        // 3) Create a predicate with separation corresponding to the
        //    smallest correlation,
        // 4) Search for neighbours of each point and its negative for
        //    points in range updating the predicate in the loop with
        //    the new least correlated variable.

        // Bound the correlation based on the sparsity of the metric.
        TMaxDoubleAccumulator fmax;
        double dimension = 0.0;
        for (auto i = m_Projected.begin(); i != m_Projected.end(); ++i) {
            const core::CPackedBitVector& ix = i->second.second;
            dimension = static_cast<double>(ix.dimension());
            fmax.add(ix.manhattan() / dimension);
        }
        fmax.sort();
        if (fmax[1] <= MINIMUM_FREQUENCY) {
            return;
        }
        double amax = fmax[1] * dimension;

        TPointSizePrVec points;
        points.reserve(m_Projected.size());
        for (auto i = m_Projected.begin(); i != m_Projected.end(); ++i) {
            points.emplace_back(i->second.first.to<double>().toArray(), i->first);
        }
        LOG_TRACE(<< "# points = " << points.size());

        unsigned int k = static_cast<unsigned int>(replace / V + 1);
        LOG_TRACE(<< "k = " << k);

        // The nearest neighbour search is very slow compared with
        // the search over the smallest correlation box predicate
        // so we use a small number of seed variables if V is large
        // compared to the number to replace.
        TSizeVec seeds;
        if (2 * replace < V) {
            CSampling::uniformSample(m_Rng, 0, V, 2 * replace, seeds);
            std::sort(seeds.begin(), seeds.end());
            seeds.erase(std::unique(seeds.begin(), seeds.end()), seeds.end());
        } else {
            seeds.reserve(V);
            seeds.assign(boost::counting_iterator<std::size_t>(0),
                         boost::counting_iterator<std::size_t>(V));
        }

        try {
            TPointRTree rtree(points);
            TPointSizePrVec nearest;
            for (std::size_t i = 0; i < seeds.size(); ++i) {
                std::size_t X = points[seeds[i]].second;
                const TVectorPackedBitVectorPr& px = m_Projected.at(X);

                nearest.clear();
                bgi::query(rtree,
                           bgi::satisfies(CNotEqual(X)) &&
                               bgi::satisfies(CPairNotIn(lookup, X)) &&
                               bgi::nearest((px.first.to<double>()).toArray(), k),
                           std::back_inserter(nearest));
                bgi::query(rtree,
                           bgi::satisfies(CNotEqual(X)) &&
                               bgi::satisfies(CPairNotIn(lookup, X)) &&
                               bgi::nearest((-px.first.to<double>()).toArray(), k),
                           std::back_inserter(nearest));

                for (std::size_t j = 0; j < nearest.size(); ++j) {
                    std::size_t n = mostCorrelated.count();
                    std::size_t S = n == desired ? mostCorrelated.biggest().s_X : 0;
                    std::size_t T = n == desired ? mostCorrelated.biggest().s_Y : 0;
                    std::size_t Y = nearest[j].second;
                    const TVectorPackedBitVectorPr& py = m_Projected.at(Y);
                    SCorrelation cxy(X, px.first, px.second, Y, py.first, py.second);
                    if (lookup.count(std::make_pair(cxy.s_X, cxy.s_Y)) > 0) {
                        continue;
                    }
                    if (mostCorrelated.add(cxy)) {
                        if (n == desired) {
                            lookup.erase(std::make_pair(S, T));
                        }
                        lookup.insert(std::make_pair(cxy.s_X, cxy.s_Y));
                    }
                }
            }
            LOG_TRACE(<< "# seeds = " << mostCorrelated.count());
            LOG_TRACE(<< "seed most correlated = " << mostCorrelated);

            for (std::size_t i = 0; i < points.size(); ++i) {
                const SCorrelation& biggest = mostCorrelated.biggest();
                double threshold = biggest.distance(amax);
                LOG_TRACE(<< "threshold = " << threshold);

                std::size_t X = points[i].second;
                const TVectorPackedBitVectorPr& px = m_Projected.at(X);

                TVector width(std::sqrt(threshold));
                nearest.clear();
                {
                    bgm::box<TPoint> box((px.first - width).to<double>().toArray(),
                                         (px.first + width).to<double>().toArray());
                    bgi::query(rtree,
                               bgi::within(box) && bgi::satisfies(CNotEqual(X)) &&
                                   bgi::satisfies(CCloserThan(
                                       threshold, px.first.to<double>().toArray())) &&
                                   bgi::satisfies(CPairNotIn(lookup, X)),
                               std::back_inserter(nearest));
                }
                {
                    bgm::box<TPoint> box((-px.first - width).to<double>().toArray(),
                                         (-px.first + width).to<double>().toArray());
                    bgi::query(rtree,
                               bgi::within(box) && bgi::satisfies(CNotEqual(X)) &&
                                   bgi::satisfies(CCloserThan(
                                       threshold, (-px.first).to<double>().toArray())) &&
                                   bgi::satisfies(CPairNotIn(lookup, X)),
                               std::back_inserter(nearest));
                }
                LOG_TRACE(<< "# candidates = " << nearest.size());

                for (std::size_t j = 0; j < nearest.size(); ++j) {
                    std::size_t n = mostCorrelated.count();
                    std::size_t S = n == desired ? mostCorrelated.biggest().s_X : 0;
                    std::size_t T = n == desired ? mostCorrelated.biggest().s_Y : 0;
                    std::size_t Y = nearest[j].second;
                    const TVectorPackedBitVectorPr& py = m_Projected.at(Y);
                    SCorrelation cxy(X, px.first, px.second, Y, py.first, py.second);
                    if (lookup.count(std::make_pair(cxy.s_X, cxy.s_Y)) > 0) {
                        continue;
                    }
                    if (mostCorrelated.add(cxy)) {
                        if (n == desired) {
                            lookup.erase(std::make_pair(S, T));
                        }
                        lookup.insert(std::make_pair(cxy.s_X, cxy.s_Y));
                    }
                }
            }
        } catch (const std::exception& e) {
            LOG_ERROR(<< "Failed to compute most correlated " << e.what());
            return;
        }
    }

    mostCorrelated.sort();
    result.assign(mostCorrelated.begin(), mostCorrelated.end());
    LOG_TRACE(<< "most correlated " << result);
}