theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp (62 lines of code) (raw):

/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_ #define BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_ #include <cstdint> #include <stdexcept> #include "bounds_on_ratios_in_sampled_sets.hpp" namespace datasketches { /** * Bounds on ratios in Theta sketched sets. * This is to compute the bounds on the estimate of the ratio <i>B / A</i>, where: * <ul> * <li><i>A</i> is a Theta Sketch of population <i>PopA</i>.</li> * <li><i>B</i> is a Theta Sketch of population <i>PopB</i> that is a subset of <i>A</i>, * obtained by an intersection of <i>A</i> with some other Theta Sketch <i>C</i>, * which acts like a predicate or selection clause.</li> * <li>The estimate of the ratio <i>PopB/PopA</i> is * estimate_of_b_over_a(<i>A, B</i>).</li> * <li>The Upper Bound estimate on the ratio PopB/PopA is * upper_bound_for_b_over_a(<i>A, B</i>).</li> * <li>The Lower Bound estimate on the ratio PopB/PopA is * lower_bound_for_b_over_a(<i>A, B</i>).</li> * </ul> * Note: The theta of <i>A</i> cannot be greater than the theta of <i>B</i>. * If <i>B</i> is formed as an intersection of <i>A</i> and some other set <i>C</i>, * then the theta of <i>B</i> is guaranteed to be less than or equal to the theta of <i>B</i>. */ template<typename ExtractKey> class bounds_on_ratios_in_theta_sketched_sets { public: /** * Gets the approximate lower bound for B over A based on a 95% confidence interval * @param sketch_a the sketch A * @param sketch_b the sketch B * @return the approximate lower bound for B over A */ template<typename SketchA, typename SketchB> static double lower_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) { const uint64_t theta64_a = sketch_a.get_theta64(); const uint64_t theta64_b = sketch_b.get_theta64(); check_thetas(theta64_a, theta64_b); const uint64_t count_b = sketch_b.get_num_retained(); const uint64_t count_a = theta64_a == theta64_b ? sketch_a.get_num_retained() : count_less_than_theta64(sketch_a, theta64_b); if (count_a == 0) return 0; const double f = sketch_b.get_theta(); return bounds_on_ratios_in_sampled_sets::lower_bound_for_b_over_a(count_a, count_b, f); } /** * Gets the approximate upper bound for B over A based on a 95% confidence interval * @param sketch_a the sketch A * @param sketch_b the sketch B * @return the approximate upper bound for B over A */ template<typename SketchA, typename SketchB> static double upper_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) { const uint64_t theta64_a = sketch_a.get_theta64(); const uint64_t theta64_b = sketch_b.get_theta64(); check_thetas(theta64_a, theta64_b); const uint64_t count_b = sketch_b.get_num_retained(); const uint64_t count_a = (theta64_a == theta64_b) ? sketch_a.get_num_retained() : count_less_than_theta64(sketch_a, theta64_b); if (count_a == 0) return 1; const double f = sketch_b.get_theta(); return bounds_on_ratios_in_sampled_sets::upper_bound_for_b_over_a(count_a, count_b, f); } /** * Gets the estimate for B over A * @param sketch_a the sketch A * @param sketch_b the sketch B * @return the estimate for B over A */ template<typename SketchA, typename SketchB> static double estimate_of_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) { const uint64_t theta64_a = sketch_a.get_theta64(); const uint64_t theta64_b = sketch_b.get_theta64(); check_thetas(theta64_a, theta64_b); const uint64_t count_b = sketch_b.get_num_retained(); const uint64_t count_a = (theta64_a == theta64_b) ? sketch_a.get_num_retained() : count_less_than_theta64(sketch_a, theta64_b); if (count_a == 0) return 0.5; return static_cast<double>(count_b) / static_cast<double>(count_a); } private: static inline void check_thetas(uint64_t theta_a, uint64_t theta_b) { if (theta_b > theta_a) { throw std::invalid_argument("theta_a must be <= theta_b"); } } template<typename Sketch> static uint64_t count_less_than_theta64(const Sketch& sketch, uint64_t theta) { uint64_t count = 0; for (const auto& entry: sketch) if (ExtractKey()(entry) < theta) ++count; return count; } }; } /* namespace datasketches */ # endif