velox/functions/lib/KllSketch.h (57 lines of code) (raw):
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <functional>
#include <random>
#include "folly/Random.h"
#include "folly/Range.h"
namespace facebook::velox::functions::kll {
constexpr uint16_t kDefaultK = 200;
/// Estimate the proper k value to ensure the error bound epsilon.
uint16_t kFromEpsilon(double epsilon);
/// Implementation of KLL sketch that can achieve nearly optimal
/// accuracy per retained item.
///
/// This sketch is configured with a parameter k, which affects the
/// size of the sketch and its estimation error.
///
/// The estimation error is commonly called epsilon and is a fraction
/// between zero and one. Larger values of k result in smaller values
/// of epsilon. Epsilon is always with respect to the quantile and
/// cannot be applied to the corresponding values.
///
/// The default k of 200 yields a "single-sided" epsilon of about
/// 1.33%.
///
/// See https://arxiv.org/abs/1603.05346v2 for more details.
template <
typename T,
typename Allocator = std::allocator<T>,
typename Compare = std::less<T>>
struct KllSketch {
KllSketch(
uint16_t k = kll::kDefaultK,
const Allocator& = Allocator(),
uint32_t seed = folly::Random::rand32());
/// Add one new value to the sketch.
void insert(T value);
/// Merge this sketch with values from multiple other sketches.
/// @tparam Iter Iterator type dereferenceable to the same type as this sketch
/// (KllSketch<T, Allocator, Compare>)
/// @param sketches Range of sketches to be merged to this one
template <typename Iter>
void merge(const folly::Range<Iter>& sketches);
/// Estimate the value of the given quantile.
/// @param quantile Quantile in [0, 1] to be estimated
T estimateQuantile(double quantile);
/// Estimate the values of the given quantiles. This is more
/// efficient than calling estimateQuantile(double) repeatedly.
/// @tparam Iter Iterator type dereferenceable to double
/// @param quantiles Range of quantiles in [0, 1] to be estimated
template <typename Iter>
std::vector<T, Allocator> estimateQuantiles(
const folly::Range<Iter>& quantiles);
/// The total number of values being added to the sketch.
size_t totalCount() const {
return n_;
}
private:
uint32_t insertPosition();
int findLevelToCompact() const;
void addEmptyTopLevelToCompletelyFullSketch();
template <typename Iter>
void estimateQuantiles(const folly::Range<Iter>& fractions, T* out);
uint8_t numLevels() const {
return levels_.size() - 1;
}
uint32_t getNumRetained() const {
return levels_.back() - levels_[0];
}
uint32_t safeLevelSize(uint8_t level) const {
return level < numLevels() ? levels_[level + 1] - levels_[level] : 0;
}
using AllocU32 = typename std::allocator_traits<
Allocator>::template rebind_alloc<uint32_t>;
uint16_t k_;
Allocator allocator_;
std::independent_bits_engine<folly::Random::DefaultGenerator, 1, uint32_t>
randomBit_;
size_t n_;
T minValue_;
T maxValue_;
std::vector<T, Allocator> items_;
std::vector<uint32_t, AllocU32> levels_;
bool isLevelZeroSorted_;
};
} // namespace facebook::velox::functions::kll
#include "velox/functions/lib/KllSketch-inl.h"