cpc/include/cpc_sketch.hpp

/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef CPC_SKETCH_HPP_ #define CPC_SKETCH_HPP_ #include <iostream> #include <functional> #include <string> #include <vector> #include "u32_table.hpp" #include "cpc_common.hpp" #include "cpc_compressor.hpp" #include "cpc_confidence.hpp" #include "common_defs.hpp" namespace datasketches { // forward-declarations template<typename A> class cpc_sketch_alloc; template<typename A> class cpc_union_alloc; /// CPC sketch alias with default allocator using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>; /** * Allocation and initialization of global decompression (decoding) tables. * Call this before anything else if you want to control the initialization time. * For instance, to have this happen outside of a transaction context. * Otherwise initialization happens on the first use (serialization or deserialization). * It is safe to call more than once assuming no race conditions. * This is not thread safe! Neither is the rest of the library. */ template<typename A> void cpc_init(); /** * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch * * This is a very compact (in serialized form) distinct counting sketch. * The theory is described in the following paper: * https://arxiv.org/abs/1708.06839 * * @author Kevin Lang * @author Alexander Saydakov */ template<typename A> class cpc_sketch_alloc { public: using allocator_type = A; /** * Creates an instance of the sketch given the lg_k parameter and hash seed. * @param lg_k base 2 logarithm of the number of bins in the sketch * @param seed for hash function * @param allocator instance of an allocator */ explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A()); /// @return allocator A get_allocator() const; /// @return configured lg_k of this sketch uint8_t get_lg_k() const; /// @return true if this sketch represents an empty set bool is_empty() const; /// @return estimate of the distinct count of the input stream double get_estimate() const; /** * Returns the approximate lower error bound given a parameter kappa (1, 2 or 3). * This parameter is similar to the number of standard deviations of the normal distribution * and corresponds to approximately 67%, 95% and 99% confidence intervals. * @param kappa parameter to specify confidence interval (1, 2 or 3) * @return the lower bound */ double get_lower_bound(unsigned kappa) const; /** * Returns the approximate upper error bound given a parameter kappa (1, 2 or 3). * This parameter is similar to the number of standard deviations of the normal distribution * and corresponds to approximately 67%, 95% and 99% confidence intervals. * @param kappa parameter to specify confidence interval (1, 2 or 3) * @return the upper bound */ double get_upper_bound(unsigned kappa) const; /** * Update this sketch with a given string. * @param value string to update the sketch with */ void update(const std::string& value); /** * Update this sketch with a given unsigned 64-bit integer. * @param value uint64_t to update the sketch with */ void update(uint64_t value); /** * Update this sketch with a given signed 64-bit integer. * @param value int64_t to update the sketch with */ void update(int64_t value); /** * Update this sketch with a given unsigned 32-bit integer. * For compatibility with Java implementation. * @param value uint32_t to update the sketch with */ void update(uint32_t value); /** * Update this sketch with a given signed 32-bit integer. * For compatibility with Java implementation. * @param value int32_t to update the sketch with */ void update(int32_t value); /** * Update this sketch with a given unsigned 16-bit integer. * For compatibility with Java implementation. * @param value uint16_t to update the sketch with */ void update(uint16_t value); /** * Update this sketch with a given signed 16-bit integer. * For compatibility with Java implementation. * @param value int16_t to update the sketch with */ void update(int16_t value); /** * Update this sketch with a given unsigned 8-bit integer. * For compatibility with Java implementation. * @param value uint8_t to update the sketch with */ void update(uint8_t value); /** * Update this sketch with a given signed 8-bit integer. * For compatibility with Java implementation. * @param value int8_t to update the sketch with */ void update(int8_t value); /** * Update this sketch with a given double-precision floating point value. * For compatibility with Java implementation. * @param value double to update the sketch with */ void update(double value); /** * Update this sketch with a given floating point value. * For compatibility with Java implementation. * @param value float to update the sketch with */ void update(float value); /** * Update this sketch with given data of any type. * This is a "universal" update that covers all cases above, * but may produce different hashes. * Be very careful to hash input values consistently using the same approach * both over time and on different platforms * and while passing sketches between C++ environment and Java environment. * Otherwise two sketches that should represent overlapping sets will be disjoint * For instance, for signed 32-bit values call update(int32_t) method above, * which does widening conversion to int64_t, if compatibility with Java is expected * @param value pointer to the data * @param size of the data in bytes */ void update(const void* value, size_t size); /** * Returns a human-readable summary of this sketch * @return a human-readable summary of this sketch */ string<A> to_string() const; /** * This method serializes the sketch into a given stream in a binary form * @param os output stream */ void serialize(std::ostream& os) const; // This is a convenience alias for users // The type returned by the following serialize method using vector_bytes = vector_u8<A>; /** * This method serializes the sketch as a vector of bytes. * An optional header can be reserved in front of the sketch. * It is an uninitialized space of a given size. * This header is used in Datasketches PostgreSQL extension. * @param header_size_bytes space to reserve in front of the sketch * @return serialized sketch as a vector of bytes */ vector_bytes serialize(unsigned header_size_bytes = 0) const; /** * This method deserializes a sketch from a given stream. * @param is input stream * @param seed the seed for the hash function that was used to create the sketch * @param allocator instance of an Allocator * @return an instance of a sketch */ static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A()); /** * This method deserializes a sketch from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array * @param seed the seed for the hash function that was used to create the sketch * @param allocator instance of an Allocator * @return an instance of the sketch */ static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A()); /** * The actual size of a compressed CPC sketch has a small random variance, but the following * empirically measured size should be large enough for at least 99.9 percent of sketches. * * <p>For small values of <i>n</i> the size can be much smaller. * * @param lg_k the given value of lg_k. * @return the estimated maximum compressed serialized size of a sketch. */ static size_t get_max_serialized_size_bytes(uint8_t lg_k); /// @private for internal use uint32_t get_num_coupons() const; /// @private for debugging // this should catch some forms of corruption during serialization-deserialization bool validate() const; private: static const uint8_t SERIAL_VERSION = 1; static const uint8_t FAMILY = 16; enum flags { IS_BIG_ENDIAN, IS_COMPRESSED, HAS_HIP, HAS_TABLE, HAS_WINDOW }; // Note: except for brief transitional moments, these sketches always obey // the following strict mapping between the flavor of a sketch and the // number of coupons that it has collected enum flavor { EMPTY, // 0 == C < 1 SPARSE, // 1 <= C < 3K/32 HYBRID, // 3K/32 <= C < K/2 PINNED, // K/2 <= C < 27K/8 [NB: 27/8 = 3 + 3/8] SLIDING // 27K/8 <= C }; uint8_t lg_k; uint64_t seed; bool was_merged; // is the sketch the result of merging? uint32_t num_coupons; // the number of coupons collected so far u32_table<A> surprising_value_table; vector_u8<A> sliding_window; uint8_t window_offset; // derivable from num_coupons, but made explicit for speed uint8_t first_interesting_column; // This is part of a speed optimization double kxp; double hip_est_accum; // for deserialization and cpc_union::get_result() cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column, u32_table<A>&& table, vector_u8<A>&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed); inline void row_col_update(uint32_t row_col); inline void update_sparse(uint32_t row_col); inline void update_windowed(uint32_t row_col); inline void update_hip(uint32_t row_col); void promote_sparse_to_windowed(); void move_window(); void refresh_kxp(const uint64_t* bit_matrix); friend double get_hip_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa); friend double get_hip_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa); friend double get_icon_confidence_lb<A>(const cpc_sketch_alloc<A>& sketch, int kappa); friend double get_icon_confidence_ub<A>(const cpc_sketch_alloc<A>& sketch, int kappa); double get_hip_estimate() const; double get_icon_estimate() const; inline flavor determine_flavor() const; static inline flavor determine_flavor(uint8_t lg_k, uint64_t c); static inline uint8_t determine_correct_offset(uint8_t lg_k, uint64_t c); // this produces a full-size k-by-64 bit matrix vector_u64<A> build_bit_matrix() const; static uint8_t get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window); inline void write_hip(std::ostream& os) const; inline size_t copy_hip_to_mem(void* dst) const; static void check_lg_k(uint8_t lg_k); friend cpc_compressor<A>; friend cpc_union_alloc<A>; }; } /* namespace datasketches */ #include "cpc_sketch_impl.hpp" #endif

cpc/include/cpc_sketch.hpp (98 lines of code) (raw):