filters/include/bloom_filter.hpp (159 lines of code) (raw):

/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef _BLOOM_FILTER_HPP_ #define _BLOOM_FILTER_HPP_ #include <cstdint> #include <memory> #include <vector> #include "common_defs.hpp" namespace datasketches { // forward declarations template<typename A> class bloom_filter_alloc; // aliases with default allocator using bloom_filter = bloom_filter_alloc<std::allocator<uint8_t>>; /** * <p>A Bloom filter is a data structure that can be used for probabilistic * set membership.</p> * * <p>When querying a Bloom filter, there are no false positives. Specifically: * When querying an item that has already been inserted to the filter, the filter will * always indicate that the item is present. There is a chance of false positives, where * querying an item that has never been presented to the filter will indicate that the * item has already been seen. Consequently, any query should be interpreted as * "might have seen."</p> * * <p>A standard Bloom filter is unlike typical sketches in that it is not sub-linear * in size and does not resize itself. A Bloom filter will work up to a target number of * distinct items, beyond which it will saturate and the false positive rate will start to * increase. The size of a Bloom filter will be linear in the expected number of * distinct items.</p> * * <p>See the bloom_filter_builder_alloc class for methods to create a filter, especially * one sized correctly for a target number of distinct elements and a target * false positive probability.</p> * * <p>This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher, * "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley Interscience, 2008, pp. 187-218.</p> */ template<typename Allocator = std::allocator<uint8_t>> class bloom_filter_alloc { public: // no public constructor; use builder or deserialize/wrap methods class builder; /** * This method deserializes a Bloom filter from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array * @param allocator instance of an Allocator * @return an instance of a Bloom filter */ static bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator()); /** * This method deserializes a Bloom filter from a given stream. * @param is input stream * @param allocator instance of an Allocator * @return an instance of a Bloom filter */ static bloom_filter_alloc deserialize(std::istream& is, const Allocator& allocator = Allocator()); /** * @brief Wraps the provided memory as a read-only Bloom filter. Reads the data in-place and does * not take ownership of the underlying memory. Does not allow modifying the filter. * * @param data The memory to wrap * @param length_bytes The length of the memory in bytes * @param allocator instance of an Allocator * @return a const (read-only) Bloom filter wrapping the provided memory */ static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator()); /** * @brief Wraps the provided memory as a writable Bloom filter. Reads the data in-place and does * not take ownership of the underlying memory. Allows modifying the filter. * * @param data the memory to wrap * @param length_bytes the length of the memory in bytes * @param allocator instance of an Allocator * @return a Bloom filter wrapping the provided memory */ static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator()); /** * Copy constructor * @param other filter to be copied */ bloom_filter_alloc(const bloom_filter_alloc& other); /** Move constructor * @param other filter to be moved */ bloom_filter_alloc(bloom_filter_alloc&& other) noexcept; /** * Copy assignment * @param other filter to be copied * @return reference to this filter */ bloom_filter_alloc& operator=(const bloom_filter_alloc& other); /** * Move assignment * @param other filter to be moved * @return reference to this filter */ bloom_filter_alloc& operator=(bloom_filter_alloc&& other); /** * @brief Destroy the bloom filter object */ ~bloom_filter_alloc(); // This is a convenience alias for users // The type returned by the following serialize method using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>; /** * This method serializes the filter as a vector of bytes. * An optional header can be reserved in front of the filter. * It is a blank space of a given size. * Some integrations such as PostgreSQL may need this header space. * @param header_size_bytes space to reserve in front of the filter * @return serialized filter as a vector of bytes */ vector_bytes serialize(unsigned header_size_bytes = 0) const; /** * This method serializes the filter into a given stream in a binary form * @param os output stream */ void serialize(std::ostream& os) const; /** * Checks if the Bloom Filter has processed any items * @return True if the BloomFilter is empty, otherwise False */ bool is_empty() const; /** * Returns the number of bits in the Bloom Filter that are set to 1. * @return The number of bits in use in this filter */ uint64_t get_bits_used(); /** * Returns the total number of bits in the Bloom Filter. * @return The total size of the Bloom Filter */ uint64_t get_capacity() const; /** * Returns the configured number of hash functions for this Bloom Filter * @return The number of hash functions to apply to inputs */ uint16_t get_num_hashes() const; /** * Returns the hash seed for this Bloom Filter. * @return The hash seed for this filter */ uint64_t get_seed() const; /** * Resets the Bloom Filter to its original state. */ void reset(); // UPDATE METHODS /** * Updates the filter with the given std::string. * The string is converted to a byte array using UTF8 encoding. * If the string is null or empty no update attempt is made and the method returns. * @param item The given string. */ void update(const std::string& item); /** * Updates the filter with the given unsigned 64-bit integer. * @param item The given integer. */ void update(uint64_t item); /** * Updates the filter with the given unsigned 32-bit integer. * @param item The given integer. */ void update(uint32_t item); /** * Updates the filter with the given unsigned 16-bit integer. * @param item The given integer. */ void update(uint16_t item); /** * Updates the filter with the given unsigned 8-bit integer. * @param item The given integer. */ void update(uint8_t item); /** * Updates the filter with the given signed 64-bit integer. * @param item The given integer. */ void update(int64_t item); /** * Updates the filter with the given signed 32-bit integer. * @param item The given integer. */ void update(int32_t item); /** * Updates the filter with the given signed 16-bit integer. * @param item The given integer. */ void update(int16_t item); /** * Updates the filter with the given signed 8-bit integer. * @param item The given integer. */ void update(int8_t item); /** * Updates the filter with the given 64-bit floating point value. * @param item The given double. */ void update(double item); /** * Updates the filter with the give 32-bit floating point value. * @param item The given float. */ void update(float item); /** * Updates the filter with the given data array. * @param data The given array. * @param length_bytes The array length in bytes. */ void update(const void* data, size_t length_bytes); // QUERY-AND-UPDATE METHODS /** * Updates the filter with the given std::string and returns the result from * querying the filter prior to the update. * The string is converted to a byte array using UTF8 encoding. * If the string is null or empty no update attempt is made and the method returns false. * @param item The given string. * @return The result from querying the filter prior to the update. */ bool query_and_update(const std::string& item); /** * Updates the filter with the given unsigned 64-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(uint64_t item); /** * Updates the filter with the given unsigned 32-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(uint32_t item); /** * Updates the filter with the given unsigned 16-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(uint16_t item); /** * Updates the filter with the given unsigned 8-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(uint8_t item); /** * Updates the filter with the given signed 64-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(int64_t item); /** * Updates the filter with the given signed 32-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(int32_t item); /** * Updates the filter with the given signed 16-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(int16_t item); /** * Updates the filter with the given signed 8-bit integer and returns the result from * querying the filter prior to the update. * @param item The given integer. * @return The result from querying the filter prior to the update. */ bool query_and_update(int8_t item); /** * Updates the filter with the given 64-bit floating point value and returns the result from * querying the filter prior to the update. * @param item The given double. * @return The result from querying the filter prior to the update. */ bool query_and_update(double item); /** * Updates the filter with the give 32-bit floating point value and returns the result from * querying the filter prior to the update. * @param item The given float. * @return The result from querying the filter prior to the update. */ bool query_and_update(float item); /** * Updates the filter with the given data array and returns the result from * querying the filter prior to the update. * @param data The given array. * @param length_bytes The array length in bytes. * @return The result from querying the filter prior to the update. */ bool query_and_update(const void* data, size_t length_bytes); // QUERY METHODS /** * Queries the filter with the given std::string and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * The string is converted to a byte array using UTF8 encoding. * If the string is null or empty the method always returns false. * @param item The given string. * @return The result from querying the filter with the given item. */ bool query(const std::string& item) const; /** * Queries the filter with the given unsigned 64-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(uint64_t item) const; /** * Queries the filter with the given unsigned 32-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(uint32_t item) const; /** * Queries the filter with the given unsigned 16-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(uint16_t item) const; /** * Queries the filter with the given unsigned 8-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(uint8_t item) const; /** * Queries the filter with the given signed 64-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(int64_t item) const; /** * Queries the filter with the given signed 32-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(int32_t item) const; /** * Queries the filter with the given signed 16-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(int16_t item) const; /** * Queries the filter with the given signed 8-bit integer and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given integer. * @return The result from querying the filter with the given item. */ bool query(int8_t item) const; /** * Queries the filter with the given 64-bit floating point value and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given double. * @return The result from querying the filter with the given item. */ bool query(double item) const; /** * Queries the filter with the given 32-bit floating point value and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param item The given float. * @return The result from querying the filter with the given item. */ bool query(float item) const; /** * Queries the filter with the given data array. and returns the result from * Queries the filter with the given 64-bit floating point value and returns whether the value * might have been seen previoiusly. The filter's expected Fale Positive Probability * determines the chances of a true result being a false positive. False engatives are * never possible. * @param data The given array. * @param length_bytes The array length in bytes. * @return The result from querying the filter with the given item. */ bool query(const void* data, size_t length_bytes) const; // OTHER OPERATIONS /** * Unions two Bloom Filters by applying a logical OR. The result will recognized * any values seen by either filter (as well as false positives). * @param other A BloomFilter to union with this one */ void union_with(const bloom_filter_alloc& other); /** * Intersects two Bloom Filters by applying a logical AND. The result will recognize * only values seen by both filters (as well as false positives). * @param other A Bloom Filter to union with this one */ void intersect(const bloom_filter_alloc& other); /** * Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership. */ void invert(); /** * Helps identify if two Bloom Filters may be unioned or intersected. * @param other A Bloom Filter to check for compatibility with this one * @return True if the filters are compatible, otherwise false */ bool is_compatible(const bloom_filter_alloc& other) const; /** * @brief Checks if the Bloom Filter is read-only. * * @return True if the filter is read-only, otherwise false. */ bool is_read_only() const; /** * @brief Returns whether the filter owns its underlying memory * @return True if the filter owns its memory, otherwise false */ bool is_memory_owned() const; /** * @brief Checks if the Bloom Filter was created by a call to wrap(). * * @return True if the filter was created by wrapping memory, otherwise false. */ bool is_wrapped() const; /** * @brief Returns a pointer to the memory this filter wraps, if it exists. * @return A pointer to the wrapped memory, or nullptr if is_wrapped() is false. */ const uint8_t* get_wrapped_memory() const; /** * @brief Gets the serialized size of the Bloom Filter in bytes * @return The serialized size of the Bloom Filter in bytes */ size_t get_serialized_size_bytes() const; /** * @brief Gets the serialized size of the Bloom Filter with the given number of bits, in bytes * @param num_bits The number of bits in the Bloom Filter for the size calculation * @return The serialized size of a Bloom Filter with a capacity of num_bits, in bytes */ static size_t get_serialized_size_bytes(uint64_t num_bits); /** * @brief Returns a human-readable string representation of the Bloom Filter. * @param print_filter If true, the filter bits will be printed as well. * @return A human-readable string representation of the Bloom Filter. */ string<Allocator> to_string(bool print_filter = false) const; private: using A = Allocator; using AllocUint8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; static const uint64_t DIRTY_BITS_VALUE = static_cast<uint64_t>(-1LL); static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs static const uint64_t BIT_ARRAY_LENGTH_OFFSET_BYTES = 16; static const uint64_t NUM_BITS_SET_OFFSET_BYTES = 24; static const uint64_t BIT_ARRAY_OFFSET_BYTES = 32; static const uint64_t MAX_FILTER_SIZE_BITS = (INT32_MAX - MAX_HEADER_SIZE_BYTES) * sizeof(uint64_t); static const uint8_t PREAMBLE_LONGS_EMPTY = 3; static const uint8_t PREAMBLE_LONGS_STANDARD = 4; static const uint8_t FAMILY_ID = 21; static const uint8_t SER_VER = 1; static const uint8_t EMPTY_FLAG_MASK = 4; // used by builder methods bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator); bloom_filter_alloc(uint8_t* memory, size_t length_bytes, uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator); // used by deserialize and wrap bloom_filter_alloc(uint64_t seed, uint16_t num_hashes, bool is_dirty, bool is_owned, bool is_read_only, uint64_t capacity_bits, uint64_t num_bits_set, uint8_t* bit_array, uint8_t* memory, const A& allocator); static bloom_filter_alloc internal_deserialize_or_wrap(void* bytes, size_t length_bytes, bool read_only, bool wrap, const A& allocator); // internal query/update methods void internal_update(uint64_t h0, uint64_t h1); bool internal_query_and_update(uint64_t h0, uint64_t h1); bool internal_query(uint64_t h0, uint64_t h1) const; void update_num_bits_set(uint64_t num_bits_set); Allocator allocator_; uint64_t seed_; uint16_t num_hashes_; bool is_dirty_; bool is_owned_; // if true, data is not owned by filter AND memory_ holds the entire filter not just the bit array bool is_read_only_; // if true, filter is read-only uint64_t capacity_bits_; uint64_t num_bits_set_; uint8_t* bit_array_; // data backing bit_array_, regardless of ownership uint8_t* memory_; // if wrapped, pointer to the start of the filter, otheriwse nullptr }; /** * <p>This class provides methods to help estimate the correct parameters when * creating a Bloom filter, and methods to create the filter using those values.</p> * * <p>The underlying math is described in the * <a href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'> * Wikipedia article on Bloom filters</a>.</p> */ template<typename Allocator> class bloom_filter_alloc<Allocator>::builder { public: /** * Returns the optimal number of hash functions to given target numbers of distinct items * and the Bloom filter size in bits. This function will provide a result even if the input * values exceed the capacity of a single Bloom filter. * @param max_distinct_items The maximum expected number of distinct items to add to the filter * @param num_filter_bits The intended size of the Bloom Filter in bits * @return The suggested number of hash functions to use with the filter */ static uint16_t suggest_num_hashes(uint64_t max_distinct_items, uint64_t num_filter_bits); /** * Returns the optimal number of hash functions to achieve a target false positive probability. * @param target_false_positive_prob A desired false positive probability per item * @return The suggested number of hash functions to use with the filter. */ static uint16_t suggest_num_hashes(double target_false_positive_prob); /** * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct * items and a target false positive probability. * @param max_distinct_items The maximum expected number of distinct items to add to the filter * @param target_false_positive_prob A desired false positive probability per item * @return The suggested number of bits to use with the filter */ static uint64_t suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob); /** * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, * using a random base seed for the hash function. * @param max_distinct_items The maximum expected number of distinct items to add to the filter * @param target_false_positive_prob A desired false positive probability per item * @param seed A bash hash seed (default: random) * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new Bloom filter configured for the given input parameters */ static bloom_filter_alloc<Allocator> create_by_accuracy(uint64_t max_distinct_items, double target_false_positive_prob, uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); /** * Creates a Bloom filter with given number of bits and number of hash functions, * using the provided base seed for the hash function. * * @param num_bits The size of the BloomFilter, in bits * @param num_hashes The number of hash functions to apply to items * @param seed A base hash seed (default: random) * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new Bloom filter configured for the given input parameters */ static bloom_filter_alloc<Allocator> create_by_size(uint64_t num_bits, uint16_t num_hashes, uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); /** * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, * using a random base seed for the hash function and writing into the provided memory. The filter does * not take ownership of the memory but does overwrite the full contents. * * @param memory A pointer to the memory to use for the filter * @param length_bytes The length of the memory in bytes * @param max_distinct_items The maximum expected number of distinct items to add to the filter * @param target_false_positive_prob A desired false positive probability per item * @param dstMem A WritableMemory to hold the initialized filter * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new Bloom filter configured for the given input parameters in the provided memory */ static bloom_filter_alloc<Allocator> initialize_by_accuracy(void* memory, size_t length_bytes, uint64_t max_distinct_items, double target_false_positive_prob, uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); /** * Initializes a Bloom filter with given number of bits and number of hash functions, * using the provided base seed for the hash function and writing into the provided memory. The filter does * not take ownership of the memory but does overwrite the full contents. * * @param memory A pointer to the memory to use for the filter * @param length_bytes The length of the memory in bytes * @param num_bits The size of the BloomFilter, in bits * @param num_hashes The number of hash functions to apply to items * @param seed A base hash seed (default: random) * @param allocator The allocator to use for the filter (default: standard allocator) * @return A new BloomFilter configured for the given input parameters */ static bloom_filter_alloc<Allocator> initialize_by_size(void* memory, size_t length_bytes, uint64_t num_bits, uint16_t num_hashes, uint64_t seed = generate_random_seed(), const Allocator& allocator = Allocator()); /** * @brief Generates a random 64-bit seed value * * @return uint64_t a random value over the range of unsigned 64-bit integers */ static uint64_t generate_random_seed(); private: static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes); static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob); }; } // namespace datasketches #include "bloom_filter_builder_impl.hpp" #include "bloom_filter_impl.hpp" #endif // _BLOOM_FILTER_HPP_ b