glean/rts/ownership/setu32.h

/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include <stdint.h> #include <algorithm> #include <cassert> #include <cstring> #include <set> #include <tuple> #include <vector> #include <folly/Optional.h> #if __x86_64__ // AVX required #include <immintrin.h> #include <folly/experimental/EliasFanoCoding.h> #else #include "glean/rts/ownership/fallbackavx.h" #endif namespace facebook { namespace glean { namespace rts { #if !__x86_64__ typedef uint32_t __m256i __attribute__((vector_size(32))); #endif /** * An implementation of 256-bit bitsets. This uses AVX2, most probably * unnecessarily. */ namespace impl { #if __x86_64__ // use AVX inline bool empty(__m256i value) { return _mm256_testz_si256(value, value); } inline __m256i none() { return _mm256_setzero_si256(); } inline __m256i all() { return _mm256_set1_epi32(-1); } inline __m256i single(uint8_t n) { return _mm256_sllv_epi32( _mm256_set1_epi32(1), _mm256_sub_epi32( _mm256_set1_epi32(n), _mm256_set_epi32(224,192,160,128,96,64,32,0))); } inline bool includes(__m256i value, __m256i other) { return _mm256_testc_si256(value, other); } inline size_t count(__m256i value) { const uint64_t* p = reinterpret_cast<const uint64_t*>(&value); // _mm256_popcnt instructions require AVX512 return _mm_popcnt_u64(p[0]) + _mm_popcnt_u64(p[1]) + _mm_popcnt_u64(p[2]) + _mm_popcnt_u64(p[3]); } inline uint32_t upper(__m256i value) { const uint64_t* p = reinterpret_cast<const uint64_t*>(&value); auto a = _lzcnt_u64(p[3]); if (a < 64) { return 255 - a; } a = _lzcnt_u64(p[2]); if (a < 64) { return 191 - a; } a = _lzcnt_u64(p[1]); if (a < 64) { return 127 - a; } return (63 - _lzcnt_u64(p[0])); } inline __m256i or_(__m256i value, __m256i other) { return _mm256_or_si256(value, other); } inline __m256i and_(__m256i value, __m256i other) { return _mm256_and_si256(value, other); } inline __m256i xor_(__m256i value, __m256i other) { return _mm256_xor_si256(value, other); } #else // not x86_64 inline bool empty(__m256i value) { return !(value[0] || value[1] || value[2] || value[3] || value[4] || value[5] || value[6] || value[7]); } inline __m256i none() { __m256i v = {0, 0, 0, 0, 0, 0, 0, 0}; return v; } inline __m256i all() { __m256i v = {0, 0, 0, 0, 0, 0, 0, 0}; return ~v; } inline __m256i single(uint8_t n) { __m256i v0 = {224,192,160,128,96,64,32,0}, v1 = {1,1,1,1,1,1,1,1}; return v1 << (v0 - n); } inline bool includes(__m256i value, __m256i other) { return empty(~value & other); } inline size_t count(__m256i value) { const uint64_t* p = reinterpret_cast<const uint64_t*>(&value); return __builtin_popcountl(p[0]) + __builtin_popcountl(p[1]) + __builtin_popcountl(p[2]) + __builtin_popcountl(p[3]); } inline uint32_t upper(__m256i value) { const uint64_t* p = reinterpret_cast<const uint64_t*>(&value); if (p[3]) { return 255 - __builtin_clzl(p[3]); } if (p[2]) { return 191 - __builtin_clzl(p[2]); } if (p[1]) { return 127 - __builtin_clzl(p[1]); } if (p[0]) { return 63 - __builtin_clzl(p[0]); } return 0; // undefined } inline __m256i or_(__m256i value, __m256i other) { return value | other; } inline __m256i and_(__m256i value, __m256i other) { return value & other; } inline __m256i xor_(__m256i value, __m256i other) { return value ^ other; } #endif } // namespace impl struct Bits256 { __m256i value; Bits256() = default; explicit Bits256(__m256i v) : value(v) {} Bits256(const uint8_t* vals, uint8_t len) { *this = none().with(vals, len); } /// Check if this set is a superset of the other set bool includes(Bits256 other) const { return impl::includes(value, other.value); } bool contains(uint8_t n) const { return !(*this & single(n)).empty(); } bool empty() const { return impl::empty(value); } bool operator==(Bits256 other) const { return (*this ^ other).empty(); } bool operator!=(Bits256 other) const { return !(*this == other); } static Bits256 none() { return Bits256(impl::none()); } static Bits256 all() { return Bits256(impl::all()); } static Bits256 single(uint8_t n) { return Bits256(impl::single(n)); } size_t count() const { return impl::count(value); } uint32_t upper() const { return impl::upper(value); } Bits256 with(const uint8_t *vals, uint8_t len) const { // TODO: try to vectorise and/or use lookup table auto x = *this; for (uint8_t i = 0; i < len; ++i) { x |= single(vals[i]); } return x; } Bits256 operator|(Bits256 other) const { return Bits256(impl::or_(value, other.value)); } Bits256& operator|=(Bits256 other) { *this = *this | other; return *this; } Bits256 operator&(Bits256 other) const { return Bits256(impl::and_(value, other.value)); } Bits256& operator&=(Bits256 other) { *this = *this & other; return *this; } Bits256 operator^(Bits256 other) const { return Bits256(impl::xor_(value, other.value)); } Bits256& operator^=(Bits256 other) { *this = *this ^ other; return *this; } }; /** * A memory-efficient representation of sets of uint32_t. This is quite similar * to Roaring Bitmaps but with a different branching factor. It's not clear what * effect this has - I came up with this independently and didn't have time to * measure. * * We split the uint32_t key space into blocks of 256 values - the upper 24 * bits determine the block number and the lower 8 bits the index within the * block. Each block can be empty, sparse, dense or full. For each non-empty * block, we store a 32-bit block header in the the header array. The header * contain the 24-bit block id and an 8-bit control byte (`Hdr`) which says what * kind of block it is. The header array is sorted by block ids. * * Empty blocks are not stored in the set. Blocks with less than 32 elements are * sparse and are stored as sorted arrays of 8-bit indices (in the `sparse` * vector). Blocks with more than 31 but less than 256 elements are dense and * are stored as 256-bit bitsets in the `dense` vector. Blocks with 256 elements * are full - these blocks are marked separately in the control byte and no * additional data is stored. * * The only two operations on sets that we need are appending a value (which is * guaranteed to be >= the largest value in the set) and set union. In * particular, we don't need random access which allow us to keep the * representation simple. */ class SetU32 { public: /** * Header control words. This is morally equivalent to * * struct Hdr { * unsigned int id: 24; // block id * unsigned int len: 6; // number of elements in a sparse block or 0 if * // this block isn't sparse * unsigned int type: 2; // type = sparse, dense or full * }; * * However, we do our own bit fiddling since we can do certain things faster. * In particular, we try to avoid conditional branches as much as possible. */ struct Hdr { enum Type { Sparse = 0, Dense = 1, Full = 2 }; Hdr() = default; static Hdr null() { return {0, 0, Sparse}; } static Hdr sparse(uint32_t id, uint8_t len) { return {id, len, Sparse}; } static Hdr dense(uint32_t id) { return {id, 0, Dense}; } static Hdr full(uint32_t id) { return {id, 0, Full}; } uint32_t id() const { return value >> 8; } Type type() const { return static_cast<Type>(value & 3); } /// sparseLen can be called on non-sparse blocks and will be 0 for them uint32_t sparseLen() const { return (value >> 2) & 63; } void addSparseLen(uint8_t n) { assert(sparseLen() + n < 64); value += n << 2; } /// Number of Bits256 blocks - 1 for dense blocks, 0 otherwise. size_t denseLen() const { return value & 1; } bool operator==(Hdr other) const { return value == other.value; } bool operator!=(Hdr other) const { return value != other.value; } bool before(uint32_t id) const { return value < (id << 8); } private: Hdr(uint32_t id, uint8_t len, Type type) { value = (id << 8) | (uint32_t(len) << 2) | static_cast<uint32_t>(type); } uint32_t value; }; struct Sizes { size_t hdrs = 0; size_t dense = 0; size_t sparse = 0; static Sizes max(const Sizes& x, const Sizes& y) { return {std::max(x.hdrs, y.hdrs), std::max(x.dense, y.dense), std::max(x.sparse, y.sparse)}; } Sizes operator+(const Sizes& other) const { return {hdrs + other.hdrs, dense + other.dense, sparse + other.sparse}; } }; SetU32() = default; struct copy_capacity_tag {}; static constexpr copy_capacity_tag copy_capacity {}; SetU32(const SetU32& other, copy_capacity_tag); bool operator==(const SetU32& other) const; bool operator!=(const SetU32& other) const { return !(*this == other); } uint64_t hash(uint64_t seed) const; Sizes sizes() const; Sizes capacities() const; size_t bytes() const; size_t size() const; uint32_t upper() const; struct Block { Hdr hdr; std::vector<Bits256>::const_iterator dense; std::vector<uint8_t>::const_iterator sparse; bool operator==(const Block& other) const; bool operator!=(const Block& other) const { return !(*this == other); } bool includes(const Block& other) const; }; struct const_iterator { std::vector<Hdr>::const_iterator hdrs; Block block; const_iterator( std::vector<Hdr>::const_iterator hdrs, std::vector<Bits256>::const_iterator dense, std::vector<uint8_t>::const_iterator sparse) : hdrs(hdrs), block{Hdr::null(), dense, sparse} {} const Block& operator*() const { const_cast<Hdr&>(block.hdr) = *hdrs; return block; } const Block* operator->() const { return &operator*(); } const_iterator& operator++() { block.sparse += hdrs->sparseLen(); block.dense += hdrs->denseLen(); ++hdrs; return *this; } const_iterator operator++(int) { const auto x = *this; ++*this; return x; } bool operator==(const const_iterator& other) const { return hdrs == other.hdrs; } bool operator!=(const const_iterator& other) const { return hdrs != other.hdrs; } }; const_iterator begin() const { return { hdrs.begin(), dense.begin(), sparse.begin() }; } const_iterator end() const { return { hdrs.end(), dense.end(), sparse.end() }; } static const_iterator lower_bound( const_iterator start, const_iterator finish, uint32_t id); void reserve(Sizes sizes); void shrink_to_fit(); void clear(); /// Append a new value which must be >= the largest value in the set void append(uint32_t value); static SetU32 from(const std::set<uint32_t>& set) { SetU32 setu32; for (auto elt : set) { setu32.append(elt); } return setu32; } /** * Merge two sets. If `right` is a subset of `left` or vice versa, returns a * pointer to the superset. Otherwise, stores the result in `result` and * returns a pointer to it. */ static const SetU32 *merge(SetU32& result, const SetU32& left, const SetU32& right); template<typename F> void foreach(F&& f) const { for (auto &block : *this) { auto id = block.hdr.id() << 8; switch (block.hdr.type()) { case SetU32::Hdr::Sparse: { for (uint32_t i = 0; i < block.hdr.sparseLen(); i++) { f(id | block.sparse[i]); } break; } case SetU32::Hdr::Dense: { for (uint32_t i = 0; i < 256; i++) { if (block.dense->contains(i)) { f(id | i); } } break; } case SetU32::Hdr::Full: { for (uint32_t i = 0; i < 256; i++) { f(id | i); } break; } } } } using MutableEliasFanoList = folly::compression::MutableEliasFanoCompressedList; using EliasFanoList = folly::compression::EliasFanoCompressedList; MutableEliasFanoList toEliasFano(); static SetU32 fromEliasFano(const EliasFanoList& list); static void dump(SetU32 &); private: static bool fitsSparse(uint8_t m, uint8_t n) { return int(m) + n < 32; } void append(const_iterator start, const_iterator finish); void append(uint32_t id, Bits256 w); void appendMerge( const_iterator left, const_iterator left_end, const_iterator right, const_iterator right_end); void appendMerge(Block left, Block right); std::vector<Hdr> hdrs; std::vector<Bits256> dense; std::vector<uint8_t> sparse; }; } } }

glean/rts/ownership/setu32.h (382 lines of code) (raw):