in folly/container/detail/F14Table.h [291:543]
struct alignas(kRequiredVectorAlignment) F14Chunk {
using Item = ItemType;
// For our 16 byte vector alignment (and assuming alignof(Item) >=
// 4) kCapacity of 14 is the most space efficient. Slightly smaller
// or larger capacities can help with cache alignment in a couple of
// cases without wasting too much space, but once the items are larger
// then we're unlikely to get much benefit anyway. The only case we
// optimize is using kCapacity of 12 for 4 byte items, which makes the
// chunk take exactly 1 cache line, and adding 16 bytes of padding for
// 16 byte items so that a chunk takes exactly 4 cache lines.
static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 12 : 14;
// Preferred fill level, 2 slots below full. NOTE(review): the exact
// use of this slack lives in the table code outside this chunk.
static constexpr unsigned kDesiredCapacity = kCapacity - 2;
// One extra item slot of padding for 16-byte items so the chunk is
// exactly 4 cache lines (see comment above).
static constexpr unsigned kAllocatedCapacity =
kCapacity + (sizeof(Item) == 16 ? 1 : 0);
// If kCapacity == 12 then we get 16 bits of capacityScale by using
// tag 12 and 13, otherwise we only get 4 bits of control_
static constexpr std::size_t kCapacityScaleBits = kCapacity == 12 ? 16 : 4;
static constexpr std::size_t kCapacityScaleShift = kCapacityScaleBits - 4;
// Mask covering the kCapacity real slot positions in the platform's
// tag-mask encoding; ANDed after movemask/vshrn to discard the lanes
// belonging to padding tags.
static constexpr MaskType kFullMask = FullMask<kCapacity>::value;
// Non-empty tags have their top bit set. tags_ array might be bigger
// than kCapacity to keep alignment of first item. When kCapacity is
// 12, tags_[12..13] are repurposed to hold the 16-bit capacityScale
// (see capacityScale()/setCapacityScale()).
std::array<uint8_t, 14> tags_;
// Bits 0..3 of chunk 0 record the scaling factor between the number of
// chunks and the max size without rehash. Bits 4-7 in any chunk are a
// 4-bit counter of the number of values in this chunk that were placed
// because they overflowed their desired chunk (hostedOverflowCount).
uint8_t control_;
// The number of values that would have been placed into this chunk if
// there had been space, including values that also overflowed previous
// full chunks. This value saturates; once it becomes 255 it no longer
// increases nor decreases.
uint8_t outboundOverflowCount_;
// Uninitialized, suitably aligned storage; Item objects are
// constructed in place here and accessed via itemAddr()/item().
std::array<aligned_storage_for_t<Item>, kAllocatedCapacity> rawItems_;
// Returns the shared all-empty sentinel chunk. kEmptyTagVector is
// only guaranteed alignof(max_align_t) alignment, so when the vector
// alignment requirement is stricter we bump the pointer up to the
// next kRequiredVectorAlignment boundary (the backing storage is
// over-sized to allow this).
static F14Chunk* emptyInstance() {
  auto addr = reinterpret_cast<char*>(&kEmptyTagVector);
  if (kRequiredVectorAlignment > alignof(max_align_t)) {
    auto const misalign =
        reinterpret_cast<uintptr_t>(addr) % kRequiredVectorAlignment;
    addr += kRequiredVectorAlignment - misalign;
  }
  auto chunk = reinterpret_cast<F14Chunk*>(addr);
  FOLLY_SAFE_DCHECK(
      (reinterpret_cast<uintptr_t>(chunk) % kRequiredVectorAlignment) == 0,
      "");
  return chunk;
}
// Marks every slot empty by zeroing the first 16 bytes of the chunk:
// tags_ (14 bytes) plus control_ and outboundOverflowCount_ (1 byte
// each). rawItems_ is deliberately left untouched.
void clear() {
// tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;
// gcc < 6 doesn't exploit chunk alignment to generate the optimal
// SSE clear from memset. This is very hot code, so it is worth
// handling that case specially.
#if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__
// this doesn't violate strict aliasing rules because __m128i is
// tagged as __may_alias__
auto* v = static_cast<__m128i*>(static_cast<void*>(&tags_[0]));
_mm_store_si128(v, _mm_setzero_si128());
#else
std::memset(&tags_[0], '\0', 16);
#endif
}
// Copies rhs's hosted-overflow nibble and outbound overflow counter
// into this chunk. The += preserves our low nibble (capacityScale
// bits); the DCHECK guarantees our high nibble is zero, so the
// addition cannot carry into the low nibble.
void copyOverflowInfoFrom(F14Chunk const& rhs) {
FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, "");
control_ += static_cast<uint8_t>(rhs.control_ & 0xf0);
outboundOverflowCount_ = rhs.outboundOverflowCount_;
}
// Count of values stored here that overflowed their desired chunk
// (the high nibble of control_).
unsigned hostedOverflowCount() const { return control_ >> 4; }
// Operands for adjustHostedOverflowCount(). Adding the "decrement"
// constant relies on uint8_t wraparound to subtract 0x10 from the
// high nibble without touching the low one.
static constexpr uint8_t kIncrHostedOverflowCount = 0x10;
static constexpr uint8_t kDecrHostedOverflowCount =
static_cast<uint8_t>(-0x10);
void adjustHostedOverflowCount(uint8_t op) { control_ += op; }
// True iff this chunk carries a nonzero capacityScale, i.e. it was
// the chunk tagged via markEof() (per the control_ comment above,
// the scale is recorded on chunk 0).
bool eof() const { return capacityScale() != 0; }
// Reads back the scaling factor stored by setCapacityScale(). With
// 16 scale bits it lives in the otherwise-unused tag slots
// tags_[12..13] (memcpy keeps the unaligned, aliasing-safe read);
// with 4 bits it is the low nibble of control_.
std::size_t capacityScale() const {
  if (kCapacityScaleBits != 4) {
    uint16_t scale;
    std::memcpy(&scale, &tags_[12], sizeof(scale));
    return scale;
  }
  return control_ & 0xf;
}
// Stores the chunk-count scaling factor. With 4 scale bits it goes in
// the low nibble of control_ (high nibble preserved); with 16 bits it
// is written into the unused tag slots tags_[12..13] via memcpy to
// avoid alignment/aliasing issues. scale must be nonzero so that
// eof()/capacityScale() can use 0 to mean "not the marked chunk", and
// must fit in kCapacityScaleBits.
void setCapacityScale(std::size_t scale) {
FOLLY_SAFE_DCHECK(
this != emptyInstance() && scale > 0 &&
scale < (std::size_t{1} << kCapacityScaleBits),
"");
if (kCapacityScaleBits == 4) {
control_ = static_cast<uint8_t>((control_ & ~0xf) | scale);
} else {
uint16_t v = static_cast<uint16_t>(scale);
std::memcpy(&tags_[12], &v, 2);
}
}
// Records the capacity scale on a chunk whose control_ is known to be
// zero (e.g. freshly cleared). The assume() lets the optimizer turn
// setCapacityScale()'s read-modify-write of control_ into a plain
// store.
void markEof(std::size_t scale) {
folly::assume(control_ == 0);
setCapacityScale(scale);
}
// Saturating overflow counter: once the count reaches 255 it sticks
// there forever — neither increments nor decrements apply (see the
// outboundOverflowCount_ member comment).
unsigned outboundOverflowCount() const { return outboundOverflowCount_; }
void incrOutboundOverflowCount() {
  if (outboundOverflowCount_ == 255) {
    return; // saturated
  }
  ++outboundOverflowCount_;
}
void decrOutboundOverflowCount() {
  if (outboundOverflowCount_ == 255) {
    return; // saturated counts are never decremented
  }
  --outboundOverflowCount_;
}
// Tag accessors. A tag of 0 means the slot is empty; occupied slots
// carry a tag in [0x80, 0xff] (top bit set), which is what the SIMD
// occupancy tests below key off of.
std::size_t tag(std::size_t index) const { return tags_[index]; }
// Places a tag into a currently-empty slot; never valid on the
// shared empty chunk.
void setTag(std::size_t index, std::size_t tag) {
FOLLY_SAFE_DCHECK(
this != emptyInstance() && tag >= 0x80 && tag <= 0xff, "");
FOLLY_SAFE_CHECK(tags_[index] == 0, "");
tags_[index] = static_cast<uint8_t>(tag);
}
// Empties a currently-occupied slot.
void clearTag(std::size_t index) {
FOLLY_SAFE_CHECK((tags_[index] & 0x80) != 0, "");
tags_[index] = 0;
}
#if FOLLY_NEON
////////
// Tag filtering using NEON intrinsics
// Returns an iterator over the slots whose tag equals needle.
// needle must be a valid occupied tag (top bit set).
SparseMaskIter tagMatchIter(std::size_t needle) const {
FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
uint8x16_t tagV = vld1q_u8(&tags_[0]);
auto needleV = vdupq_n_u8(static_cast<uint8_t>(needle));
auto eqV = vceqq_u8(tagV, needleV);
// get info from every byte into the bottom half of every uint16_t
// by shifting right 4, then round to get it into a 64-bit vector
uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(eqV), 4);
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
return SparseMaskIter(mask);
}
// Mask of occupied slots, derived from the top bit of each tag.
MaskType occupiedMask() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// signed shift extends top bit to all bits
auto occupiedV =
vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7));
uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(occupiedV), 4);
return vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
}
#else
////////
// Tag filtering using SSE2 intrinsics
// Reinterprets tags_ as the 16-byte vector that _mm_load_si128
// expects; the chunk's alignas guarantees the aligned load is legal.
TagVector const* tagVector() const {
return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
}
// Returns an iterator over the slots whose tag equals needle.
// needle must be a valid occupied tag (top bit set).
SparseMaskIter tagMatchIter(std::size_t needle) const {
FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
auto tagV = _mm_load_si128(tagVector());
// TRICKY! It may seem strange to have a std::size_t needle and narrow
// it at the last moment, rather than making HashPair::second be a
// uint8_t, but the latter choice sometimes leads to a performance
// problem.
//
// On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
// to multiple instructions. One of those is a MOVD of either 4 or
// 8 byte width. Only the bottom byte of that move actually affects
// the result, but if a 1-byte needle has been spilled then this will
// be a 4 byte load. GCC 5.5 has been observed to reload needle
// (or perhaps fuse a reload and part of a previous static_cast)
// needle using a MOVZX with a 1 byte load in parallel with the MOVD.
// This combination causes a failure of store-to-load forwarding,
// which has a big performance penalty (60 nanoseconds per find on
// a microbenchmark). Keeping needle >= 4 bytes avoids the problem
// and also happens to result in slightly more compact assembly.
auto needleV = _mm_set1_epi8(static_cast<uint8_t>(needle));
auto eqV = _mm_cmpeq_epi8(tagV, needleV);
auto mask = _mm_movemask_epi8(eqV) & kFullMask;
return SparseMaskIter{mask};
}
// Mask of occupied slots: movemask gathers the top bit of each tag
// byte (set iff occupied); kFullMask strips the padding tag lanes.
MaskType occupiedMask() const {
auto tagV = _mm_load_si128(tagVector());
return _mm_movemask_epi8(tagV) & kFullMask;
}
#endif
// Convenience wrappers that package occupiedMask() into the various
// iteration/query helpers.
DenseMaskIter occupiedIter() const {
  auto const mask = occupiedMask();
  return DenseMaskIter{&tags_[0], mask};
}
MaskRangeIter occupiedRangeIter() const {
  auto const mask = occupiedMask();
  return MaskRangeIter{mask};
}
LastOccupiedInMask lastOccupied() const {
  auto const mask = occupiedMask();
  return LastOccupiedInMask{mask};
}
FirstEmptyInMask firstEmpty() const {
  // XOR with kFullMask flips exactly the real slot positions, so the
  // result has bits set for the empty (real) slots only.
  auto const emptyMask = occupiedMask() ^ kFullMask;
  return FirstEmptyInMask{emptyMask};
}
// True iff slot index holds a live item. The DCHECK re-validates the
// tag invariant: 0 for empty, otherwise the top bit is set.
bool occupied(std::size_t index) const {
  auto const t = tags_[index];
  FOLLY_SAFE_DCHECK(t == 0 || (t & 0x80) != 0, "");
  return t != 0;
}
// Address of slot i's (possibly uninitialized) item storage. Safe to
// call whether or not the slot is occupied. The const_cast is
// deliberate: this const member hands out a mutable pointer into
// rawItems_ so callers can placement-construct there.
Item* itemAddr(std::size_t i) const {
return static_cast<Item*>(
const_cast<void*>(static_cast<void const*>(&rawItems_[i])));
}
// Reference to the live item in slot i (slot must be occupied).
// launder() is required because the Item was constructed inside
// rawItems_'s aligned storage rather than declared as an Item.
Item& item(std::size_t i) {
FOLLY_SAFE_DCHECK(this->occupied(i), "");
return *launder(itemAddr(i));
}
// Const counterpart of item().
Item const& citem(std::size_t i) const {
FOLLY_SAFE_DCHECK(this->occupied(i), "");
return *launder(itemAddr(i));
}
// Recovers the chunk that contains item, given item's slot index:
// step back past the preceding index slots and the rawItems_ member
// offset. The DCHECK verifies the round trip through itemAddr().
static F14Chunk& owner(Item& item, std::size_t index) {
auto rawAddr =
static_cast<uint8_t*>(static_cast<void*>(std::addressof(item))) -
offsetof(F14Chunk, rawItems_) - index * sizeof(Item);
auto chunkAddr = static_cast<F14Chunk*>(static_cast<void*>(rawAddr));
FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), "");
return *chunkAddr;
}
};