in folly/container/detail/F14Table.h [291:543]
struct alignas(kRequiredVectorAlignment) F14Chunk {
using Item = ItemType;
// For our 16 byte vector alignment (and assuming alignof(Item) >=
// 4) kCapacity of 14 is the most space efficient. Slightly smaller
// or larger capacities can help with cache alignment in a couple of
// cases without wasting too much space, but once the items are larger
// then we're unlikely to get much benefit anyway. The only case we
// optimize is using kCapacity of 12 for 4 byte items, which makes the
// chunk take exactly 1 cache line, and adding 16 bytes of padding for
// 16 byte items so that a chunk takes exactly 4 cache lines.
static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 12 : 14;
// Preferred fill level, 2 slots below full. NOTE(review): the exact
// use of this slack lives in the table code outside this chunk.
static constexpr unsigned kDesiredCapacity = kCapacity - 2;
// One extra item slot of padding for 16-byte items so the chunk is
// exactly 4 cache lines (see comment above).
static constexpr unsigned kAllocatedCapacity =
kCapacity + (sizeof(Item) == 16 ? 1 : 0);
// If kCapacity == 12 then we get 16 bits of capacityScale by using
// tag 12 and 13, otherwise we only get 4 bits of control_
static constexpr std::size_t kCapacityScaleBits = kCapacity == 12 ? 16 : 4;
static constexpr std::size_t kCapacityScaleShift = kCapacityScaleBits - 4;
// Mask covering the kCapacity real slot positions in the platform's
// tag-mask encoding; ANDed after movemask/vshrn to discard the lanes
// belonging to padding tags.
static constexpr MaskType kFullMask = FullMask<kCapacity>::value;
// Non-empty tags have their top bit set. tags_ array might be bigger
// than kCapacity to keep alignment of first item. When kCapacity is
// 12, tags_[12..13] are repurposed to hold the 16-bit capacityScale
// (see capacityScale()/setCapacityScale()).
std::array<uint8_t, 14> tags_;
// Bits 0..3 of chunk 0 record the scaling factor between the number of
// chunks and the max size without rehash. Bits 4-7 in any chunk are a
// 4-bit counter of the number of values in this chunk that were placed
// because they overflowed their desired chunk (hostedOverflowCount).
uint8_t control_;
// The number of values that would have been placed into this chunk if
// there had been space, including values that also overflowed previous
// full chunks. This value saturates; once it becomes 255 it no longer
// increases nor decreases.
uint8_t outboundOverflowCount_;
// Uninitialized, suitably aligned storage; Item objects are
// constructed in place here and accessed via itemAddr()/item().
std::array<aligned_storage_for_t<Item>, kAllocatedCapacity> rawItems_;
// Returns the shared all-empty sentinel chunk. kEmptyTagVector is
// only guaranteed alignof(max_align_t) alignment, so when the vector
// alignment requirement is stricter we bump the pointer up to the
// next kRequiredVectorAlignment boundary (the backing storage is
// over-sized to allow this).
static F14Chunk* emptyInstance() {
  auto addr = reinterpret_cast<char*>(&kEmptyTagVector);
  if (kRequiredVectorAlignment > alignof(max_align_t)) {
    auto const misalign =
        reinterpret_cast<uintptr_t>(addr) % kRequiredVectorAlignment;
    addr += kRequiredVectorAlignment - misalign;
  }
  auto chunk = reinterpret_cast<F14Chunk*>(addr);
  FOLLY_SAFE_DCHECK(
      (reinterpret_cast<uintptr_t>(chunk) % kRequiredVectorAlignment) == 0,
      "");
  return chunk;
}
// Marks every slot empty by zeroing the first 16 bytes of the chunk:
// tags_ (14 bytes) plus control_ and outboundOverflowCount_ (1 byte
// each). rawItems_ is deliberately left untouched.
void clear() {
// tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;
// gcc < 6 doesn't exploit chunk alignment to generate the optimal
// SSE clear from memset. This is very hot code, so it is worth
// handling that case specially.
#if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__
// this doesn't violate strict aliasing rules because __m128i is
// tagged as __may_alias__
auto* v = static_cast<__m128i*>(static_cast<void*>(&tags_[0]));
_mm_store_si128(v, _mm_setzero_si128());
#else
std::memset(&tags_[0], '\0', 16);
#endif
}
// Copies rhs's hosted-overflow nibble and outbound overflow counter
// into this chunk. The += preserves our low nibble (capacityScale
// bits); the DCHECK guarantees our high nibble is zero, so the
// addition cannot carry into the low nibble.
void copyOverflowInfoFrom(F14Chunk const& rhs) {
FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, "");
control_ += static_cast<uint8_t>(rhs.control_ & 0xf0);
outboundOverflowCount_ = rhs.outboundOverflowCount_;
}
// Count of values stored here that overflowed their desired chunk
// (the high nibble of control_).
unsigned hostedOverflowCount() const { return control_ >> 4; }
// Operands for adjustHostedOverflowCount(). Adding the "decrement"
// constant relies on uint8_t wraparound to subtract 0x10 from the
// high nibble without touching the low one.
static constexpr uint8_t kIncrHostedOverflowCount = 0x10;
static constexpr uint8_t kDecrHostedOverflowCount =
static_cast<uint8_t>(-0x10);
void adjustHostedOverflowCount(uint8_t op) { control_ += op; }
// True iff this chunk carries a nonzero capacityScale, i.e. it was
// the chunk tagged via markEof() (per the control_ comment above,
// the scale is recorded on chunk 0).
bool eof() const { return capacityScale() != 0; }
// Reads back the scaling factor stored by setCapacityScale(). With
// 16 scale bits it lives in the otherwise-unused tag slots
// tags_[12..13] (memcpy keeps the unaligned, aliasing-safe read);
// with 4 bits it is the low nibble of control_.
std::size_t capacityScale() const {
  if (kCapacityScaleBits != 4) {
    uint16_t scale;
    std::memcpy(&scale, &tags_[12], sizeof(scale));
    return scale;
  }
  return control_ & 0xf;
}
// Stores the chunk-count scaling factor. With 4 scale bits it goes in
// the low nibble of control_ (high nibble preserved); with 16 bits it
// is written into the unused tag slots tags_[12..13] via memcpy to
// avoid alignment/aliasing issues. scale must be nonzero so that
// eof()/capacityScale() can use 0 to mean "not the marked chunk", and
// must fit in kCapacityScaleBits.
void setCapacityScale(std::size_t scale) {
FOLLY_SAFE_DCHECK(
this != emptyInstance() && scale > 0 &&
scale < (std::size_t{1} << kCapacityScaleBits),
"");
if (kCapacityScaleBits == 4) {
control_ = static_cast<uint8_t>((control_ & ~0xf) | scale);
} else {
uint16_t v = static_cast<uint16_t>(scale);
std::memcpy(&tags_[12], &v, 2);
}
}
// Records the capacity scale on a chunk whose control_ is known to be
// zero (e.g. freshly cleared). The assume() lets the optimizer turn
// setCapacityScale()'s read-modify-write of control_ into a plain
// store.
void markEof(std::size_t scale) {
folly::assume(control_ == 0);
setCapacityScale(scale);
}
// Saturating overflow counter: once the count reaches 255 it sticks
// there forever — neither increments nor decrements apply (see the
// outboundOverflowCount_ member comment).
unsigned outboundOverflowCount() const { return outboundOverflowCount_; }
void incrOutboundOverflowCount() {
  if (outboundOverflowCount_ == 255) {
    return; // saturated
  }
  ++outboundOverflowCount_;
}
void decrOutboundOverflowCount() {
  if (outboundOverflowCount_ == 255) {
    return; // saturated counts are never decremented
  }
  --outboundOverflowCount_;
}
// Tag accessors. A tag of 0 means the slot is empty; occupied slots
// carry a tag in [0x80, 0xff] (top bit set), which is what the SIMD
// occupancy tests below key off of.
std::size_t tag(std::size_t index) const { return tags_[index]; }
// Places a tag into a currently-empty slot; never valid on the
// shared empty chunk.
void setTag(std::size_t index, std::size_t tag) {
FOLLY_SAFE_DCHECK(
this != emptyInstance() && tag >= 0x80 && tag <= 0xff, "");
FOLLY_SAFE_CHECK(tags_[index] == 0, "");
tags_[index] = static_cast<uint8_t>(tag);
}
// Empties a currently-occupied slot.
void clearTag(std::size_t index) {
FOLLY_SAFE_CHECK((tags_[index] & 0x80) != 0, "");
tags_[index] = 0;
}
#if FOLLY_NEON
////////
// Tag filtering using NEON intrinsics
// Returns an iterator over the slots whose tag equals needle.
// needle must be a valid occupied tag (top bit set).
SparseMaskIter tagMatchIter(std::size_t needle) const {
FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
uint8x16_t tagV = vld1q_u8(&tags_[0]);
auto needleV = vdupq_n_u8(static_cast<uint8_t>(needle));
auto eqV = vceqq_u8(tagV, needleV);
// get info from every byte into the bottom half of every uint16_t
// by shifting right 4, then round to get it into a 64-bit vector
uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(eqV), 4);
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
return SparseMaskIter(mask);
}
// Mask of occupied slots, derived from the top bit of each tag.
MaskType occupiedMask() const {
uint8x16_t tagV = vld1q_u8(&tags_[0]);
// signed shift extends top bit to all bits
auto occupiedV =
vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7));
uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(occupiedV), 4);
return vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
}
#else
////////
// Tag filtering using SSE2 intrinsics
// Reinterprets tags_ as the 16-byte vector that _mm_load_si128
// expects; the chunk's alignas guarantees the aligned load is legal.
TagVector const* tagVector() const {
return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
}
// Returns an iterator over the slots whose tag equals needle.
// needle must be a valid occupied tag (top bit set).
SparseMaskIter tagMatchIter(std::size_t needle) const {
FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
auto tagV = _mm_load_si128(tagVector());
// TRICKY! It may seem strange to have a std::size_t needle and narrow
// it at the last moment, rather than making HashPair::second be a
// uint8_t, but the latter choice sometimes leads to a performance
// problem.
//
// On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
// to multiple instructions. One of those is a MOVD of either 4 or
// 8 byte width. Only the bottom byte of that move actually affects
// the result, but if a 1-byte needle has been spilled then this will
// be a 4 byte load. GCC 5.5 has been observed to reload needle
// (or perhaps fuse a reload and part of a previous static_cast)
// needle using a MOVZX with a 1 byte load in parallel with the MOVD.
// This combination causes a failure of store-to-load forwarding,
// which has a big performance penalty (60 nanoseconds per find on
// a microbenchmark). Keeping needle >= 4 bytes avoids the problem
// and also happens to result in slightly more compact assembly.
auto needleV = _mm_set1_epi8(static_cast<uint8_t>(needle));
auto eqV = _mm_cmpeq_epi8(tagV, needleV);
auto mask = _mm_movemask_epi8(eqV) & kFullMask;
return SparseMaskIter{mask};
}
// Mask of occupied slots: movemask gathers the top bit of each tag
// byte (set iff occupied); kFullMask strips the padding tag lanes.
MaskType occupiedMask() const {
auto tagV = _mm_load_si128(tagVector());
return _mm_movemask_epi8(tagV) & kFullMask;
}
#endif
// Convenience wrappers that package occupiedMask() into the various
// iteration/query helpers.
DenseMaskIter occupiedIter() const {
  auto const mask = occupiedMask();
  return DenseMaskIter{&tags_[0], mask};
}
MaskRangeIter occupiedRangeIter() const {
  auto const mask = occupiedMask();
  return MaskRangeIter{mask};
}
LastOccupiedInMask lastOccupied() const {
  auto const mask = occupiedMask();
  return LastOccupiedInMask{mask};
}
FirstEmptyInMask firstEmpty() const {
  // XOR with kFullMask flips exactly the real slot positions, so the
  // result has bits set for the empty (real) slots only.
  auto const emptyMask = occupiedMask() ^ kFullMask;
  return FirstEmptyInMask{emptyMask};
}
// True iff slot index holds a live item. The DCHECK re-validates the
// tag invariant: 0 for empty, otherwise the top bit is set.
bool occupied(std::size_t index) const {
  auto const t = tags_[index];
  FOLLY_SAFE_DCHECK(t == 0 || (t & 0x80) != 0, "");
  return t != 0;
}
// Address of slot i's (possibly uninitialized) item storage. Safe to
// call whether or not the slot is occupied. The const_cast is
// deliberate: this const member hands out a mutable pointer into
// rawItems_ so callers can placement-construct there.
Item* itemAddr(std::size_t i) const {
return static_cast<Item*>(
const_cast<void*>(static_cast<void const*>(&rawItems_[i])));
}
// Reference to the live item in slot i (slot must be occupied).
// launder() is required because the Item was constructed inside
// rawItems_'s aligned storage rather than declared as an Item.
Item& item(std::size_t i) {
FOLLY_SAFE_DCHECK(this->occupied(i), "");
return *launder(itemAddr(i));
}
// Const counterpart of item().
Item const& citem(std::size_t i) const {
FOLLY_SAFE_DCHECK(this->occupied(i), "");
return *launder(itemAddr(i));
}
// Recovers the chunk that contains item, given item's slot index:
// step back past the preceding index slots and the rawItems_ member
// offset. The DCHECK verifies the round trip through itemAddr().
static F14Chunk& owner(Item& item, std::size_t index) {
auto rawAddr =
static_cast<uint8_t*>(static_cast<void*>(std::addressof(item))) -
offsetof(F14Chunk, rawItems_) - index * sizeof(Item);
auto chunkAddr = static_cast<F14Chunk*>(static_cast<void*>(rawAddr));
FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), "");
return *chunkAddr;
}
};