proxygen: proxygen/folly/folly/container/detail/F14Table.h Source File

Go to the documentation of this file.
 /*
  * Copyright 2017-present Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 #pragma once
 
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
 
 #include <array>
 #include <iterator>
 #include <limits>
 #include <memory>
 #include <new>
 #include <type_traits>
 #include <utility>
 #include <vector>
 
 #include <folly/Bits.h>
 #include <folly/ConstexprMath.h>
 #include <folly/Likely.h>
 #include <folly/Portability.h>
 #include <folly/ScopeGuard.h>
 #include <folly/Traits.h>
 #include <folly/functional/ApplyTuple.h>
 #include <folly/functional/Invoke.h>
 #include <folly/lang/Align.h>
 #include <folly/lang/Assume.h>
 #include <folly/lang/Exception.h>
 #include <folly/lang/Launder.h>
 #include <folly/lang/SafeAssert.h>
 #include <folly/portability/Builtins.h>
 
 #include <folly/container/HeterogeneousAccess.h>
 #include <folly/container/detail/F14Defaults.h>
 #include <folly/container/detail/F14IntrinsicsAvailability.h>
 
 #if FOLLY_ASAN_ENABLED && defined(FOLLY_TLS)
 #define FOLLY_F14_TLS_IF_ASAN FOLLY_TLS
 #else
 #define FOLLY_F14_TLS_IF_ASAN
 #endif
 
 #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
 
 #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE
 #if FOLLY_NEON
 #include <arm_acle.h> // __crc32cd
 #else
 #include <nmmintrin.h> // _mm_crc32_u64
 #endif
 #else
 #ifdef _WIN32
 #include <intrin.h> // _mul128 in fallback bit mixer
 #endif
 #endif
 
 #if FOLLY_NEON
 #include <arm_neon.h> // uint8x16t intrinsics
 #else // SSE2
 #include <immintrin.h> // __m128i intrinsics
 #include <xmmintrin.h> // _mm_prefetch
 #endif
 
 #endif
 
 namespace folly {
 
 struct F14TableStats {
   char const* policy;
   std::size_t size{0};
   std::size_t valueSize{0};
   std::size_t bucketCount{0};
   std::size_t chunkCount{0};
   std::vector<std::size_t> chunkOccupancyHisto;
   std::vector<std::size_t> chunkOutboundOverflowHisto;
   std::vector<std::size_t> chunkHostedOverflowHisto;
   std::vector<std::size_t> keyProbeLengthHisto;
   std::vector<std::size_t> missProbeLengthHisto;
   std::size_t totalBytes{0};
   std::size_t overheadBytes{0};
 
  private:
   template <typename T>
   static auto computeHelper(T const* m) -> decltype(m->computeStats()) {
     return m->computeStats();
   }
 
   static F14TableStats computeHelper(...) {
     return {};
   }
 
  public:
   template <typename T>
   static F14TableStats compute(T const& m) {
     return computeHelper(&m);
   }
 };
 
 namespace f14 {
 namespace detail {
 
 template <F14IntrinsicsMode>
 struct F14LinkCheck {};
 
 template <>
 struct F14LinkCheck<getF14IntrinsicsMode()> {
   // The purpose of this method is to trigger a link failure if
   // compilation flags vary across compilation units.  The definition
   // is in F14Table.cpp, so only one of F14LinkCheck<None>::check,
   // F14LinkCheck<Simd>::check, or F14LinkCheck<SimdAndCrc>::check will
   // be available at link time.
   //
   // To cause a link failure the function must be invoked in code that
   // is not optimized away, so we call it on a couple of cold paths
   // (exception handling paths in copy construction and rehash).  LTO may
   // remove it entirely, but that's fine.
   static void check() noexcept;
 };
 
 #if defined(_LIBCPP_VERSION)
 
 template <typename K, typename V, typename H>
 struct StdNodeReplica {
   void* next;
   std::size_t hash;
   V value;
 };
 
 #else
 
 template <typename H>
 struct StdIsFastHash : std::true_type {};
 template <>
 struct StdIsFastHash<std::hash<long double>> : std::false_type {};
 template <typename... Args>
 struct StdIsFastHash<std::hash<std::basic_string<Args...>>> : std::false_type {
 };
 
 // TODO: add specialization for std::basic_string_view
 
 // mimic internal node of unordered containers in STL to estimate the size
 template <typename K, typename V, typename H, typename Enable = void>
 struct StdNodeReplica {
   void* next;
   V value;
 };
 template <typename K, typename V, typename H>
 struct StdNodeReplica<
     K,
     V,
     H,
     std::enable_if_t<
         !StdIsFastHash<H>::value || !is_nothrow_invocable<H, K>::value>> {
   void* next;
   V value;
   std::size_t hash;
 };
 
 #endif
 
 } // namespace detail
 } // namespace f14
 
 #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
 namespace f14 {
 namespace detail {
 template <typename Policy>
 class F14Table;
 } // namespace detail
 } // namespace f14
 
 class F14HashToken final {
  public:
   F14HashToken() = default;
 
  private:
   using HashPair = std::pair<std::size_t, std::size_t>;
 
   explicit F14HashToken(HashPair hp) : hp_(hp) {}
   explicit operator HashPair() const {
     return hp_;
   }
 
   HashPair hp_;
 
   template <typename Policy>
   friend class f14::detail::F14Table;
 };
 
 namespace f14 {
 namespace detail {
 
 template <typename Arg, typename Default>
 using VoidDefault =
     std::conditional_t<std::is_same<Arg, Default>::value, void, Arg>;
 
 template <typename Arg, typename Default>
 using Defaulted =
     typename std::conditional_t<std::is_same<Arg, void>::value, Default, Arg>;
 
 template <
     typename TableKey,
     typename Hasher,
     typename KeyEqual,
     typename ArgKey,
     typename Void = void>
 struct EligibleForHeterogeneousFind : std::false_type {};
 
 template <
     typename TableKey,
     typename Hasher,
     typename KeyEqual,
     typename ArgKey>
 struct EligibleForHeterogeneousFind<
     TableKey,
     Hasher,
     KeyEqual,
     ArgKey,
     void_t<typename Hasher::is_transparent, typename KeyEqual::is_transparent>>
     : std::true_type {};
 
 template <
     typename TableKey,
     typename Hasher,
     typename KeyEqual,
     typename ArgKey>
 using EligibleForHeterogeneousInsert = Conjunction<
     EligibleForHeterogeneousFind<TableKey, Hasher, KeyEqual, ArgKey>,
     std::is_constructible<TableKey, ArgKey>>;
 
 template <
     typename TableKey,
     typename Hasher,
     typename KeyEqual,
     typename KeyArg0OrBool,
     typename... KeyArgs>
 using KeyTypeForEmplaceHelper = std::conditional_t<
     sizeof...(KeyArgs) == 1 &&
         (std::is_same<remove_cvref_t<KeyArg0OrBool>, TableKey>::value ||
          EligibleForHeterogeneousFind<
              TableKey,
              Hasher,
              KeyEqual,
              KeyArg0OrBool>::value),
     KeyArg0OrBool&&,
     TableKey>;
 
 template <
     typename TableKey,
     typename Hasher,
     typename KeyEqual,
     typename... KeyArgs>
 using KeyTypeForEmplace = KeyTypeForEmplaceHelper<
     TableKey,
     Hasher,
     KeyEqual,
     std::tuple_element_t<0, std::tuple<KeyArgs..., bool>>,
     KeyArgs...>;
 
 
 template <typename T>
 FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) {
 #ifndef _WIN32
   __builtin_prefetch(static_cast<void const*>(ptr));
 #elif FOLLY_NEON
   __prefetch(static_cast<void const*>(ptr));
 #else
   _mm_prefetch(
       static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0);
 #endif
 }
 
 template <typename T>
 FOLLY_ALWAYS_INLINE static unsigned findFirstSetNonZero(T mask) {
   assume(mask != 0);
   if (sizeof(mask) == sizeof(unsigned)) {
     return __builtin_ctz(static_cast<unsigned>(mask));
   } else {
     return __builtin_ctzll(mask);
   }
 }
 
 #if FOLLY_NEON
 using TagVector = uint8x16_t;
 
 using MaskType = uint64_t;
 
 constexpr unsigned kMaskSpacing = 4;
 #else // SSE2
 using TagVector = __m128i;
 
 using MaskType = uint32_t;
 
 constexpr unsigned kMaskSpacing = 1;
 #endif
 
 // We could use unaligned loads to relax this requirement, but that
 // would be both a performance penalty and require a bulkier packed
 // ItemIter format
 constexpr std::size_t kRequiredVectorAlignment =
     constexpr_max(std::size_t{16}, alignof(max_align_t));
 
 using EmptyTagVectorType = std::aligned_storage_t<
     sizeof(TagVector) + kRequiredVectorAlignment,
     alignof(max_align_t)>;
 
 extern EmptyTagVectorType kEmptyTagVector;
 
 extern FOLLY_F14_TLS_IF_ASAN std::size_t asanPendingSafeInserts;
 extern FOLLY_F14_TLS_IF_ASAN std::size_t asanRehashState;
 
 template <unsigned BitCount>
 struct FullMask {
   static constexpr MaskType value =
       (FullMask<BitCount - 1>::value << kMaskSpacing) + 1;
 };
 
 template <>
 struct FullMask<1> : std::integral_constant<MaskType, 1> {};
 
 #if FOLLY_ARM
 // Mask iteration is different for ARM because that is the only platform
 // for which the mask is bigger than a register.
 
 // Iterates a mask, optimized for the case that only a few bits are set
 class SparseMaskIter {
   static_assert(kMaskSpacing == 4, "");
 
   uint32_t interleavedMask_;
 
  public:
   explicit SparseMaskIter(MaskType mask)
       : interleavedMask_{static_cast<uint32_t>(((mask >> 32) << 2) | mask)} {}
 
   bool hasNext() {
     return interleavedMask_ != 0;
   }
 
   unsigned next() {
     FOLLY_SAFE_DCHECK(hasNext(), "");
     unsigned i = findFirstSetNonZero(interleavedMask_);
     interleavedMask_ &= (interleavedMask_ - 1);
     return ((i >> 2) | (i << 2)) & 0xf;
   }
 };
 
 // Iterates a mask, optimized for the case that most bits are set
 class DenseMaskIter {
   static_assert(kMaskSpacing == 4, "");
 
   std::size_t count_;
   unsigned index_;
   uint8_t const* tags_;
 
  public:
   explicit DenseMaskIter(uint8_t const* tags, MaskType mask) {
     if (mask == 0) {
       count_ = 0;
     } else {
       count_ = popcount(static_cast<uint32_t>(((mask >> 32) << 2) | mask));
       if (LIKELY((mask & 1) != 0)) {
         index_ = 0;
       } else {
         index_ = findFirstSetNonZero(mask) / kMaskSpacing;
       }
       tags_ = tags;
     }
   }
 
   bool hasNext() {
     return count_ > 0;
   }
 
   unsigned next() {
     auto rv = index_;
     --count_;
     if (count_ > 0) {
       do {
         ++index_;
       } while ((tags_[index_] & 0x80) == 0);
     }
     FOLLY_SAFE_DCHECK(index_ < 16, "");
     return rv;
   }
 };
 
 #else
 // Iterates a mask, optimized for the case that only a few bits are set
 class SparseMaskIter {
   MaskType mask_;
 
  public:
   explicit SparseMaskIter(MaskType mask) : mask_{mask} {}
 
   bool hasNext() {
     return mask_ != 0;
   }
 
   unsigned next() {
     FOLLY_SAFE_DCHECK(hasNext(), "");
     unsigned i = findFirstSetNonZero(mask_);
     mask_ &= (mask_ - 1);
     return i / kMaskSpacing;
   }
 };
 
 // Iterates a mask, optimized for the case that most bits are set
 class DenseMaskIter {
   MaskType mask_;
   unsigned index_{0};
 
  public:
   explicit DenseMaskIter(uint8_t const*, MaskType mask) : mask_{mask} {}
 
   bool hasNext() {
     return mask_ != 0;
   }
 
   unsigned next() {
     FOLLY_SAFE_DCHECK(hasNext(), "");
     if (LIKELY((mask_ & 1) != 0)) {
       mask_ >>= kMaskSpacing;
       return index_++;
     } else {
       unsigned s = findFirstSetNonZero(mask_);
       unsigned rv = index_ + (s / kMaskSpacing);
       mask_ >>= (s + kMaskSpacing);
       index_ = rv + 1;
       return rv;
     }
   }
 };
 #endif
 
 // Iterates a mask, returning pairs of [begin,end) index covering blocks
 // of set bits
 class MaskRangeIter {
   MaskType mask_;
   unsigned shift_{0};
 
  public:
   explicit MaskRangeIter(MaskType mask) {
     // If kMaskSpacing is > 1 then there will be empty bits even for
     // contiguous ranges.  Fill them in.
     mask_ = mask * ((1 << kMaskSpacing) - 1);
   }
 
   bool hasNext() {
     return mask_ != 0;
   }
 
   std::pair<unsigned, unsigned> next() {
     FOLLY_SAFE_DCHECK(hasNext(), "");
     auto s = shift_;
     unsigned b = findFirstSetNonZero(mask_);
     unsigned e = findFirstSetNonZero(~(mask_ | (mask_ - 1)));
     mask_ >>= e;
     shift_ = s + e;
     return std::make_pair((s + b) / kMaskSpacing, (s + e) / kMaskSpacing);
   }
 };
 
 // Holds the result of an index query that has an optional result,
 // interpreting a mask of 0 to be the empty answer and the index of the
 // last set bit to be the non-empty answer
 class LastOccupiedInMask {
   MaskType mask_;
 
  public:
   explicit LastOccupiedInMask(MaskType mask) : mask_{mask} {}
 
   bool hasIndex() const {
     return mask_ != 0;
   }
 
   unsigned index() const {
     assume(mask_ != 0);
     return (findLastSet(mask_) - 1) / kMaskSpacing;
   }
 };
 
 // Holds the result of an index query that has an optional result,
 // interpreting a mask of 0 to be the empty answer and the index of the
 // first set bit to be the non-empty answer
 class FirstEmptyInMask {
   MaskType mask_;
 
  public:
   explicit FirstEmptyInMask(MaskType mask) : mask_{mask} {}
 
   bool hasIndex() const {
     return mask_ != 0;
   }
 
   unsigned index() const {
     FOLLY_SAFE_DCHECK(mask_ != 0, "");
     return findFirstSetNonZero(mask_) / kMaskSpacing;
   }
 };
 
 template <typename ItemType>
 struct alignas(kRequiredVectorAlignment) F14Chunk {
   using Item = ItemType;
 
   // For our 16 byte vector alignment (and assuming alignof(Item) >=
   // 4) kCapacity of 14 is the most space efficient.  Slightly smaller
   // or larger capacities can help with cache alignment in a couple of
   // cases without wasting too much space, but once the items are larger
   // then we're unlikely to get much benefit anyway.  The only case we
   // optimize is using kCapacity of 12 for 4 byte items, which makes the
   // chunk take exactly 1 cache line, and adding 16 bytes of padding for
   // 16 byte items so that a chunk takes exactly 4 cache lines.
   static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 12 : 14;
 
   static constexpr unsigned kDesiredCapacity = kCapacity - 2;
 
   static constexpr unsigned kAllocatedCapacity =
       kCapacity + (sizeof(Item) == 16 ? 1 : 0);
 
   static constexpr MaskType kFullMask = FullMask<kCapacity>::value;
 
   // Non-empty tags have their top bit set.  tags_ array might be bigger
   // than kCapacity to keep alignment of first item.
   std::array<uint8_t, 14> tags_;
 
   // Bits 0..3 record the actual capacity of the chunk if this is chunk
   // zero, or hold 0000 for other chunks.  Bits 4-7 are a 4-bit counter
   // of the number of values in this chunk that were placed because they
   // overflowed their desired chunk (hostedOverflowCount).
   uint8_t control_;
 
   // The number of values that would have been placed into this chunk if
   // there had been space, including values that also overflowed previous
   // full chunks.  This value saturates; once it becomes 255 it no longer
   // increases nor decreases.
   uint8_t outboundOverflowCount_;
 
   std::array<
       std::aligned_storage_t<sizeof(Item), alignof(Item)>,
       kAllocatedCapacity>
       rawItems_;
 
   static F14Chunk* emptyInstance() {
     auto raw = reinterpret_cast<char*>(&kEmptyTagVector);
     if (kRequiredVectorAlignment > alignof(max_align_t)) {
       auto delta = kRequiredVectorAlignment -
           (reinterpret_cast<uintptr_t>(raw) % kRequiredVectorAlignment);
       raw += delta;
     }
     auto rv = reinterpret_cast<F14Chunk*>(raw);
     FOLLY_SAFE_DCHECK(
         (reinterpret_cast<uintptr_t>(rv) % kRequiredVectorAlignment) == 0, "");
     return rv;
   }
 
   void clear() {
     // tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;
 
     // gcc < 6 doesn't exploit chunk alignment to generate the optimal
     // SSE clear from memset.  This is very hot code, so it is worth
     // handling that case specially.
 #if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__
     // this doesn't violate strict aliasing rules because __m128i is
     // tagged as __may_alias__
     auto* v = static_cast<__m128i*>(static_cast<void*>(&tags_[0]));
     _mm_store_si128(v, _mm_setzero_si128());
 #else
     std::memset(&tags_[0], '\0', 16);
 #endif
   }
 
   void copyOverflowInfoFrom(F14Chunk const& rhs) {
     FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, "");
     control_ += static_cast<uint8_t>(rhs.control_ & 0xf0);
     outboundOverflowCount_ = rhs.outboundOverflowCount_;
   }
 
   unsigned hostedOverflowCount() const {
     return control_ >> 4;
   }
 
   static constexpr uint8_t kIncrHostedOverflowCount = 0x10;
   static constexpr uint8_t kDecrHostedOverflowCount =
       static_cast<uint8_t>(-0x10);
 
   void adjustHostedOverflowCount(uint8_t op) {
     control_ += op;
   }
 
   bool eof() const {
     return (control_ & 0xf) != 0;
   }
 
   std::size_t chunk0Capacity() const {
     return control_ & 0xf;
   }
 
   void markEof(std::size_t c0c) {
     FOLLY_SAFE_DCHECK(
         this != emptyInstance() && control_ == 0 && c0c > 0 && c0c <= 0xf &&
             c0c <= kCapacity,
         "");
     control_ = static_cast<uint8_t>(c0c);
   }
 
   unsigned outboundOverflowCount() const {
     return outboundOverflowCount_;
   }
 
   void incrOutboundOverflowCount() {
     if (outboundOverflowCount_ != 255) {
       ++outboundOverflowCount_;
     }
   }
 
   void decrOutboundOverflowCount() {
     if (outboundOverflowCount_ != 255) {
       --outboundOverflowCount_;
     }
   }
 
   std::size_t tag(std::size_t index) const {
     return tags_[index];
   }
 
   void setTag(std::size_t index, std::size_t tag) {
     FOLLY_SAFE_DCHECK(
         this != emptyInstance() && tag >= 0x80 && tag <= 0xff, "");
     tags_[index] = static_cast<uint8_t>(tag);
   }
 
   void clearTag(std::size_t index) {
     tags_[index] = 0;
   }
 
 #if FOLLY_NEON
   // Tag filtering using NEON intrinsics
 
   SparseMaskIter tagMatchIter(std::size_t needle) const {
     FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
     uint8x16_t tagV = vld1q_u8(&tags_[0]);
     auto needleV = vdupq_n_u8(static_cast<uint8_t>(needle));
     auto eqV = vceqq_u8(tagV, needleV);
     // get info from every byte into the bottom half of every uint16_t
     // by shifting right 4, then round to get it into a 64-bit vector
     uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(eqV), 4);
     uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
     return SparseMaskIter(mask);
   }
 
   MaskType occupiedMask() const {
     uint8x16_t tagV = vld1q_u8(&tags_[0]);
     // signed shift extends top bit to all bits
     auto occupiedV =
         vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7));
     uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(occupiedV), 4);
     return vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
   }
 #else
   // Tag filtering using SSE2 intrinsics
 
   TagVector const* tagVector() const {
     return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
   }
 
   SparseMaskIter tagMatchIter(std::size_t needle) const {
     FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
     auto tagV = _mm_load_si128(tagVector());
 
     // TRICKY!  It may seem strange to have a std::size_t needle and narrow
     // it at the last moment, rather than making HashPair::second be a
     // uint8_t, but the latter choice sometimes leads to a performance
     // problem.
     //
     // On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
     // to multiple instructions.  One of those is a MOVD of either 4 or
     // 8 byte width.  Only the bottom byte of that move actually affects
     // the result, but if a 1-byte needle has been spilled then this will
     // be a 4 byte load.  GCC 5.5 has been observed to reload needle
     // (or perhaps fuse a reload and part of a previous static_cast)
     // needle using a MOVZX with a 1 byte load in parallel with the MOVD.
     // This combination causes a failure of store-to-load forwarding,
     // which has a big performance penalty (60 nanoseconds per find on
     // a microbenchmark).  Keeping needle >= 4 bytes avoids the problem
     // and also happens to result in slightly more compact assembly.
     auto needleV = _mm_set1_epi8(static_cast<uint8_t>(needle));
     auto eqV = _mm_cmpeq_epi8(tagV, needleV);
     auto mask = _mm_movemask_epi8(eqV) & kFullMask;
     return SparseMaskIter{mask};
   }
 
   MaskType occupiedMask() const {
     auto tagV = _mm_load_si128(tagVector());
     return _mm_movemask_epi8(tagV) & kFullMask;
   }
 #endif
 
   DenseMaskIter occupiedIter() const {
     return DenseMaskIter{&tags_[0], occupiedMask()};
   }
 
   MaskRangeIter occupiedRangeIter() const {
     return MaskRangeIter{occupiedMask()};
   }
 
   LastOccupiedInMask lastOccupied() const {
     return LastOccupiedInMask{occupiedMask()};
   }
 
   FirstEmptyInMask firstEmpty() const {
     return FirstEmptyInMask{occupiedMask() ^ kFullMask};
   }
 
   bool occupied(std::size_t index) const {
     FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, "");
     return tags_[index] != 0;
   }
 
   Item* itemAddr(std::size_t i) const {
     return static_cast<Item*>(
         const_cast<void*>(static_cast<void const*>(&rawItems_[i])));
   }
 
   Item& item(std::size_t i) {
     FOLLY_SAFE_DCHECK(this->occupied(i), "");
     return *launder(itemAddr(i));
   }
 
   Item const& citem(std::size_t i) const {
     FOLLY_SAFE_DCHECK(this->occupied(i), "");
     return *launder(itemAddr(i));
   }
 
   static F14Chunk& owner(Item& item, std::size_t index) {
     auto rawAddr =
         static_cast<uint8_t*>(static_cast<void*>(std::addressof(item))) -
         offsetof(F14Chunk, rawItems_) - index * sizeof(Item);
     auto chunkAddr = static_cast<F14Chunk*>(static_cast<void*>(rawAddr));
     FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), "");
     return *chunkAddr;
   }
 };
 
 
 // PackedChunkItemPtr points to an Item in an F14Chunk, allowing both the
 // Item& and its index to be recovered.  It sorts by the address of the
 // item, and it only works for items that are in a properly-aligned chunk.
 
 // generic form, not actually packed
 template <typename Ptr>
 class PackedChunkItemPtr {
  public:
   PackedChunkItemPtr(Ptr p, std::size_t i) noexcept : ptr_{p}, index_{i} {
     FOLLY_SAFE_DCHECK(ptr_ != nullptr || index_ == 0, "");
   }
 
   Ptr ptr() const {
     return ptr_;
   }
 
   std::size_t index() const {
     return index_;
   }
 
   bool operator<(PackedChunkItemPtr const& rhs) const {
     FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, "");
     return ptr_ < rhs.ptr_;
   }
 
   bool operator==(PackedChunkItemPtr const& rhs) const {
     FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, "");
     return ptr_ == rhs.ptr_;
   }
 
   bool operator!=(PackedChunkItemPtr const& rhs) const {
     return !(*this == rhs);
   }
 
  private:
   Ptr ptr_;
   std::size_t index_;
 };
 
 // Bare pointer form, packed into a uintptr_t.  Uses only bits wasted by
 // alignment, so it works on 32-bit and 64-bit platforms
 template <typename T>
 class PackedChunkItemPtr<T*> {
   static_assert((alignof(F14Chunk<T>) % 16) == 0, "");
 
   // Chunks are 16-byte aligned, so we can maintain a packed pointer to a
   // chunk item by packing the 4-bit item index into the least significant
   // bits of a pointer to the chunk itself.  This makes ItemIter::pack
   // more expensive, however, since it has to compute the chunk address.
   //
   // Chunk items have varying alignment constraints, so it would seem
   // to be that we can't do a similar trick while using only bit masking
   // operations on the Item* itself.  It happens to be, however, that if
   // sizeof(Item) is not a multiple of 16 then we can recover a portion
   // of the index bits from the knowledge that the Item-s are stored in
   // an array that is itself 16-byte aligned.
   //
   // If kAlignBits is the number of trailing zero bits in sizeof(Item)
   // (up to 4), then we can borrow those bits to store kAlignBits of the
   // index directly.  We can recover (4 - kAlignBits) bits of the index
   // from the item pointer itself, by defining/observing that
   //
   // A = kAlignBits                  (A <= 4)
   //
   // S = (sizeof(Item) % 16) >> A    (shifted-away bits are all zero)
   //
   // R = (itemPtr % 16) >> A         (shifted-away bits are all zero)
   //
   // M = 16 >> A
   //
   // itemPtr % 16   = (index * sizeof(Item)) % 16
   //
   // (R * 2^A) % 16 = (index * (sizeof(Item) % 16)) % 16
   //
   // (R * 2^A) % 16 = (index * 2^A * S) % 16
   //
   // R % M          = (index * S) % M
   //
   // S is relatively prime with M, so a multiplicative inverse is easy
   // to compute
   //
   // Sinv = S^(M - 1) % M
   //
   // (R * Sinv) % M = index % M
   //
   // This lets us recover the bottom bits of the index.  When sizeof(T)
   // is 8-byte aligned kSizeInverse will always be 1.  When sizeof(T)
   // is 4-byte aligned kSizeInverse will be either 1 or 3.
 
   // returns pow(x, y) % m
   static constexpr uintptr_t powerMod(uintptr_t x, uintptr_t y, uintptr_t m) {
     return y == 0 ? 1 : (x * powerMod(x, y - 1, m)) % m;
   }
 
   static constexpr uintptr_t kIndexBits = 4;
   static constexpr uintptr_t kIndexMask = (uintptr_t{1} << kIndexBits) - 1;
 
   static constexpr uintptr_t kAlignBits = constexpr_min(
       uintptr_t{4},
       constexpr_find_first_set(uintptr_t{sizeof(T)}) - 1);
 
   static constexpr uintptr_t kAlignMask = (uintptr_t{1} << kAlignBits) - 1;
 
   static constexpr uintptr_t kModulus = uintptr_t{1}
       << (kIndexBits - kAlignBits);
   static constexpr uintptr_t kSizeInverse =
       powerMod(sizeof(T) >> kAlignBits, kModulus - 1, kModulus);
 
  public:
   PackedChunkItemPtr(T* p, std::size_t i) noexcept {
     uintptr_t encoded = i >> (kIndexBits - kAlignBits);
     assume((encoded & ~kAlignMask) == 0);
     raw_ = reinterpret_cast<uintptr_t>(p) | encoded;
     FOLLY_SAFE_DCHECK(p == ptr(), "");
     FOLLY_SAFE_DCHECK(i == index(), "");
   }
 
   T* ptr() const {
     return reinterpret_cast<T*>(raw_ & ~kAlignMask);
   }
 
   std::size_t index() const {
     auto encoded = (raw_ & kAlignMask) << (kIndexBits - kAlignBits);
     auto deduced =
         ((raw_ >> kAlignBits) * kSizeInverse) & (kIndexMask >> kAlignBits);
     return encoded | deduced;
   }
 
   bool operator<(PackedChunkItemPtr const& rhs) const {
     return raw_ < rhs.raw_;
   }
   bool operator==(PackedChunkItemPtr const& rhs) const {
     return raw_ == rhs.raw_;
   }
   bool operator!=(PackedChunkItemPtr const& rhs) const {
     return !(*this == rhs);
   }
 
  private:
   uintptr_t raw_;
 };
 
 template <typename ChunkPtr>
 class F14ItemIter {
  private:
   using Chunk = typename std::pointer_traits<ChunkPtr>::element_type;
 
  public:
   using Item = typename Chunk::Item;
   using ItemPtr = typename std::pointer_traits<ChunkPtr>::template rebind<Item>;
   using ItemConstPtr =
       typename std::pointer_traits<ChunkPtr>::template rebind<Item const>;
 
   using Packed = PackedChunkItemPtr<ItemPtr>;
 
 
   F14ItemIter() noexcept : itemPtr_{nullptr}, index_{0} {}
 
   // default copy and move constructors and assignment operators are correct
 
   explicit F14ItemIter(Packed const& packed)
       : itemPtr_{packed.ptr()}, index_{packed.index()} {}
 
   F14ItemIter(ChunkPtr chunk, std::size_t index)
       : itemPtr_{std::pointer_traits<ItemPtr>::pointer_to(chunk->item(index))},
         index_{index} {
     FOLLY_SAFE_DCHECK(index < Chunk::kCapacity, "");
     assume(
         std::pointer_traits<ItemPtr>::pointer_to(chunk->item(index)) !=
         nullptr);
     assume(itemPtr_ != nullptr);
   }
 
   FOLLY_ALWAYS_INLINE void advanceImpl(bool checkEof, bool likelyDead) {
     auto c = chunk();
 
     // common case is packed entries
     while (index_ > 0) {
       --index_;
       --itemPtr_;
       if (LIKELY(c->occupied(index_))) {
         return;
       }
     }
 
     // It's fairly common for an iterator to be advanced and then become
     // dead, for example in the return value from erase(iter) or in
     // the last step of a loop.  We'd like to make sure that the entire
     // advance() method can be eliminated by the compiler's dead code
     // elimination pass.  To do that it must eliminate the loops, which
     // requires it to prove that they have no side effects.  It's easy
     // to show that there are no escaping stores, but at the moment
     // compilers also consider an infinite loop to be a side effect.
     // (There are parts of the standard that would allow them to treat
     // this as undefined behavior, but at the moment they don't exploit
     // those clauses.)
     //
     // The following loop should really be a while loop, which would
     // save a register, some instructions, and a conditional branch,
     // but by writing it as a for loop the compiler can prove to itself
     // that it will eventually terminate.  (No matter that even if the
     // loop executed in a single cycle it would take about 200 years to
     // run all 2^64 iterations.)
     //
     // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82776 has the bug we
     // filed about the issue.  while (true) {
     for (std::size_t i = 1; !likelyDead || i != 0; ++i) {
       if (checkEof) {
         // exhausted the current chunk
         if (UNLIKELY(c->eof())) {
           FOLLY_SAFE_DCHECK(index_ == 0, "");
           itemPtr_ = nullptr;
           return;
         }
       } else {
         FOLLY_SAFE_DCHECK(!c->eof(), "");
       }
       --c;
       auto last = c->lastOccupied();
       if (checkEof && !likelyDead) {
         prefetchAddr(&*c - 1);
       }
       if (LIKELY(last.hasIndex())) {
         index_ = last.index();
         itemPtr_ = std::pointer_traits<ItemPtr>::pointer_to(c->item(index_));
         return;
       }
     }
   }
 
   void precheckedAdvance() {
     advanceImpl(false, false);
   }
 
   FOLLY_ALWAYS_INLINE void advance() {
     advanceImpl(true, false);
   }
 
   FOLLY_ALWAYS_INLINE void advanceLikelyDead() {
     advanceImpl(true, true);
   }
 
   ChunkPtr chunk() const {
     return std::pointer_traits<ChunkPtr>::pointer_to(
         Chunk::owner(*itemPtr_, index_));
   }
 
   std::size_t index() const {
     return index_;
   }
 
   Item* itemAddr() const {
     return std::addressof(*itemPtr_);
   }
   Item& item() const {
     return *itemPtr_;
   }
   Item const& citem() const {
     return *itemPtr_;
   }
 
   bool atEnd() const {
     return itemPtr_ == nullptr;
   }
 
   Packed pack() const {
     return Packed{itemPtr_, static_cast<uint8_t>(index_)};
   }
 
   bool operator==(F14ItemIter const& rhs) const {
     // this form makes iter == end() into a single null check after inlining
     // and constant propagation
     return itemPtr_ == rhs.itemPtr_;
   }
 
   bool operator!=(F14ItemIter const& rhs) const {
     return !(*this == rhs);
   }
 
  private:
   ItemPtr itemPtr_;
   std::size_t index_;
 };
 
 
 template <typename SizeType, typename ItemIter, bool EnablePackedItemIter>
 struct SizeAndPackedBegin {
   SizeType size_{0};
 
  private:
   typename ItemIter::Packed packedBegin_{ItemIter{}.pack()};
 
  public:
   typename ItemIter::Packed& packedBegin() {
     return packedBegin_;
   }
 
   typename ItemIter::Packed const& packedBegin() const {
     return packedBegin_;
   }
 };
 
 template <typename SizeType, typename ItemIter>
 struct SizeAndPackedBegin<SizeType, ItemIter, false> {
   SizeType size_{0};
 
   [[noreturn]] typename ItemIter::Packed& packedBegin() {
     assume_unreachable();
   }
 
   [[noreturn]] typename ItemIter::Packed const& packedBegin() const {
     assume_unreachable();
   }
 };
 
 template <typename Policy>
 class F14Table : public Policy {
  public:
   using Item = typename Policy::Item;
 
   using value_type = typename Policy::Value;
   using allocator_type = typename Policy::Alloc;
 
  private:
   using Alloc = typename Policy::Alloc;
   using AllocTraits = typename Policy::AllocTraits;
   using Hasher = typename Policy::Hasher;
   using InternalSizeType = typename Policy::InternalSizeType;
   using KeyEqual = typename Policy::KeyEqual;
 
   using Policy::kAllocIsAlwaysEqual;
   using Policy::kDefaultConstructIsNoexcept;
   using Policy::kEnableItemIteration;
   using Policy::kSwapIsNoexcept;
 
   using Policy::destroyItemOnClear;
   using Policy::isAvalanchingHasher;
   using Policy::prefetchBeforeCopy;
   using Policy::prefetchBeforeDestroy;
   using Policy::prefetchBeforeRehash;
 
   using ByteAlloc = typename AllocTraits::template rebind_alloc<uint8_t>;
   using BytePtr = typename std::allocator_traits<ByteAlloc>::pointer;
 
   using Chunk = F14Chunk<Item>;
   using ChunkPtr =
       typename std::pointer_traits<BytePtr>::template rebind<Chunk>;
 
   using HashPair = typename F14HashToken::HashPair;
 
  public:
   using ItemIter = F14ItemIter<ChunkPtr>;
 
  private:
 
   ChunkPtr chunks_{Chunk::emptyInstance()};
   InternalSizeType chunkMask_{0};
   SizeAndPackedBegin<InternalSizeType, ItemIter, kEnableItemIteration>
       sizeAndPackedBegin_;
 
 
   void swapContents(F14Table& rhs) noexcept {
     using std::swap;
     swap(chunks_, rhs.chunks_);
     swap(chunkMask_, rhs.chunkMask_);
     swap(sizeAndPackedBegin_.size_, rhs.sizeAndPackedBegin_.size_);
     if (kEnableItemIteration) {
       swap(
           sizeAndPackedBegin_.packedBegin(),
           rhs.sizeAndPackedBegin_.packedBegin());
     }
   }
 
  public:
   F14Table(
       std::size_t initialCapacity,
       Hasher const& hasher,
       KeyEqual const& keyEqual,
       Alloc const& alloc)
       : Policy{hasher, keyEqual, alloc} {
     if (initialCapacity > 0) {
       reserve(initialCapacity);
     }
   }
 
   F14Table(F14Table const& rhs) : Policy{rhs} {
     buildFromF14Table(rhs);
   }
 
   F14Table(F14Table const& rhs, Alloc const& alloc) : Policy{rhs, alloc} {
     buildFromF14Table(rhs);
   }
 
   F14Table(F14Table&& rhs) noexcept(
       std::is_nothrow_move_constructible<Hasher>::value&&
           std::is_nothrow_move_constructible<KeyEqual>::value&&
               std::is_nothrow_move_constructible<Alloc>::value)
       : Policy{std::move(rhs)} {
     swapContents(rhs);
   }
 
   F14Table(F14Table&& rhs, Alloc const& alloc) noexcept(kAllocIsAlwaysEqual)
       : Policy{std::move(rhs), alloc} {
     if (kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) {
       // move storage (common case)
       swapContents(rhs);
     } else {
       // new storage because allocators unequal, move values (rare case)
       buildFromF14Table(std::move(rhs));
     }
   }
 
   F14Table& operator=(F14Table const& rhs) {
     if (this != &rhs) {
       reset();
       static_cast<Policy&>(*this) = rhs;
       buildFromF14Table(rhs);
     }
     return *this;
   }
 
   F14Table& operator=(F14Table&& rhs) noexcept(
       std::is_nothrow_move_assignable<Hasher>::value&&
           std::is_nothrow_move_assignable<KeyEqual>::value &&
       (kAllocIsAlwaysEqual ||
        (AllocTraits::propagate_on_container_move_assignment::value &&
         std::is_nothrow_move_assignable<Alloc>::value))) {
     if (this != &rhs) {
       reset();
       static_cast<Policy&>(*this) = std::move(rhs);
       if (AllocTraits::propagate_on_container_move_assignment::value ||
           kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) {
         // move storage (common case)
         swapContents(rhs);
       } else {
         // new storage because allocators unequal, move values (rare case)
         buildFromF14Table(std::move(rhs));
       }
     }
     return *this;
   }
 
   ~F14Table() {
     reset();
   }
 
   void swap(F14Table& rhs) noexcept(kSwapIsNoexcept) {
     // If propagate_on_container_swap is false and allocators are
     // not equal, the only way to accomplish a swap would be to do
     // dynamic allocation and then move (or swap) each contained value.
     // AllocatorAwareContainer-s are not supposed to attempt this, but
     // rather are supposed to have undefined behavior in that case.
     FOLLY_SAFE_CHECK(
         AllocTraits::propagate_on_container_swap::value ||
             kAllocIsAlwaysEqual || this->alloc() == rhs.alloc(),
         "swap is undefined for unequal non-propagating allocators");
     this->swapPolicy(rhs);
     swapContents(rhs);
   }
 
  private:
 
   // Hash values are used to compute the desired position, which is the
   // chunk index at which we would like to place a value (if there is no
   // overflow), and the tag, which is an additional 8 bits of entropy.
   //
   // The standard's definition of hash function quality only refers to
   // the probability of collisions of the entire hash value, not to the
   // probability of collisions of the results of shifting or masking the
   // hash value.  Some hash functions, however, provide this stronger
   // guarantee (not quite the same as the definition of avalanching,
   // but similar).
   //
   // If the user-supplied hasher is an avalanching one (each bit of the
   // hash value has a 50% chance of being the same for differing hash
   // inputs), then we can just take 1 byte of the hash value for the tag
   // and the rest for the desired position.  Avalanching hashers also
   // let us map hash value to array index position with just a bitmask
   // without risking clumping.  (Many hash tables just accept the risk
   // and do it regardless.)
   //
   // std::hash<std::string> avalanches in all implementations we've
   // examined: libstdc++-v3 uses MurmurHash2, and libc++ uses CityHash
   // or MurmurHash2.  The other std::hash specializations, however, do not
   // have this property.  std::hash for integral and pointer values is the
   // identity function on libstdc++-v3 and libc++, in particular.  In our
   // experience it is also fairly common for user-defined specializations
   // of std::hash to combine fields in an ad-hoc way that does not evenly
   // distribute entropy among the bits of the result (a + 37 * b, for
   // example, where a and b are integer fields).
   //
   // For hash functions we don't trust to avalanche, we repair things by
   // applying a bit mixer to the user-supplied hash.
 
 #if FOLLY_X64 || FOLLY_AARCH64
   // 64-bit
   static HashPair splitHash(std::size_t hash) {
     static_assert(sizeof(std::size_t) == sizeof(uint64_t), "");
     std::size_t tag;
     if (!isAvalanchingHasher()) {
 #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE
 #if FOLLY_SSE
       // SSE4.2 CRC
       std::size_t c = _mm_crc32_u64(0, hash);
       tag = (c >> 24) | 0x80;
       hash += c;
 #else
       // CRC is optional on armv8 (-march=armv8-a+crc), standard on armv8.1
       std::size_t c = __crc32cd(0, hash);
       tag = (c >> 24) | 0x80;
       hash += c;
 #endif
 #else
       // The mixer below is not fully avalanching for all 64 bits of
       // output, but looks quite good for bits 18..63 and puts plenty
       // of entropy even lower when considering multiple bits together
       // (like the tag).  Importantly, when under register pressure it
       // uses fewer registers, instructions, and immediate constants
       // than the alternatives, resulting in compact code that is more
       // easily inlinable.  In one instantiation a modified Murmur mixer
       // was 48 bytes of assembly (even after using the same multiplicand
       // for both steps) and this one was 27 bytes, for example.
       auto const kMul = 0xc4ceb9fe1a85ec53ULL;
 #ifdef _WIN32
       __int64 signedHi;
       __int64 signedLo = _mul128(
           static_cast<__int64>(hash), static_cast<__int64>(kMul), &signedHi);
       auto hi = static_cast<uint64_t>(signedHi);
       auto lo = static_cast<uint64_t>(signedLo);
 #else
       auto hi = static_cast<uint64_t>(
           (static_cast<unsigned __int128>(hash) * kMul) >> 64);
       auto lo = hash * kMul;
 #endif
       hash = hi ^ lo;
       hash *= kMul;
       tag = ((hash >> 15) & 0x7f) | 0x80;
       hash >>= 22;
 #endif
     } else {
       // we don't trust the top bit
       tag = (hash >> 56) | 0x80;
     }
     return std::make_pair(hash, tag);
   }
 #else
   // 32-bit
   static HashPair splitHash(std::size_t hash) {
     static_assert(sizeof(std::size_t) == sizeof(uint32_t), "");
     uint8_t tag;
     if (!isAvalanchingHasher()) {
 #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE
 #if FOLLY_SSE
       // SSE4.2 CRC
       auto c = _mm_crc32_u32(0, hash);
       tag = static_cast<uint8_t>(~(c >> 25));
       hash += c;
 #else
       auto c = __crc32cw(0, hash);
       tag = static_cast<uint8_t>(~(c >> 25));
       hash += c;
 #endif
 #else
       // finalizer for 32-bit murmur2
       hash ^= hash >> 13;
       hash *= 0x5bd1e995;
       hash ^= hash >> 15;
       tag = static_cast<uint8_t>(~(hash >> 25));
 #endif
     } else {
       // we don't trust the top bit
       tag = (hash >> 24) | 0x80;
     }
     return std::make_pair(hash, tag);
   }
 #endif
 
 
   static std::size_t chunkAllocSize(
       std::size_t chunkCount,
       std::size_t maxSizeWithoutRehash) {
     if (chunkCount == 1) {
       FOLLY_SAFE_DCHECK((maxSizeWithoutRehash % 2) == 0, "");
       static_assert(offsetof(Chunk, rawItems_) == 16, "");
       return 16 + sizeof(Item) * maxSizeWithoutRehash;
     } else {
       return sizeof(Chunk) * chunkCount;
     }
   }
 
   ChunkPtr initializeChunks(
       BytePtr raw,
       std::size_t chunkCount,
       std::size_t maxSizeWithoutRehash) {
     static_assert(std::is_trivial<Chunk>::value, "F14Chunk should be POD");
     auto chunks = static_cast<Chunk*>(static_cast<void*>(&*raw));
     for (std::size_t i = 0; i < chunkCount; ++i) {
       chunks[i].clear();
     }
     chunks[0].markEof(chunkCount == 1 ? maxSizeWithoutRehash : 1);
     return std::pointer_traits<ChunkPtr>::pointer_to(*chunks);
   }
 
  public:
   ItemIter begin() const noexcept {
     FOLLY_SAFE_DCHECK(kEnableItemIteration, "");
     return ItemIter{sizeAndPackedBegin_.packedBegin()};
   }
 
   ItemIter end() const noexcept {
     return ItemIter{};
   }
 
   bool empty() const noexcept {
     return size() == 0;
   }
 
   InternalSizeType size() const noexcept {
     return sizeAndPackedBegin_.size_;
   }
 
   std::size_t max_size() const noexcept {
     auto& a = this->alloc();
     return std::min<std::size_t>(
         (std::numeric_limits<InternalSizeType>::max)(),
         AllocTraits::max_size(a));
   }
 
   std::size_t bucket_count() const noexcept {
     // bucket_count is just a synthetic construct for the outside world
     // so that size, bucket_count, load_factor, and max_load_factor are
     // all self-consistent.  The only one of those that is real is size().
     if (chunkMask_ != 0) {
       return (chunkMask_ + 1) * Chunk::kDesiredCapacity;
     } else {
       return chunks_->chunk0Capacity();
     }
   }
 
   std::size_t max_bucket_count() const noexcept {
     return max_size();
   }
 
   float load_factor() const noexcept {
     return empty()
         ? 0.0f
         : static_cast<float>(size()) / static_cast<float>(bucket_count());
   }
 
   float max_load_factor() const noexcept {
     return 1.0f;
   }
 
   void max_load_factor(float) noexcept {
     // Probing hash tables can't run load factors >= 1 (unlike chaining
     // tables).  In addition, we have measured that there is little or
     // no performance advantage to running a smaller load factor (cache
     // locality losses outweigh the small reduction in probe lengths,
     // often making it slower).  Therefore, we've decided to just fix
     // max_load_factor at 1.0f regardless of what the user requests.
     // This has an additional advantage that we don't have to store it.
     // Taking alignment into consideration this makes every F14 table
     // 8 bytes smaller, and is part of the reason an empty F14NodeMap
     // is almost half the size of an empty std::unordered_map (32 vs
     // 56 bytes).
     //
     // I don't have a strong opinion on whether we should remove this
     // method or leave a stub, let ngbronson or xshi know if you have a
     // compelling argument either way.
   }
 
  private:
   // Our probe strategy is to advance through additional chunks with
   // a stride that is key-specific.  This is called double hashing,
   // and is a well known and high quality probing strategy.  So long as
   // the stride and the chunk count are relatively prime, we will visit
   // every chunk once and then return to the original chunk, letting us
   // detect and end the cycle.  The chunk count is a power of two, so
   // we can satisfy the relatively prime part by choosing an odd stride.
   // We've already computed a high quality secondary hash value for the
   // tag, so we just use it for the second probe hash as well.
   //
   // At the maximum load factor of 12/14, expected probe length for a
   // find hit is 1.041, with 99% of keys found in the first three chunks.
   // Expected probe length for a find miss (or insert) is 1.275, with a
   // p99 probe length of 4 (fewer than 1% of failing find look at 5 or
   // more chunks).
   //
   // This code is structured so you can try various ways of encoding
   // the current probe state.  For example, at the moment the probe's
   // state is the position in the cycle and the resulting chunk index is
   // computed from that inside probeCurrentIndex.  We could also make the
   // probe state the chunk index, and then increment it by hp.second *
   // 2 + 1 in probeAdvance.  Wrapping can be applied early or late as
   // well.  This particular code seems to be easier for the optimizer
   // to understand.
   //
   // We could also implement probing strategies that resulted in the same
   // tour for every key initially assigned to a chunk (linear probing or
   // quadratic), but that results in longer probe lengths.  In particular,
   // the cache locality wins of linear probing are not worth the increase
   // in probe lengths (extra work and less branch predictability) in
   // our experiments.
 
   std::size_t probeDelta(HashPair hp) const {
     return 2 * hp.second + 1;
   }
 
   template <typename K>
   FOLLY_ALWAYS_INLINE ItemIter findImpl(HashPair hp, K const& key) const {
     std::size_t index = hp.first;
     std::size_t step = probeDelta(hp);
     for (std::size_t tries = 0; tries <= chunkMask_; ++tries) {
       ChunkPtr chunk = chunks_ + (index & chunkMask_);
       if (sizeof(Chunk) > 64) {
         prefetchAddr(chunk->itemAddr(8));
       }
       auto hits = chunk->tagMatchIter(hp.second);
       while (hits.hasNext()) {
         auto i = hits.next();
         if (LIKELY(this->keyMatchesItem(key, chunk->item(i)))) {
           // Tag match and key match were both successful.  The chance
           // of a false tag match is 1/128 for each key in the chunk
           // (with a proper hash function).
           return ItemIter{chunk, i};
         }
       }
       if (LIKELY(chunk->outboundOverflowCount() == 0)) {
         // No keys that wanted to be placed in this chunk were denied
         // entry, so our search is over.  This is the common case.
         break;
       }
       index += step;
     }
     // Loop exit because tries is exhausted is rare, but possible.
     // That means that for every chunk there is currently a key present
     // in the map that visited that chunk on its probe search but ended
     // up somewhere else, and we have searched every chunk.
     return ItemIter{};
   }
 
  public:
   // Prehashing splits the work of find(key) into two calls, enabling you
   // to manually implement loop pipelining for hot bulk lookups.  prehash
   // computes the hash and prefetches the first computed memory location,
   // and the two-arg find(F14HashToken,K) performs the rest of the search.
   template <typename K>
   F14HashToken prehash(K const& key) const {
     FOLLY_SAFE_DCHECK(chunks_ != nullptr, "");
     auto hp = splitHash(this->computeKeyHash(key));
     ChunkPtr firstChunk = chunks_ + (hp.first & chunkMask_);
     prefetchAddr(firstChunk);
     return F14HashToken(std::move(hp));
   }
 
   template <typename K>
   FOLLY_ALWAYS_INLINE ItemIter find(K const& key) const {
     auto hp = splitHash(this->computeKeyHash(key));
     return findImpl(hp, key);
   }
 
   template <typename K>
   FOLLY_ALWAYS_INLINE ItemIter
   find(F14HashToken const& token, K const& key) const {
     FOLLY_SAFE_DCHECK(
         splitHash(this->computeKeyHash(key)) == static_cast<HashPair>(token),
         "");
     return findImpl(static_cast<HashPair>(token), key);
   }
 
  private:
   void adjustSizeAndBeginAfterInsert(ItemIter iter) {
     if (kEnableItemIteration) {
       // packedBegin is the max of all valid ItemIter::pack()
       auto packed = iter.pack();
       if (sizeAndPackedBegin_.packedBegin() < packed) {
         sizeAndPackedBegin_.packedBegin() = packed;
       }
     }
 
     ++sizeAndPackedBegin_.size_;
   }
 
   // Ignores hp if pos.chunk()->hostedOverflowCount() == 0
   void eraseBlank(ItemIter iter, HashPair hp) {
     iter.chunk()->clearTag(iter.index());
 
     if (iter.chunk()->hostedOverflowCount() != 0) {
       // clean up
       std::size_t index = hp.first;
       std::size_t delta = probeDelta(hp);
       uint8_t hostedOp = 0;
       while (true) {
         ChunkPtr chunk = chunks_ + (index & chunkMask_);
         if (chunk == iter.chunk()) {
           chunk->adjustHostedOverflowCount(hostedOp);
           break;
         }
         chunk->decrOutboundOverflowCount();
         hostedOp = Chunk::kDecrHostedOverflowCount;
         index += delta;
       }
     }
   }
 
   void adjustSizeAndBeginBeforeErase(ItemIter iter) {
     --sizeAndPackedBegin_.size_;
     if (kEnableItemIteration) {
       if (iter.pack() == sizeAndPackedBegin_.packedBegin()) {
         if (size() == 0) {
           iter = ItemIter{};
         } else {
           iter.precheckedAdvance();
         }
         sizeAndPackedBegin_.packedBegin() = iter.pack();
       }
     }
   }
 
   template <typename... Args>
   void insertAtBlank(ItemIter pos, HashPair hp, Args&&... args) {
     try {
       auto dst = pos.itemAddr();
       this->constructValueAtItem(size(), dst, std::forward<Args>(args)...);
     } catch (...) {
       eraseBlank(pos, hp);
       throw;
     }
     adjustSizeAndBeginAfterInsert(pos);
   }
 
   ItemIter allocateTag(uint8_t* fullness, HashPair hp) {
     ChunkPtr chunk;
     std::size_t index = hp.first;
     std::size_t delta = probeDelta(hp);
     uint8_t hostedOp = 0;
     while (true) {
       index &= chunkMask_;
       chunk = chunks_ + index;
       if (LIKELY(fullness[index] < Chunk::kCapacity)) {
         break;
       }
       chunk->incrOutboundOverflowCount();
       hostedOp = Chunk::kIncrHostedOverflowCount;
       index += delta;
     }
     unsigned itemIndex = fullness[index]++;
     FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), "");
     chunk->setTag(itemIndex, hp.second);
     chunk->adjustHostedOverflowCount(hostedOp);
     return ItemIter{chunk, itemIndex};
   }
 
   ChunkPtr lastOccupiedChunk() const {
     FOLLY_SAFE_DCHECK(size() > 0, "");
     if (kEnableItemIteration) {
       return begin().chunk();
     } else {
       return chunks_ + chunkMask_;
     }
   }
 
   template <typename T>
   void directBuildFrom(T&& src) {
     FOLLY_SAFE_DCHECK(src.size() > 0 && chunkMask_ == src.chunkMask_, "");
 
     // We use std::forward<T> to allow portions of src to be moved out by
     // either beforeBuild or afterBuild, but we are just relying on good
     // behavior of our Policy superclass to ensure that any particular
     // field of this is a donor at most once.
 
     auto undoState =
         this->beforeBuild(src.size(), bucket_count(), std::forward<T>(src));
     bool success = false;
     SCOPE_EXIT {
       this->afterBuild(
           undoState, success, src.size(), bucket_count(), std::forward<T>(src));
     };
 
     // Copy can fail part-way through if a Value copy constructor throws.
     // Failing afterBuild is limited in its cleanup power in this case,
     // because it can't enumerate the items that were actually copied.
     // Fortunately we can divide the situation into cases where all of
     // the state is owned by the table itself (F14Node and F14Value),
     // for which clearImpl() can do partial cleanup, and cases where all
     // of the values are owned by the policy (F14Vector), in which case
     // partial failure should not occur.  Sorry for the subtle invariants
     // in the Policy API.
 
     if (is_trivially_copyable<Item>::value && !this->destroyItemOnClear() &&
         bucket_count() == src.bucket_count()) {
       // most happy path
       auto n = chunkAllocSize(chunkMask_ + 1, bucket_count());
       std::memcpy(&chunks_[0], &src.chunks_[0], n);
       sizeAndPackedBegin_.size_ = src.size();
       if (kEnableItemIteration) {
         auto srcBegin = src.begin();
         sizeAndPackedBegin_.packedBegin() =
             ItemIter{chunks_ + (srcBegin.chunk() - src.chunks_),
                      srcBegin.index()}
                 .pack();
       }
     } else {
       std::size_t maxChunkIndex = src.lastOccupiedChunk() - src.chunks_;
 
       // happy path, no rehash but pack items toward bottom of chunk and
       // use copy constructor
       auto srcChunk = &src.chunks_[maxChunkIndex];
       Chunk* dstChunk = &chunks_[maxChunkIndex];
       do {
         dstChunk->copyOverflowInfoFrom(*srcChunk);
 
         auto iter = srcChunk->occupiedIter();
         if (prefetchBeforeCopy()) {
           for (auto piter = iter; piter.hasNext();) {
             this->prefetchValue(srcChunk->citem(piter.next()));
           }
         }
 
         std::size_t dstI = 0;
         for (; iter.hasNext(); ++dstI) {
           auto srcI = iter.next();
           auto&& srcArg =
               std::forward<T>(src).buildArgForItem(srcChunk->item(srcI));
           auto dst = dstChunk->itemAddr(dstI);
           this->constructValueAtItem(
               0, dst, std::forward<decltype(srcArg)>(srcArg));
           dstChunk->setTag(dstI, srcChunk->tag(srcI));
           ++sizeAndPackedBegin_.size_;
         }
 
         --srcChunk;
         --dstChunk;
       } while (size() != src.size());
 
       // reset doesn't care about packedBegin, so we don't fix it until the end
       if (kEnableItemIteration) {
         sizeAndPackedBegin_.packedBegin() =
             ItemIter{chunks_ + maxChunkIndex,
                      chunks_[maxChunkIndex].lastOccupied().index()}
                 .pack();
       }
     }
 
     success = true;
   }
 
   template <typename T>
   void rehashBuildFrom(T&& src) {
     FOLLY_SAFE_DCHECK(src.chunkMask_ > chunkMask_, "");
 
     // 1 byte per chunk means < 1 bit per value temporary overhead
     std::array<uint8_t, 256> stackBuf;
     uint8_t* fullness;
     auto cc = chunkMask_ + 1;
     if (cc <= stackBuf.size()) {
       fullness = stackBuf.data();
     } else {
       ByteAlloc a{this->alloc()};
       fullness = &*std::allocator_traits<ByteAlloc>::allocate(a, cc);
     }
     SCOPE_EXIT {
       if (cc > stackBuf.size()) {
         ByteAlloc a{this->alloc()};
         std::allocator_traits<ByteAlloc>::deallocate(
             a,
             std::pointer_traits<typename std::allocator_traits<
                 ByteAlloc>::pointer>::pointer_to(*fullness),
             cc);
       }
     };
     std::memset(fullness, '\0', cc);
 
     // We use std::forward<T> to allow portions of src to be moved out by
     // either beforeBuild or afterBuild, but we are just relying on good
     // behavior of our Policy superclass to ensure that any particular
     // field of this is a donor at most once.
 
     // Exception safety requires beforeBuild to happen after all of the
     // allocate() calls.
     auto undoState =
         this->beforeBuild(src.size(), bucket_count(), std::forward<T>(src));
     bool success = false;
     SCOPE_EXIT {
       this->afterBuild(
           undoState, success, src.size(), bucket_count(), std::forward<T>(src));
     };
 
     // The current table is at a valid state at all points for policies
     // in which non-trivial values are owned by the main table (F14Node
     // and F14Value), so reset() will clean things up properly if we
     // fail partway through.  For the case that the policy manages value
     // lifecycle (F14Vector) then nothing after beforeBuild can throw and
     // we don't have to worry about partial failure.
 
     std::size_t srcChunkIndex = src.lastOccupiedChunk() - src.chunks_;
     while (true) {
       auto srcChunk = &src.chunks_[srcChunkIndex];
       auto iter = srcChunk->occupiedIter();
       if (prefetchBeforeRehash()) {
         for (auto piter = iter; piter.hasNext();) {
           this->prefetchValue(srcChunk->item(piter.next()));
         }
       }
       if (srcChunk->hostedOverflowCount() == 0) {
         // all items are in their preferred chunk (no probing), so we
         // don't need to compute any hash values
         while (iter.hasNext()) {
           auto i = iter.next();
           auto& srcItem = srcChunk->item(i);
           auto&& srcArg = std::forward<T>(src).buildArgForItem(srcItem);
           HashPair hp{srcChunkIndex, srcChunk->tag(i)};
           insertAtBlank(
               allocateTag(fullness, hp),
               hp,
               std::forward<decltype(srcArg)>(srcArg));
         }
       } else {
         // any chunk's items might be in here
         while (iter.hasNext()) {
           auto i = iter.next();
           auto& srcItem = srcChunk->item(i);
           auto&& srcArg = std::forward<T>(src).buildArgForItem(srcItem);
           auto const& srcKey = src.keyForValue(srcArg);
           auto hp = splitHash(this->computeKeyHash(srcKey));
           FOLLY_SAFE_DCHECK(hp.second == srcChunk->tag(i), "");
           insertAtBlank(
               allocateTag(fullness, hp),
               hp,
               std::forward<decltype(srcArg)>(srcArg));
         }
       }
       if (srcChunkIndex == 0) {
         break;
       }
       --srcChunkIndex;
     }
 
     success = true;
   }
 
   template <typename T>
   FOLLY_NOINLINE void buildFromF14Table(T&& src) {
     FOLLY_SAFE_DCHECK(size() == 0, "");
     if (src.size() == 0) {
       return;
     }
 
     reserveForInsert(src.size());
     try {
       if (chunkMask_ == src.chunkMask_) {
         directBuildFrom(std::forward<T>(src));
       } else {
         rehashBuildFrom(std::forward<T>(src));
       }
     } catch (...) {
       reset();
       F14LinkCheck<getF14IntrinsicsMode()>::check();
       throw;
     }
   }
 
   FOLLY_NOINLINE void reserveImpl(
       std::size_t capacity,
       std::size_t origChunkCount,
       std::size_t origMaxSizeWithoutRehash) {
     FOLLY_SAFE_DCHECK(capacity >= size(), "");
 
     // compute new size
     std::size_t const kInitialCapacity = 2;
     std::size_t const kHalfChunkCapacity =
         (Chunk::kDesiredCapacity / 2) & ~std::size_t{1};
     std::size_t newMaxSizeWithoutRehash;
     std::size_t newChunkCount;
     if (capacity <= kHalfChunkCapacity) {
       newChunkCount = 1;
       newMaxSizeWithoutRehash =
           (capacity < kInitialCapacity) ? kInitialCapacity : kHalfChunkCapacity;
     } else {
       newChunkCount = nextPowTwo((capacity - 1) / Chunk::kDesiredCapacity + 1);
       newMaxSizeWithoutRehash = newChunkCount * Chunk::kDesiredCapacity;
 
       constexpr std::size_t kMaxChunksWithoutCapacityOverflow =
           (std::numeric_limits<std::size_t>::max)() / Chunk::kDesiredCapacity;
 
       if (newChunkCount > kMaxChunksWithoutCapacityOverflow ||
           newMaxSizeWithoutRehash > max_size()) {
         throw_exception<std::bad_alloc>();
       }
     }
 
     if (origMaxSizeWithoutRehash != newMaxSizeWithoutRehash) {
       rehashImpl(
           origChunkCount,
           origMaxSizeWithoutRehash,
           newChunkCount,
           newMaxSizeWithoutRehash);
     }
   }
 
   void rehashImpl(
       std::size_t origChunkCount,
       std::size_t origMaxSizeWithoutRehash,
       std::size_t newChunkCount,
       std::size_t newMaxSizeWithoutRehash) {
     auto origChunks = chunks_;
 
     BytePtr rawAllocation;
     auto undoState = this->beforeRehash(
         size(),
         origMaxSizeWithoutRehash,
         newMaxSizeWithoutRehash,
         chunkAllocSize(newChunkCount, newMaxSizeWithoutRehash),
         rawAllocation);
     chunks_ =
         initializeChunks(rawAllocation, newChunkCount, newMaxSizeWithoutRehash);
 
     FOLLY_SAFE_DCHECK(
         newChunkCount < std::numeric_limits<InternalSizeType>::max(), "");
     chunkMask_ = static_cast<InternalSizeType>(newChunkCount - 1);
 
     bool success = false;
     SCOPE_EXIT {
       // this SCOPE_EXIT reverts chunks_ and chunkMask_ if necessary
       BytePtr finishedRawAllocation = nullptr;
       std::size_t finishedAllocSize = 0;
       if (LIKELY(success)) {
         if (origMaxSizeWithoutRehash > 0) {
           finishedRawAllocation = std::pointer_traits<BytePtr>::pointer_to(
               *static_cast<uint8_t*>(static_cast<void*>(&*origChunks)));
           finishedAllocSize =
               chunkAllocSize(origChunkCount, origMaxSizeWithoutRehash);
         }
       } else {
         finishedRawAllocation = rawAllocation;
         finishedAllocSize =
             chunkAllocSize(newChunkCount, newMaxSizeWithoutRehash);
         chunks_ = origChunks;
         FOLLY_SAFE_DCHECK(
             origChunkCount < std::numeric_limits<InternalSizeType>::max(), "");
         chunkMask_ = static_cast<InternalSizeType>(origChunkCount - 1);
         F14LinkCheck<getF14IntrinsicsMode()>::check();
       }
 
       this->afterRehash(
           std::move(undoState),
           success,
           size(),
           origMaxSizeWithoutRehash,
           newMaxSizeWithoutRehash,
           finishedRawAllocation,
           finishedAllocSize);
     };
 
     if (size() == 0) {
       // nothing to do
     } else if (origChunkCount == 1 && newChunkCount == 1) {
       // no mask, no chunk scan, no hash computation, no probing
       auto srcChunk = origChunks;
       auto dstChunk = chunks_;
       std::size_t srcI = 0;
       std::size_t dstI = 0;
       while (dstI < size()) {
         if (LIKELY(srcChunk->occupied(srcI))) {
           dstChunk->setTag(dstI, srcChunk->tag(srcI));
           this->moveItemDuringRehash(
               dstChunk->itemAddr(dstI), srcChunk->item(srcI));
           ++dstI;
         }
         ++srcI;
       }
       if (kEnableItemIteration) {
         sizeAndPackedBegin_.packedBegin() = ItemIter{dstChunk, dstI - 1}.pack();
       }
     } else {
       // 1 byte per chunk means < 1 bit per value temporary overhead
       std::array<uint8_t, 256> stackBuf;
       uint8_t* fullness;
       if (newChunkCount <= stackBuf.size()) {
         fullness = stackBuf.data();
       } else {
         ByteAlloc a{this->alloc()};
         // may throw
         fullness =
             &*std::allocator_traits<ByteAlloc>::allocate(a, newChunkCount);
       }
       std::memset(fullness, '\0', newChunkCount);
       SCOPE_EXIT {
         if (newChunkCount > stackBuf.size()) {
           ByteAlloc a{this->alloc()};
           std::allocator_traits<ByteAlloc>::deallocate(
               a,
               std::pointer_traits<typename std::allocator_traits<
                   ByteAlloc>::pointer>::pointer_to(*fullness),
               newChunkCount);
         }
       };
 
       auto srcChunk = origChunks + origChunkCount - 1;
       std::size_t remaining = size();
       while (remaining > 0) {
         auto iter = srcChunk->occupiedIter();
         if (prefetchBeforeRehash()) {
           for (auto piter = iter; piter.hasNext();) {
             this->prefetchValue(srcChunk->item(piter.next()));
           }
         }
         while (iter.hasNext()) {
           --remaining;
           auto srcI = iter.next();
           Item& srcItem = srcChunk->item(srcI);
           auto hp = splitHash(
               this->computeItemHash(const_cast<Item const&>(srcItem)));
           FOLLY_SAFE_DCHECK(hp.second == srcChunk->tag(srcI), "");
 
           auto dstIter = allocateTag(fullness, hp);
           this->moveItemDuringRehash(dstIter.itemAddr(), srcItem);
         }
         --srcChunk;
       }
 
       if (kEnableItemIteration) {
         // this code replaces size invocations of adjustSizeAndBeginAfterInsert
         std::size_t i = chunkMask_;
         while (fullness[i] == 0) {
           --i;
         }
         sizeAndPackedBegin_.packedBegin() =
             ItemIter{chunks_ + i, std::size_t{fullness[i]} - 1}.pack();
       }
     }
 
     success = true;
   }
 
   void asanOnReserve(std::size_t capacity) {
     if (kIsSanitizeAddress && capacity > size()) {
       asanPendingSafeInserts += capacity - size();
     }
   }
 
   bool asanShouldAddExtraRehash() {
     if (!kIsSanitizeAddress) {
       return false;
     } else if (asanPendingSafeInserts > 0) {
       --asanPendingSafeInserts;
       return false;
     } else if (size() <= 1) {
       return size() > 0;
     } else {
       constexpr std::size_t kBigPrime = 4294967291U;
       auto s = (asanRehashState += kBigPrime);
       return (s % size()) == 0;
     }
   }
 
   void asanExtraRehash() {
     auto cc = chunkMask_ + 1;
     auto bc = bucket_count();
     rehashImpl(cc, bc, cc, bc);
   }
 
   void asanOnInsert() {
     // When running under ASAN, we add a spurious rehash with 1/size()
     // probability before every insert.  This means that finding reference
     // stability problems for F14Value and F14Vector is much more likely.
     // The most common pattern that causes this is
     //
     //   auto& ref = map[k1]; map[k2] = foo(ref);
     //
     // One way to fix this is to call map.reserve(N) before such a
     // sequence, where N is the number of keys that might be inserted
     // within the section that retains references.
     if (asanShouldAddExtraRehash()) {
       asanExtraRehash();
     }
   }
 
  public:
   // user has no control over max_load_factor
 
   void rehash(std::size_t capacity) {
     reserve(capacity);
   }
 
   void reserve(std::size_t capacity) {
     // We want to support the pattern
     //   map.reserve(2); auto& r1 = map[k1]; auto& r2 = map[k2];
     asanOnReserve(capacity);
     reserveImpl(
         std::max<std::size_t>(capacity, size()),
         chunkMask_ + 1,
         bucket_count());
   }
 
   // Returns true iff a rehash was performed
   void reserveForInsert(size_t incoming = 1) {
     auto capacity = size() + incoming;
     auto bc = bucket_count();
     if (capacity - 1 >= bc) {
       reserveImpl(capacity, chunkMask_ + 1, bc);
     }
   }
 
   // Returns pos,true if construct, pos,false if found.  key is only used
   // during the search; all constructor args for an inserted value come
   // from args...  key won't be accessed after args are touched.
   template <typename K, typename... Args>
   std::pair<ItemIter, bool> tryEmplaceValue(K const& key, Args&&... args) {
     const auto hp = splitHash(this->computeKeyHash(key));
 
     if (size() > 0) {
       auto existing = findImpl(hp, key);
       if (!existing.atEnd()) {
         return std::make_pair(existing, false);
       }
     }
 
     asanOnInsert();
 
     reserveForInsert();
 
     std::size_t index = hp.first;
     ChunkPtr chunk = chunks_ + (index & chunkMask_);
     auto firstEmpty = chunk->firstEmpty();
 
     if (!firstEmpty.hasIndex()) {
       std::size_t delta = probeDelta(hp);
       do {
         chunk->incrOutboundOverflowCount();
         index += delta;
         chunk = chunks_ + (index & chunkMask_);
         firstEmpty = chunk->firstEmpty();
       } while (!firstEmpty.hasIndex());
       chunk->adjustHostedOverflowCount(Chunk::kIncrHostedOverflowCount);
     }
     std::size_t itemIndex = firstEmpty.index();
     FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), "");
 
     chunk->setTag(itemIndex, hp.second);
     ItemIter iter{chunk, itemIndex};
 
     // insertAtBlank will clear the tag if the constructor throws
     insertAtBlank(iter, hp, std::forward<Args>(args)...);
     return std::make_pair(iter, true);
   }
 
  private:
   template <bool Reset>
   void clearImpl() noexcept {
     if (chunks_ == Chunk::emptyInstance()) {
       FOLLY_SAFE_DCHECK(empty() && bucket_count() == 0, "");
       return;
     }
 
     // turn clear into reset if the table is >= 16 chunks so that
     // we don't get too low a load factor
     bool willReset = Reset || chunkMask_ + 1 >= 16;
 
     auto origSize = size();
     auto origCapacity = bucket_count();
     if (willReset) {
       this->beforeReset(origSize, origCapacity);
     } else {
       this->beforeClear(origSize, origCapacity);
     }
 
     if (!empty()) {
       if (destroyItemOnClear()) {
         for (std::size_t ci = 0; ci <= chunkMask_; ++ci) {
           ChunkPtr chunk = chunks_ + ci;
           auto iter = chunk->occupiedIter();
           if (prefetchBeforeDestroy()) {
             for (auto piter = iter; piter.hasNext();) {
               this->prefetchValue(chunk->item(piter.next()));
             }
           }
           while (iter.hasNext()) {
             this->destroyItem(chunk->item(iter.next()));
           }
         }
       }
       if (!willReset) {
         // It's okay to do this in a separate loop because we only do it
         // when the chunk count is small.  That avoids a branch when we
         // are promoting a clear to a reset for a large table.
         auto c0c = chunks_[0].chunk0Capacity();
         for (std::size_t ci = 0; ci <= chunkMask_; ++ci) {
           chunks_[ci].clear();
         }
         chunks_[0].markEof(c0c);
       }
       if (kEnableItemIteration) {
         sizeAndPackedBegin_.packedBegin() = ItemIter{}.pack();
       }
       sizeAndPackedBegin_.size_ = 0;
     }
 
     if (willReset) {
       BytePtr rawAllocation = std::pointer_traits<BytePtr>::pointer_to(
           *static_cast<uint8_t*>(static_cast<void*>(&*chunks_)));
       std::size_t rawSize = chunkAllocSize(chunkMask_ + 1, bucket_count());
 
       chunks_ = Chunk::emptyInstance();
       chunkMask_ = 0;
 
       this->afterReset(origSize, origCapacity, rawAllocation, rawSize);
     } else {
       this->afterClear(origSize, origCapacity);
     }
   }
 
   void eraseImpl(ItemIter pos, HashPair hp) {
     this->destroyItem(pos.item());
     adjustSizeAndBeginBeforeErase(pos);
     eraseBlank(pos, hp);
   }
 
  public:
   // The item needs to still be hashable during this call.  If you want
   // to intercept the value before it is destroyed (to extract it, for
   // example), use eraseIterInto(pos, beforeDestroy).
   void eraseIter(ItemIter pos) {
     eraseIterInto(pos, [](value_type&&) {});
   }
 
   // The item needs to still be hashable during this call.  If you want
   // to intercept the value before it is destroyed (to extract it, for
   // example), do so in the beforeDestroy callback.
   template <typename BeforeDestroy>
   void eraseIterInto(ItemIter pos, BeforeDestroy&& beforeDestroy) {
     HashPair hp{};
     if (pos.chunk()->hostedOverflowCount() != 0) {
       hp = splitHash(this->computeItemHash(pos.citem()));
     }
     beforeDestroy(this->valueAtItemForExtract(pos.item()));
     eraseImpl(pos, hp);
   }
 
   template <typename K>
   std::size_t eraseKey(K const& key) {
     return eraseKeyInto(key, [](value_type&&) {});
   }
 
   template <typename K, typename BeforeDestroy>
   std::size_t eraseKeyInto(K const& key, BeforeDestroy&& beforeDestroy) {
     if (UNLIKELY(size() == 0)) {
       return 0;
     }
     auto hp = splitHash(this->computeKeyHash(key));
     auto iter = findImpl(hp, key);
     if (!iter.atEnd()) {
       beforeDestroy(this->valueAtItemForExtract(iter.item()));
       eraseImpl(iter, hp);
       return 1;
     } else {
       return 0;
     }
   }
 
   void clear() noexcept {
     if (kIsSanitizeAddress) {
       // force recycling of heap memory
       auto bc = bucket_count();
       reset();
       try {
         reserveImpl(bc, 0, 0);
       } catch (std::bad_alloc const&) {
         // ASAN mode only, keep going
       }
     } else {
       clearImpl<false>();
     }
   }
 
   // Like clear(), but always frees all dynamic storage allocated
   // by the table.
   void reset() noexcept {
     clearImpl<true>();
   }
 
   // Get memory footprint, not including sizeof(*this).
   std::size_t getAllocatedMemorySize() const {
     std::size_t sum = 0;
     visitAllocationClasses(
         [&sum](std::size_t bytes, std::size_t n) { sum += bytes * n; });
     return sum;
   }
 
   // Enumerates classes of allocated memory blocks currently owned
   // by this table, calling visitor(allocationSize, allocationCount).
   // This can be used to get a more accurate indication of memory footprint
   // than getAllocatedMemorySize() if you have some way of computing the
   // internal fragmentation of the allocator, such as JEMalloc's nallocx.
   // The visitor might be called twice with the same allocationSize. The
   // visitor's computation should produce the same result for visitor(8,
   // 2) as for two calls to visitor(8, 1), for example.  The visitor may
   // be called with a zero allocationCount.
   template <typename V>
   void visitAllocationClasses(V&& visitor) const {
     auto bc = bucket_count();
     this->visitPolicyAllocationClasses(
         (bc == 0 ? 0 : chunkAllocSize(chunkMask_ + 1, bc)),
         size(),
         bc,
         visitor);
   }
 
   // visitor should take an Item const&
   template <typename V>
   void visitItems(V&& visitor) const {
     if (empty()) {
       return;
     }
     std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_;
     auto chunk = &chunks_[0];
     for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) {
       auto iter = chunk->occupiedIter();
       if (prefetchBeforeCopy()) {
         for (auto piter = iter; piter.hasNext();) {
           this->prefetchValue(chunk->citem(piter.next()));
         }
       }
       while (iter.hasNext()) {
         visitor(chunk->citem(iter.next()));
       }
     }
   }
 
   // visitor should take two Item const*
   template <typename V>
   void visitContiguousItemRanges(V&& visitor) const {
     if (empty()) {
       return;
     }
     std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_;
     auto chunk = &chunks_[0];
     for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) {
       for (auto iter = chunk->occupiedRangeIter(); iter.hasNext();) {
         auto be = iter.next();
         FOLLY_SAFE_DCHECK(
             chunk->occupied(be.first) && chunk->occupied(be.second - 1), "");
         Item const* b = chunk->itemAddr(be.first);
         visitor(b, b + (be.second - be.first));
       }
     }
   }
 
  private:
   static std::size_t& histoAt(
       std::vector<std::size_t>& histo,
       std::size_t index) {
     if (histo.size() <= index) {
       histo.resize(index + 1);
     }
     return histo.at(index);
   }
 
  public:
   // Expensive
   F14TableStats computeStats() const {
     F14TableStats stats;
 
     if (kIsDebug && kEnableItemIteration) {
       // validate iteration
       std::size_t n = 0;
       ItemIter prev;
       for (auto iter = begin(); iter != end(); iter.advance()) {
         FOLLY_SAFE_DCHECK(n == 0 || iter.pack() < prev.pack(), "");
         ++n;
         prev = iter;
       }
       FOLLY_SAFE_DCHECK(n == size(), "");
     }
 
     FOLLY_SAFE_DCHECK(
         (chunks_ == Chunk::emptyInstance()) == (bucket_count() == 0), "");
 
     std::size_t n1 = 0;
     std::size_t n2 = 0;
     auto cc = bucket_count() == 0 ? 0 : chunkMask_ + 1;
     for (std::size_t ci = 0; ci < cc; ++ci) {
       ChunkPtr chunk = chunks_ + ci;
       FOLLY_SAFE_DCHECK(chunk->eof() == (ci == 0), "");
 
       auto iter = chunk->occupiedIter();
 
       std::size_t chunkOccupied = 0;
       for (auto piter = iter; piter.hasNext(); piter.next()) {
         ++chunkOccupied;
       }
       n1 += chunkOccupied;
 
       histoAt(stats.chunkOccupancyHisto, chunkOccupied)++;
       histoAt(
           stats.chunkOutboundOverflowHisto, chunk->outboundOverflowCount())++;
       histoAt(stats.chunkHostedOverflowHisto, chunk->hostedOverflowCount())++;
 
       while (iter.hasNext()) {
         auto ii = iter.next();
         ++n2;
 
         {
           auto& item = chunk->citem(ii);
           auto hp = splitHash(this->computeItemHash(item));
           FOLLY_SAFE_DCHECK(chunk->tag(ii) == hp.second, "");
 
           std::size_t dist = 1;
           std::size_t index = hp.first;
           std::size_t delta = probeDelta(hp);
           while ((index & chunkMask_) != ci) {
             index += delta;
             ++dist;
           }
 
           histoAt(stats.keyProbeLengthHisto, dist)++;
         }
 
         // misses could have any tag, so we do the dumb but accurate
         // thing and just try them all
         for (std::size_t ti = 0; ti < 256; ++ti) {
           uint8_t tag = static_cast<uint8_t>(ti == 0 ? 1 : 0);
           HashPair hp{ci, tag};
 
           std::size_t dist = 1;
           std::size_t index = hp.first;
           std::size_t delta = probeDelta(hp);
           for (std::size_t tries = 0; tries <= chunkMask_ &&
                chunks_[index & chunkMask_].outboundOverflowCount() != 0;
                ++tries) {
             index += delta;
             ++dist;
           }
 
           histoAt(stats.missProbeLengthHisto, dist)++;
         }
       }
     }
 
     FOLLY_SAFE_DCHECK(n1 == size(), "");
     FOLLY_SAFE_DCHECK(n2 == size(), "");
 
 #if FOLLY_HAS_RTTI
     stats.policy = typeid(Policy).name();
 #endif
     stats.size = size();
     stats.valueSize = sizeof(value_type);
     stats.bucketCount = bucket_count();
     stats.chunkCount = cc;
 
     stats.totalBytes = sizeof(*this) + getAllocatedMemorySize();
     stats.overheadBytes = stats.totalBytes - size() * sizeof(value_type);
 
     return stats;
   }
 };
 } // namespace detail
 } // namespace f14
 
 #endif // FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
 
 } // namespace folly