CacheLocality.h
/*
 * Copyright 2013-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <functional>
#include <limits>
#include <mutex>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <folly/Indestructible.h>
#include <folly/Likely.h>
#include <folly/Memory.h>
#include <folly/Portability.h>
#include <folly/hash/Hash.h>
#include <folly/lang/Align.h>
#include <folly/lang/Exception.h>
#include <folly/system/ThreadId.h>

namespace folly {

// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
// cache sharing information from sysfs to determine how CPUs should be
// grouped to minimize contention, Getcpu provides fast access to the
// current CPU via __vdso_getcpu, and AccessSpreader uses these two to
// optimally spread accesses among a predetermined number of stripes.
//
// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is
// substantially less than the cost of a cache miss. This means that we
// can effectively use it to reduce cache line ping-pong on striped data
// structures such as IndexedMemPool or statistics counters.
//
// Because CacheLocality looks at all of the cache levels, it can be
// used for different levels of optimization. AccessSpreader(2) does
// per-chip spreading on a dual socket system. AccessSpreader(numCpus)
// does perfect per-cpu spreading. AccessSpreader(numCpus / 2) does
// perfect L1 spreading in a system with hyperthreading enabled.

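// A minimal usage sketch (illustrative only, not part of this header; the
// names kStripes, Stripe, increment, and read are hypothetical): spreading
// increments of a striped counter so that threads running on different
// cores or sockets tend to touch different cache lines.
//
//   constexpr size_t kStripes = 4;
//   struct alignas(hardware_destructive_interference_size) Stripe {
//     std::atomic<uint64_t> value{0};
//   };
//   std::array<Stripe, kStripes> counters;
//
//   void increment() {
//     auto stripe = AccessSpreader<>::current(kStripes); // < kStripes
//     counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
//   }
//
//   uint64_t read() {
//     uint64_t sum = 0;
//     for (auto& s : counters) {
//       sum += s.value.load(std::memory_order_relaxed);
//     }
//     return sum;
//   }
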
struct CacheLocality {
  /// 1 more than the maximum locality index present in the system, or
  /// the number of cpus if no locality information is available.
  size_t numCpus;

  /// The number of caches present at each cache level (0 is closest to
  /// the cpu); this is the number of stripes needed to avoid sharing a
  /// cache at that level.
  std::vector<size_t> numCachesByLevel;

  /// Maps a cpu (as returned by getcpu) to a locality index in the range
  /// 0..numCpus-1, where nearby indices are more likely to share caches.
  std::vector<size_t> localityIndexByCpu;

  /// Returns the best CacheLocality information available for the
  /// current system, cached for fast access.  The Atom template
  /// parameter allows a deterministic implementation to be injected
  /// during testing.
  template <template <typename> class Atom = std::atomic>
  static const CacheLocality& system();

  /// Reads CacheLocality information from a tree structured like the
  /// sysfs filesystem.  The mapping function is applied to each sysfs
  /// path that needs to be queried and should return the first line of
  /// that file (or an empty string if it is absent), which allows a fake
  /// tree to be substituted during testing.
  static CacheLocality readFromSysfsTree(
      const std::function<std::string(std::string)>& mapping);

  /// Reads CacheLocality information from the real sysfs filesystem.
  static CacheLocality readFromSysfs();

  /// Returns a usable (but not necessarily accurate) CacheLocality with
  /// the given number of cpus and one cache per cpu.
  static CacheLocality uniform(size_t numCpus);
};
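
// A sketch of how readFromSysfsTree might be driven (illustrative only; it
// assumes the mapping callback returns the first line of the named file, or
// an empty string when the file is missing, and it needs <fstream>, which
// this header does not include):
//
//   CacheLocality locality = CacheLocality::readFromSysfsTree(
//       [](std::string path) -> std::string {
//         std::ifstream in(path); // tests may point this at a fake tree
//         std::string line;
//         std::getline(in, line);
//         return line; // empty when the file could not be opened
//       });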

/// Knows how to derive a function pointer to the VDSO implementation of
/// getcpu(2), if available.
struct Getcpu {
  /// Function pointer to a function with the same signature as getcpu(2).
  typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);

  /// Returns a pointer to the VDSO implementation of getcpu(2) if
  /// available, or nullptr otherwise.  Resolving may be expensive, so
  /// callers should cache the result.
  static Func resolveVdsoFunc();
};
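
// Illustrative sketch of resolving and caching the VDSO getcpu entry point
// (not part of this header; the variable name is hypothetical):
//
//   static const Getcpu::Func vdsoGetcpu = Getcpu::resolveVdsoFunc();
//   unsigned cpu = 0;
//   if (vdsoGetcpu != nullptr && vdsoGetcpu(&cpu, nullptr, nullptr) == 0) {
//     // cpu now holds the calling thread's current CPU.
//   }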

#ifdef FOLLY_TLS
template <template <typename> class Atom>
struct SequentialThreadId {
  /// Returns the id assigned to the current thread, assigning a new
  /// nonzero id on first use.
  static unsigned get() {
    auto rv = currentId;
    if (UNLIKELY(rv == 0)) {
      rv = currentId = ++prevId;
    }
    return rv;
  }

 private:
  static Atom<unsigned> prevId;

  static FOLLY_TLS unsigned currentId;
};

template <template <typename> class Atom>
Atom<unsigned> SequentialThreadId<Atom>::prevId(0);

template <template <typename> class Atom>
FOLLY_TLS unsigned SequentialThreadId<Atom>::currentId(0);

// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct SequentialThreadId<std::atomic>;
#endif

struct HashingThreadId {
  static unsigned get() {
    return hash::twang_32from64(getCurrentThreadID());
  }
};

/// A class that lazily binds a unique identifier to a thread.  This is a
/// fallback mechanism for the access spreader when __vdso_getcpu can't be
/// loaded.
template <typename ThreadId>
struct FallbackGetcpu {
  /// Fills the thread id into the cpu and node out params (if they are
  /// non-null), acting like getcpu when a fast-enough getcpu isn't
  /// available or isn't desired.
  static int getcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
    auto id = ThreadId::get();
    if (cpu) {
      *cpu = id;
    }
    if (node) {
      *node = id;
    }
    return 0;
  }
};

#ifdef FOLLY_TLS
typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
#else
typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
#endif

/// AccessSpreader arranges access to a striped data structure so that
/// concurrently executing threads are likely to be accessing different
/// stripes.  It does NOT guarantee uncontended access; the underlying
/// algorithm must already be thread safe, and spreading is merely an
/// optimization that reduces cache line ping-pong.
///
/// When there are fewer stripes than CPUs, the cache sharing boundaries
/// reported by CacheLocality are used to place the stripes, so that (for
/// example) 2 stripes on a dual-socket system each get all of the cores
/// of one socket.
///
/// The Atom template parameter selects the atomic implementation, which
/// allows a deterministic version to be injected during testing.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
  /// Returns the stripe associated with the current CPU.  The returned
  /// value will be < numStripes.
  static size_t current(size_t numStripes) {
    // widthAndCpuToStripe[0] will actually work okay (all zeros), but
    // something's wrong with the caller
    assert(numStripes > 0);

    unsigned cpu;
    getcpuFunc(&cpu, nullptr, nullptr);
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpu % kMaxCpus];
  }

#ifdef FOLLY_TLS
  /// Returns the stripe associated with the current CPU, like current(),
  /// but caches the current CPU and refreshes the cache every so often.
  static size_t cachedCurrent(size_t numStripes) {
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpuCache.cpu()];
  }
#else
  /// Fallback when thread-local storage isn't available.
  static size_t cachedCurrent(size_t numStripes) {
    return current(numStripes);
  }
#endif

 private:
  /// If there are more cpus than this nothing will crash, but there
  /// might be unnecessary sharing.
  enum { kMaxCpus = 128 };

  typedef uint8_t CompactStripe;

  static_assert(
      (kMaxCpus & (kMaxCpus - 1)) == 0,
      "kMaxCpus should be a power of two so modulo is fast");
  static_assert(
      kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
      "stripeByCpu element type isn't wide enough");

  /// Points to the getcpu-like function used to obtain the current cpu.
  /// It is a plain static (rather than a guarded singleton) so that a
  /// valid value can be prearranged by the linker; see the comment above
  /// initialize() below.
  static Getcpu::Func getcpuFunc;

  /// For each stripe width 0..kMaxCpus, maps the cpu (mod kMaxCpus) to
  /// the stripe it should use.
  static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];

  /// Caches the current CPU and refreshes the cache every so often.
  class CpuCache {
   public:
    unsigned cpu() {
      if (UNLIKELY(cachedCpuUses_-- == 0)) {
        unsigned cpu;
        AccessSpreader::getcpuFunc(&cpu, nullptr, nullptr);
        cachedCpu_ = cpu % kMaxCpus;
        cachedCpuUses_ = kMaxCachedCpuUses - 1;
      }
      return cachedCpu_;
    }

   private:
    static constexpr unsigned kMaxCachedCpuUses = 32;

    unsigned cachedCpu_{0};
    unsigned cachedCpuUses_{0};
  };

#ifdef FOLLY_TLS
  static FOLLY_TLS CpuCache cpuCache;
#endif

  static bool initialized;

  /// Returns the best getcpu implementation for Atom.
  static Getcpu::Func pickGetcpuFunc() {
    auto best = Getcpu::resolveVdsoFunc();
    return best ? best : &FallbackGetcpuType::getcpu;
  }

  /// Always claims to be on CPU zero, node zero.
  static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
    if (cpu != nullptr) {
      *cpu = 0;
    }
    if (node != nullptr) {
      *node = 0;
    }
    return 0;
  }

  // The function to call for fast lookup of getcpu is a singleton, as
  // is the precomputed table of locality information. AccessSpreader
  // is used in very tight loops, however (we're trying to race an L1
  // cache miss!), so the normal singleton mechanisms are noticeably
  // expensive. Even a not-taken branch guarding access to getcpuFunc
  // slows AccessSpreader::current from 12 nanos to 14. As a result, we
  // populate the static members with simple (but valid) values that can
  // be filled in by the linker, and then follow up with a normal static
  // initializer call that puts in the proper version. This means that
  // when there are initialization order issues we will just observe a
  // zero stripe. Once a sanitizer gets smart enough to detect this as
  // a race or undefined behavior, we can annotate it.

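  // As a worked example (hypothetical topology, not measured on any real
  // machine): suppose n = 8 cpus whose locality indices group cpus
  // {0,2,4,6} on one socket and {1,3,5,7} on the other, i.e.
  // localityIndexByCpu = {0,4,1,5,2,6,3,7}.  For width = 2 the transform
  // (index * numStripes) / n used below maps indices 0..3 to stripe 0 and
  // indices 4..7 to stripe 1, so widthAndCpuToStripe[2] becomes
  // {0,1,0,1,0,1,0,1}: the two stripes line up with the two sockets
  // instead of interleaving cpus arbitrarily.
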
  static bool initialize() {
    getcpuFunc = pickGetcpuFunc();

    auto& cacheLocality = CacheLocality::system<Atom>();
    auto n = cacheLocality.numCpus;
    for (size_t width = 0; width <= kMaxCpus; ++width) {
      auto numStripes = std::max(size_t{1}, width);
      for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
        auto index = cacheLocality.localityIndexByCpu[cpu];
        assert(index < n);
        // as index goes from 0..n, post-transform value goes from
        // 0..numStripes
        widthAndCpuToStripe[width][cpu] =
            CompactStripe((index * numStripes) / n);
        assert(widthAndCpuToStripe[width][cpu] < numStripes);
      }
      for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
        widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
      }
    }
    return true;
  }
};

template <template <typename> class Atom>
Getcpu::Func AccessSpreader<Atom>::getcpuFunc =
    AccessSpreader<Atom>::degenerateGetcpu;

template <template <typename> class Atom>
typename AccessSpreader<Atom>::CompactStripe
    AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};

#ifdef FOLLY_TLS
template <template <typename> class Atom>
FOLLY_TLS
    typename AccessSpreader<Atom>::CpuCache AccessSpreader<Atom>::cpuCache;
#endif

template <template <typename> class Atom>
bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();

// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct AccessSpreader<std::atomic>;

/**
 * A simple freelist allocator.  Allocates things of size sz, from slabs
 * of size allocSize.  Takes a lock on each allocation/deallocation.
 */
class SimpleAllocator {
  std::mutex m_;
  uint8_t* mem_{nullptr};
  uint8_t* end_{nullptr};
  void* freelist_{nullptr};
  size_t allocSize_;
  size_t sz_;
  std::vector<void*> blocks_;

 public:
  SimpleAllocator(size_t allocSize, size_t sz);
  ~SimpleAllocator();
  void* allocateHard();

  // Inline fast-paths.
  void* allocate() {
    std::lock_guard<std::mutex> g(m_);
    // Freelist allocation.
    if (freelist_) {
      auto mem = freelist_;
      freelist_ = *static_cast<void**>(freelist_);
      return mem;
    }

    // Bump-ptr allocation.
    if (intptr_t(mem_) % 128 == 0) {
      // Avoid allocating pointers that may look like malloc
      // pointers.
      mem_ += std::min(sz_, max_align_v);
    }
    if (mem_ && (mem_ + sz_ <= end_)) {
      auto mem = mem_;
      mem_ += sz_;

      assert(intptr_t(mem) % 128 != 0);
      return mem;
    }

    return allocateHard();
  }
  void deallocate(void* mem) {
    std::lock_guard<std::mutex> g(m_);
    *static_cast<void**>(mem) = freelist_;
    freelist_ = mem;
  }
};
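
// Usage sketch (illustrative only, not part of this header): a
// SimpleAllocator hands out fixed-size chunks carved from slabs, so repeated
// allocate()/deallocate() cycles stay on the inline fast paths above.
//
//   SimpleAllocator slab(4096, 32); // 32-byte objects from 4 KB slabs
//   void* a = slab.allocate();      // first call sets up a slab (allocateHard)
//   void* b = slab.allocate();      // later calls use the bump pointer
//   slab.deallocate(a);             // pushes a onto the freelist
//   void* c = slab.allocate();      // freelist path: c == a
//
// Note that allocate() deliberately skips addresses that are 0 mod 128, so
// chunks it returns can always be told apart from cacheline-aligned memory
// obtained from aligned_malloc (see CoreRawAllocator::deallocate below).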

/**
 * An allocator that can be used with CacheLocality to allocate
 * core-local memory.
 *
 * There is nothing special about the memory itself (it is not bound to
 * NUMA nodes or anything), but the allocator guarantees that memory
 * allocated from one stripe only shares cache lines with other
 * allocations from the same stripe.  This lets multiple users of
 * CacheLocality allocate in smaller-than-cacheline increments without
 * causing additional false sharing.
 *
 * Note that allocation and deallocation take a per-size-class lock.
 */
template <size_t Stripes>
class CoreRawAllocator {
 public:
  class Allocator {
    static constexpr size_t AllocSize{4096};

    uint8_t sizeClass(size_t size) {
      if (size <= 8) {
        return 0;
      } else if (size <= 16) {
        return 1;
      } else if (size <= 32) {
        return 2;
      } else if (size <= 64) {
        return 3;
      } else { // punt to malloc.
        return 4;
      }
    }

    std::array<SimpleAllocator, 4> allocators_{
        {{AllocSize, 8}, {AllocSize, 16}, {AllocSize, 32}, {AllocSize, 64}}};

   public:
    void* allocate(size_t size) {
      auto cl = sizeClass(size);
      if (cl == 4) {
        // Align to a cacheline
        size = size + (hardware_destructive_interference_size - 1);
        size &= ~size_t(hardware_destructive_interference_size - 1);
        void* mem =
            aligned_malloc(size, hardware_destructive_interference_size);
        if (!mem) {
          throw_exception<std::bad_alloc>();
        }
        return mem;
      }
      return allocators_[cl].allocate();
    }
    void deallocate(void* mem, size_t = 0) {
      if (!mem) {
        return;
      }

      // See if it came from this allocator or malloc.
      if (intptr_t(mem) % 128 != 0) {
        auto addr =
            reinterpret_cast<void*>(intptr_t(mem) & ~intptr_t(AllocSize - 1));
        auto allocator = *static_cast<SimpleAllocator**>(addr);
        allocator->deallocate(mem);
      } else {
        aligned_free(mem);
      }
    }
  };

  Allocator* get(size_t stripe) {
    assert(stripe < Stripes);
    return &allocators_[stripe];
  }

 private:
  Allocator allocators_[Stripes];
};
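
// Worked example of the discrimination in Allocator::deallocate (the
// addresses are hypothetical): a SimpleAllocator chunk such as
// 0x7f0000001048 is never a multiple of 128, because SimpleAllocator skips
// those offsets, so masking it with ~(AllocSize - 1) = ~0xfff recovers the
// slab base 0x7f0000001000, where the slab's owning SimpleAllocator* is
// stored (by allocateHard(), defined in CacheLocality.cpp).  A pointer from
// aligned_malloc, by contrast, is aligned to
// hardware_destructive_interference_size, so on platforms where that value
// is a multiple of 128 it fails the `% 128 != 0` test and is released with
// aligned_free instead.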

template <typename T, size_t Stripes>
CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
getCoreAllocator(size_t stripe) {
  // We cannot make sure that the allocator will be destroyed after
  // all the objects allocated with it, so we leak it.
  static Indestructible<CoreRawAllocator<Stripes>> allocator;
  return CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>(
      *allocator->get(stripe));
}
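
// Usage sketch (illustrative only; the container and stripe count below are
// hypothetical): obtain an allocator for the calling thread's stripe and use
// it with a standard container so that the container's small allocations
// stay on stripe-local cache lines.
//
//   constexpr size_t kStripes = 8;
//   auto alloc = getCoreAllocator<int, kStripes>(
//       AccessSpreader<>::cachedCurrent(kStripes));
//   std::vector<int, decltype(alloc)> v(alloc);
//   v.push_back(42); // elements come from the stripe-local slabs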

} // namespace folly