proxygen
CacheLocalityBenchmark.cpp File Reference
#include <folly/concurrency/CacheLocality.h>
#include <memory>
#include <thread>
#include <unordered_map>
#include <glog/logging.h>
#include <folly/Benchmark.h>


Namespaces

 folly
 

Macros

#define DECLARE_SPREADER_TAG(tag, locality, func)
 

Functions

 BENCHMARK (AccessSpreaderUse, iters)
 
 BENCHMARK (CachedAccessSpreaderUse, iters)
 
 BENCHMARK (BaselineAtomicIncrement, iters)
 
 BENCHMARK (CachedAccessSpreaderAtomicIncrement, iters)
 
template<template< typename > class Tag>
static void contentionAtWidth (size_t iters, size_t stripes, size_t work)
 
static void atomicIncrBaseline (size_t iters, size_t work, size_t numThreads=32)
 
static void contentionAtWidthGetcpu (size_t iters, size_t stripes, size_t work)
 
static void contentionAtWidthThreadLocal (size_t iters, size_t stripes, size_t work)
 
static void contentionAtWidthPthreadSelf (size_t iters, size_t stripes, size_t work)
 
static void contentionAtWidthCached (size_t iters, size_t stripes, size_t work)
 
 BENCHMARK_DRAW_LINE ()
 
int main (int argc, char **argv)
 

Macro Definition Documentation

#define DECLARE_SPREADER_TAG(tag, locality, func)
Value:
namespace { \
template <typename dummy> \
struct tag {}; \
} \
namespace folly { \
template <> \
const CacheLocality& CacheLocality::system<tag>() { \
  static auto* inst = new CacheLocality(locality); \
  return *inst; \
} \
template <> \
Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
  return func; \
} \
template struct AccessSpreader<tag>; \
}

Definition at line 29 of file CacheLocalityBenchmark.cpp.

Referenced by TEST().
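For context, the macro is typically invoked once per tag that is to be benchmarked. The following is a hypothetical invocation sketch; the tag name matches the ThreadLocalTag used by the contention benchmarks below, but the getcpu functor shown here is an assumption, not taken from the documentation above.

// Hypothetical usage sketch (assumes <folly/concurrency/CacheLocality.h> and the
// macro above are in scope). Declares a tag whose AccessSpreader resolves the
// current CPU via a thread-id-based fallback rather than the real getcpu().
DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)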

Function Documentation

static void atomicIncrBaseline (size_t iters, size_t work, size_t numThreads = 32)

Definition at line 252 of file CacheLocalityBenchmark.cpp.

References folly::BenchmarkSuspender::dismiss(), folly::doNotOptimizeAway(), i, threads, and folly::fibers::yield().

Referenced by contentionAtWidthCached().

{
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
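The folly::BenchmarkSuspender used above keeps the thread-creation setup out of the measured time: timing is suspended while the suspender is live and resumes at dismiss(). A minimal stand-alone sketch of that idiom follows; the benchmark name is made up and not part of this file.

#include <folly/Benchmark.h>
#include <vector>

// Sketch of the BenchmarkSuspender idiom; exampleWithSetup is hypothetical.
BENCHMARK(exampleWithSetup, iters) {
  folly::BenchmarkSuspender braces; // clock paused while the suspender is active
  std::vector<int> data(1000, 1);   // untimed setup
  braces.dismiss();                 // resume timing for the measured loop
  for (unsigned long i = 0; i < iters; ++i) {
    folly::doNotOptimizeAway(data[i % data.size()]);
  }
}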
BENCHMARK (AccessSpreaderUse, iters)

Definition at line 68 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::current(), folly::doNotOptimizeAway(), and i.

{
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}
BENCHMARK (CachedAccessSpreaderUse, iters)

Definition at line 75 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::cachedCurrent(), folly::doNotOptimizeAway(), and i.

{
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(16);
    folly::doNotOptimizeAway(x);
  }
}
BENCHMARK (BaselineAtomicIncrement, iters)

Definition at line 82 of file CacheLocalityBenchmark.cpp.

References folly::doNotOptimizeAway(), i, and folly::value().

{
  std::atomic<int> value;
  for (unsigned long i = 0; i < iters; ++i) {
    ++value;
    folly::doNotOptimizeAway(value);
  }
}
BENCHMARK (CachedAccessSpreaderAtomicIncrement, iters)

Definition at line 90 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::cachedCurrent(), folly::doNotOptimizeAway(), i, values(), and x.

{
  std::array<std::atomic<int>, 64> values;
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(64);
    ++values[x];
    folly::doNotOptimizeAway(values[x]);
  }
}
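This benchmark exercises the idiom AccessSpreader is built for: pick a stripe from the calling thread's current CPU so that concurrent writers mostly touch different counters. As an application-style illustration only (StripedCounter is not part of folly or of this file), the same idea might look like the sketch below.

#include <folly/concurrency/CacheLocality.h>
#include <array>
#include <atomic>
#include <cstddef>

// Illustrative striped counter; for full effect each stripe would also be
// padded to its own cache line (see contentionAtWidth() below).
class StripedCounter {
 public:
  void add(size_t n) {
    // map the current CPU to one of kStripes slots
    auto stripe = folly::AccessSpreader<>::cachedCurrent(kStripes);
    counters_[stripe].fetch_add(n, std::memory_order_relaxed);
  }
  size_t read() const {
    size_t sum = 0;
    for (const auto& c : counters_) {
      sum += c.load(std::memory_order_relaxed);
    }
    return sum;
  }

 private:
  static constexpr size_t kStripes = 64;
  std::array<std::atomic<size_t>, kStripes> counters_{};
};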
BENCHMARK_DRAW_LINE ( )
template<template< typename > class Tag>
static void contentionAtWidth (size_t iters, size_t stripes, size_t work)

Definition at line 182 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::current(), folly::BenchmarkSuspender::dismiss(), folly::doNotOptimizeAway(), i, folly::pushmi::detail::t, threads, and folly::fibers::yield().

{
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // while in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128 byte boundaries for Ivy Bridge. The
  // extra indirection adds 1 or 2 nanos
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);

  // if we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize. Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

      ready++;
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));

    if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread t([&]() {
          auto x = AccessSpreader<Tag>::current(stripes);
          folly::doNotOptimizeAway(x);
        });
        t.join();
      }
    }
  }

  while (ready < numThreads) {
    std::this_thread::yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
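The placement-new into a char buffer strided by counterAlignment is simply a way to give each stripe its own 128-byte block, per the comment about contention on 128-byte boundaries. A simpler sketch of an equivalent layout using alignas follows; it is illustrative only and not taken from the benchmark.

#include <atomic>
#include <cstddef>
#include <vector>

// Each PaddedCounter occupies its own 128-byte block, matching the
// counterAlignment used above. Requires C++17 for over-aligned allocation
// through std::vector.
struct alignas(128) PaddedCounter {
  std::atomic<size_t> value{0};
};

std::vector<PaddedCounter> makeCounters(size_t stripes) {
  return std::vector<PaddedCounter>(stripes); // one padded counter per stripe
}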
static void contentionAtWidthCached (size_t iters, size_t stripes, size_t work)

Definition at line 297 of file CacheLocalityBenchmark.cpp.

References atomicIncrBaseline(), folly::BENCHMARK_DRAW_LINE(), BENCHMARK_NAMED_PARAM, contentionAtWidthGetcpu(), contentionAtWidthPthreadSelf(), and contentionAtWidthThreadLocal().

{
  contentionAtWidth<CachedCurrentTag>(iters, stripes, work);
}
static void contentionAtWidthGetcpu (size_t iters, size_t stripes, size_t work)

Definition at line 283 of file CacheLocalityBenchmark.cpp.

Referenced by contentionAtWidthCached().

{
  contentionAtWidth<std::atomic>(iters, stripes, work);
}
static void contentionAtWidthPthreadSelf (size_t iters, size_t stripes, size_t work)

Definition at line 293 of file CacheLocalityBenchmark.cpp.

Referenced by contentionAtWidthCached().

{
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}
static void contentionAtWidthThreadLocal (size_t iters, size_t stripes, size_t work)

Definition at line 288 of file CacheLocalityBenchmark.cpp.

Referenced by contentionAtWidthCached().

{
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}
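The References list for contentionAtWidthCached() mentions BENCHMARK_NAMED_PARAM and BENCHMARK_DRAW_LINE(), which suggests these wrappers are registered as parameterized benchmarks. A hedged sketch of such a registration follows; the parameter names and the (stripes, work) values are assumptions, not copied from CacheLocalityBenchmark.cpp.

#include <folly/Benchmark.h>

// Assumed registration pattern: each named param forwards (stripes, work) to
// the corresponding wrapper above; atomicIncrBaseline takes only (work).
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)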
int main (int argc, char **argv)

Definition at line 345 of file CacheLocalityBenchmark.cpp.

References folly::runBenchmarks().

{
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}