SmallLocksBenchmark.cpp
/*
 * Copyright 2016-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include <cmath>
#include <condition_variable>
#include <numeric>
#include <thread>
#include <vector>

#include <google/base/spinlock.h>

#include <folly/Benchmark.h>
#include <folly/SharedMutex.h>
#include <folly/synchronization/DistributedMutex.h>
#include <folly/synchronization/SmallLocks.h>

/* A "work cycle" is just an additional nop loop iteration.
 * A smaller number of work cycles will result in more contention,
 * which is what we're trying to measure. The relative ratio of
 * locked to unlocked cycles will simulate how big critical sections
 * are in production code. (An example invocation is sketched after
 * the flag definitions below.)
 */
DEFINE_int32(work, 100, "Number of work cycles");
DEFINE_int32(unlocked_work, 1000, "Number of unlocked work cycles");
DEFINE_int32(
    threads,
    std::thread::hardware_concurrency(),
    "Number of threads for fairness test");
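
// Illustrative invocation (the flag values here are arbitrary); per the
// comment above, a smaller --work value produces more lock contention:
//   ./small_locks_benchmark --bm_min_iters=100000 --work=10 --unlocked_work=5000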

static void burn(size_t n) {
  for (size_t i = 0; i < n; ++i) {
    folly::doNotOptimizeAway(i);
  }
}

namespace {

template <typename Mutex>
std::unique_lock<Mutex> lock(Mutex& mutex) {
  return std::unique_lock<Mutex>{mutex};
}
template <typename Mutex, typename Other>
void unlock(Mutex&, Other) {}
auto lock(folly::DistributedMutex& mutex) {
  return mutex.lock();
}
template <typename State>
void unlock(folly::DistributedMutex& mutex, State state) {
  mutex.unlock(std::move(state));
}
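
// Sketch (hypothetical helper, not used by the benchmarks): the overloads
// above give every mutex type a uniform lock()/unlock() surface. For ordinary
// mutexes, lock() returns a std::unique_lock that is moved into the no-op
// unlock() and released when that parameter is destroyed; for
// folly::DistributedMutex, lock() returns a proxy that must be handed back.
template <typename MutexType>
void lockUnlockOnce(MutexType& mtx) {
  auto state = lock(mtx); // unique_lock or DistributedMutex proxy
  unlock(mtx, std::move(state)); // releases the lock in either case
}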

struct SimpleBarrier {
  explicit SimpleBarrier(int count) : count_(count) {}
  void wait() {
    // we spin for a bit to try and get the kernel to schedule threads on
    // different cores
    for (auto i = 0; i < 100000; ++i) {
      folly::doNotOptimizeAway(i);
    }
    num_.fetch_add(1);
    while (num_.load() != count_) {
    }
  }

 private:
  std::atomic<int> num_{0};
  const int count_;
};
} // namespace

template <typename Lock>
class InitLock {
  Lock lock_;

 public:
  InitLock() {
    lock_.init();
  }
  void lock() {
    lock_.lock();
  }
  void unlock() {
    lock_.unlock();
  }
};

class GoogleSpinLockAdapter {
 public:
  void lock() {
    lock_.Lock();
  }
  void unlock() {
    lock_.Unlock();
  }

 private:
  SpinLock lock_;
};

template <typename Lock>
static void runContended(size_t numOps, size_t numThreads) {
  // Suspend benchmark timing during setup; dismissed while the workers run.
  folly::BenchmarkSuspender braces;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  size_t threadgroups = totalthreads / numThreads;
  // 128-byte padding keeps each lock on its own cache lines to avoid false
  // sharing between thread groups.
  struct lockstruct {
    char padding1[128];
    Lock mutex;
    char padding2[128];
    long value = 1;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[128];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      lockstruct* mutex = &locks[t % threadgroups];
      runbarrier.wait();
      for (size_t op = 0; op < numOps; op += 1) {
        auto state = lock(mutex->mutex);
        burn(FLAGS_work);
        mutex->value++;
        unlock(mutex->mutex, std::move(state));
        burn(FLAGS_unlocked_work);
      }
    });
  }

  runbarrier.wait();
  braces.dismissing([&] {
    for (auto& thr : threads) {
      thr.join();
    }
  });
}

template <typename Lock>
static void runFairness() {
  size_t numThreads = FLAGS_threads;
  size_t totalthreads = std::thread::hardware_concurrency();
  if (totalthreads < numThreads) {
    totalthreads = numThreads;
  }
  long threadgroups = totalthreads / numThreads;
  struct lockstruct {
    char padding1[128];
    Lock lock;
  };

  auto locks =
      (struct lockstruct*)calloc(threadgroups, sizeof(struct lockstruct));

  char padding3[64];
  (void)padding3;
  std::vector<std::thread> threads(totalthreads);

  std::atomic<bool> stop{false};

  std::mutex rlock;
  std::vector<long> results;
  std::vector<std::chrono::microseconds> maxes;

  std::vector<std::chrono::microseconds> aqTime;
  std::vector<unsigned long> aqTimeSq;

  SimpleBarrier runbarrier(totalthreads + 1);

  for (size_t t = 0; t < totalthreads; ++t) {
    threads[t] = std::thread([&, t] {
      lockstruct* mutex = &locks[t % threadgroups];
      long value = 0;
      std::chrono::microseconds max(0);
      std::chrono::microseconds time(0);
      unsigned long timeSq(0);
      runbarrier.wait();
      while (!stop) {
        std::chrono::steady_clock::time_point prelock =
            std::chrono::steady_clock::now();
        auto state = lock(mutex->lock);
        std::chrono::steady_clock::time_point postlock =
            std::chrono::steady_clock::now();
        auto diff = std::chrono::duration_cast<std::chrono::microseconds>(
            postlock - prelock);
        time += diff;
        timeSq += diff.count() * diff.count();
        if (diff > max) {
          max = diff;
        }
        burn(FLAGS_work);
        value++;
        unlock(mutex->lock, std::move(state));
        burn(FLAGS_unlocked_work);
      }
      {
        std::lock_guard<std::mutex> g(rlock);
        results.push_back(value);
        maxes.push_back(max);
        aqTime.push_back(time);
        aqTimeSq.push_back(timeSq);
      }
    });
  }

  runbarrier.wait();
  /* sleep override */
  std::this_thread::sleep_for(std::chrono::seconds(2));
  stop = true;

  for (auto& thr : threads) {
    thr.join();
  }

  // Calculate some stats
  unsigned long sum = std::accumulate(results.begin(), results.end(), 0.0);
  double m = sum / results.size();

  double accum = 0.0;
  std::for_each(results.begin(), results.end(), [&](const double d) {
    accum += (d - m) * (d - m);
  });
  double stdev = std::sqrt(accum / (results.size() - 1));
  std::chrono::microseconds mx = *std::max_element(maxes.begin(), maxes.end());
  std::chrono::microseconds agAqTime = std::accumulate(
      aqTime.begin(), aqTime.end(), std::chrono::microseconds(0));
  unsigned long agAqTimeSq =
      std::accumulate(aqTimeSq.begin(), aqTimeSq.end(), 0);
  std::chrono::microseconds mean = agAqTime / sum;
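  // Sample variance of the per-acquisition wait times, computed from the
  // aggregate sum and sum of squares with n taken as the total number of
  // acquisitions: Var = (n * sum(x^2) - (sum x)^2) / (n * (n - 1)).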
  double variance = (sum * agAqTimeSq - (agAqTime.count() * agAqTime.count())) /
      sum / (sum - 1);
  double stddev2 = std::sqrt(variance);

  printf("Sum: %li Mean: %.0f stddev: %.0f\n", sum, m, stdev);
  printf(
      "Lock time stats in us: mean %li stddev %.0f max %li\n",
      mean.count(),
      stddev2,
      mx.count());
}

template <typename Mutex>
void runUncontended(std::size_t iters) {
  auto&& mutex = Mutex{};
  for (auto i = std::size_t{0}; i < iters; ++i) {
    auto state = lock(mutex);
    unlock(mutex, std::move(state));
  }
}

BENCHMARK(StdMutexUncontendedBenchmark, iters) {
  runUncontended<std::mutex>(iters);
}

BENCHMARK(GoogleSpinUncontendedBenchmark, iters) {
  runUncontended<GoogleSpinLockAdapter>(iters);
}

BENCHMARK(MicroSpinLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::MicroSpinLock>>(iters);
}

BENCHMARK(PicoSpinLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::PicoSpinLock<std::uint16_t>>>(iters);
}

BENCHMARK(MicroLockUncontendedBenchmark, iters) {
  runUncontended<InitLock<folly::MicroLock>>(iters);
}

BENCHMARK(SharedMutexUncontendedBenchmark, iters) {
  runUncontended<folly::SharedMutex>(iters);
}

BENCHMARK(DistributedMutexUncontendedBenchmark, iters) {
  runUncontended<folly::DistributedMutex>(iters);
}

BENCHMARK(AtomicFetchAddUncontendedBenchmark, iters) {
  auto&& atomic = std::atomic<uint64_t>{0};
  while (iters--) {
    folly::doNotOptimizeAway(atomic.fetch_add(1));
  }
}

struct VirtualBase {
  virtual void foo() = 0;
  virtual ~VirtualBase() {}
};

struct VirtualImpl : VirtualBase {
  void foo() override { /* noop */
  }
  ~VirtualImpl() override {}
};

#ifndef __clang__
__attribute__((noinline, noclone)) VirtualBase* makeVirtual() {
  return new VirtualImpl();
}

BENCHMARK(VirtualFunctionCall, iters) {
  VirtualBase* vb = makeVirtual();
  while (iters--) {
    vb->foo();
  }
  delete vb;
}
#endif

BENCHMARK_DRAW_LINE();

#define BENCH_BASE(...) FB_VA_GLUE(BENCHMARK_NAMED_PARAM, (__VA_ARGS__))
#define BENCH_REL(...) FB_VA_GLUE(BENCHMARK_RELATIVE_NAMED_PARAM, (__VA_ARGS__))

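// Note: BENCH_BASE registers a baseline data point; each subsequent BENCH_REL
// with the same thread count is reported relative to that baseline (see the
// "relative" column in the sample output at the end of this file).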
static void std_mutex(size_t numOps, size_t numThreads) {
  runContended<std::mutex>(numOps, numThreads);
}
static void google_spin(size_t numOps, size_t numThreads) {
  runContended<GoogleSpinLockAdapter>(numOps, numThreads);
}
static void folly_microspin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::MicroSpinLock>>(numOps, numThreads);
}
static void folly_picospin(size_t numOps, size_t numThreads) {
  runContended<InitLock<folly::PicoSpinLock<uint16_t>>>(numOps, numThreads);
}
static void folly_microlock(size_t numOps, size_t numThreads) {
  runContended<folly::MicroLock>(numOps, numThreads);
}
static void folly_sharedmutex(size_t numOps, size_t numThreads) {
  runContended<folly::SharedMutex>(numOps, numThreads);
}
static void folly_distributedmutex(size_t numOps, size_t numThreads) {
  runContended<folly::DistributedMutex>(numOps, numThreads);
}

BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 1thread, 1)
BENCH_REL(google_spin, 1thread, 1)
BENCH_REL(folly_microspin, 1thread, 1)
BENCH_REL(folly_picospin, 1thread, 1)
BENCH_REL(folly_microlock, 1thread, 1)
BENCH_REL(folly_sharedmutex, 1thread, 1)
BENCH_REL(folly_distributedmutex, 1thread, 1)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 2thread, 2)
BENCH_REL(google_spin, 2thread, 2)
BENCH_REL(folly_microspin, 2thread, 2)
BENCH_REL(folly_picospin, 2thread, 2)
BENCH_REL(folly_microlock, 2thread, 2)
BENCH_REL(folly_sharedmutex, 2thread, 2)
BENCH_REL(folly_distributedmutex, 2thread, 2)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 4thread, 4)
BENCH_REL(google_spin, 4thread, 4)
BENCH_REL(folly_microspin, 4thread, 4)
BENCH_REL(folly_picospin, 4thread, 4)
BENCH_REL(folly_microlock, 4thread, 4)
BENCH_REL(folly_sharedmutex, 4thread, 4)
BENCH_REL(folly_distributedmutex, 4thread, 4)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 8thread, 8)
BENCH_REL(google_spin, 8thread, 8)
BENCH_REL(folly_microspin, 8thread, 8)
BENCH_REL(folly_picospin, 8thread, 8)
BENCH_REL(folly_microlock, 8thread, 8)
BENCH_REL(folly_sharedmutex, 8thread, 8)
BENCH_REL(folly_distributedmutex, 8thread, 8)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 16thread, 16)
BENCH_REL(google_spin, 16thread, 16)
BENCH_REL(folly_microspin, 16thread, 16)
BENCH_REL(folly_picospin, 16thread, 16)
BENCH_REL(folly_microlock, 16thread, 16)
BENCH_REL(folly_sharedmutex, 16thread, 16)
BENCH_REL(folly_distributedmutex, 16thread, 16)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 32thread, 32)
BENCH_REL(google_spin, 32thread, 32)
BENCH_REL(folly_microspin, 32thread, 32)
BENCH_REL(folly_picospin, 32thread, 32)
BENCH_REL(folly_microlock, 32thread, 32)
BENCH_REL(folly_sharedmutex, 32thread, 32)
BENCH_REL(folly_distributedmutex, 32thread, 32)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 64thread, 64)
BENCH_REL(google_spin, 64thread, 64)
BENCH_REL(folly_microspin, 64thread, 64)
BENCH_REL(folly_picospin, 64thread, 64)
BENCH_REL(folly_microlock, 64thread, 64)
BENCH_REL(folly_sharedmutex, 64thread, 64)
BENCH_REL(folly_distributedmutex, 64thread, 64)
BENCHMARK_DRAW_LINE();
BENCH_BASE(std_mutex, 128thread, 128)
BENCH_REL(google_spin, 128thread, 128)
BENCH_REL(folly_microspin, 128thread, 128)
BENCH_REL(folly_picospin, 128thread, 128)
BENCH_REL(folly_microlock, 128thread, 128)
BENCH_REL(folly_sharedmutex, 128thread, 128)
BENCH_REL(folly_distributedmutex, 128thread, 128)

#define FairnessTest(type) \
  {                        \
    printf(#type ": \n");  \
    runFairness<type>();   \
  }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  FairnessTest(std::mutex);
  FairnessTest(GoogleSpinLockAdapter);
  FairnessTest(InitLock<folly::MicroSpinLock>);
  FairnessTest(InitLock<folly::PicoSpinLock<uint16_t>>);
  FairnessTest(InitLock<folly::MicroLock>);
  FairnessTest(folly::SharedMutex);
  FairnessTest(folly::DistributedMutex);

  folly::runBenchmarks();

  return 0;
}

/*
./small_locks_benchmark --bm_min_iters=100000
Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz

std::mutex:
Sum: 3645010 Mean: 65089 stddev: 841
Lock time stats in us: mean 16 stddev 1178 max 21361
GoogleSpinLockAdapter:
Sum: 4329140 Mean: 77306 stddev: 2338
Lock time stats in us: mean 10 stddev 16 max 19860
InitLock<folly::MicroSpinLock>:
Sum: 3513882 Mean: 62747 stddev: 27713
Lock time stats in us: mean 31 stddev 1222 max 211624
InitLock<folly::PicoSpinLock<uint16_t>>:
Sum: 2182472 Mean: 38972 stddev: 41789
Lock time stats in us: mean 49 stddev 1967 max 228875
InitLock<folly::MicroLock>:
Sum: 1868601 Mean: 33367 stddev: 4836
Lock time stats in us: mean 48 stddev 2298 max 12235
folly::SharedMutex:
Sum: 2037742 Mean: 36388 stddev: 18204
Lock time stats in us: mean 53 stddev 2107 max 132469
folly::DistributedMutex:
Sum: 6793764 Mean: 121317 stddev: 20791
Lock time stats in us: mean 15 stddev 8 max 55696
============================================================================
folly/synchronization/test/SmallLocksBenchmark.cpp  relative  time/iter  iters/s
============================================================================
StdMutexUncontendedBenchmark 16.73ns 59.77M
GoogleSpinUncontendedBenchmark 11.26ns 88.80M
MicroSpinLockUncontendedBenchmark 10.06ns 99.44M
PicoSpinLockUncontendedBenchmark 11.25ns 88.89M
MicroLockUncontendedBenchmark 19.20ns 52.09M
SharedMutexUncontendedBenchmark 19.45ns 51.40M
DistributedMutexUncontendedBenchmark 17.02ns 58.75M
AtomicFetchAddUncontendedBenchmark 5.47ns 182.91M
----------------------------------------------------------------------------
----------------------------------------------------------------------------
std_mutex(1thread) 802.21ns 1.25M
google_spin(1thread) 109.81% 730.52ns 1.37M
folly_microspin(1thread) 119.16% 673.22ns 1.49M
folly_picospin(1thread) 119.02% 673.99ns 1.48M
folly_microlock(1thread) 131.67% 609.28ns 1.64M
folly_sharedmutex(1thread) 118.41% 677.46ns 1.48M
folly_distributedmutex(1thread) 100.27% 800.02ns 1.25M
----------------------------------------------------------------------------
std_mutex(2thread) 1.30us 769.21K
google_spin(2thread) 129.59% 1.00us 996.85K
folly_microspin(2thread) 158.13% 822.13ns 1.22M
folly_picospin(2thread) 150.43% 864.23ns 1.16M
folly_microlock(2thread) 144.94% 896.92ns 1.11M
folly_sharedmutex(2thread) 120.36% 1.08us 925.83K
folly_distributedmutex(2thread) 112.98% 1.15us 869.08K
----------------------------------------------------------------------------
std_mutex(4thread) 2.36us 424.08K
google_spin(4thread) 120.20% 1.96us 509.75K
folly_microspin(4thread) 109.07% 2.16us 462.53K
folly_picospin(4thread) 113.37% 2.08us 480.78K
folly_microlock(4thread) 83.88% 2.81us 355.71K
folly_sharedmutex(4thread) 90.47% 2.61us 383.65K
folly_distributedmutex(4thread) 121.82% 1.94us 516.63K
----------------------------------------------------------------------------
std_mutex(8thread) 5.39us 185.64K
google_spin(8thread) 127.72% 4.22us 237.10K
folly_microspin(8thread) 106.70% 5.05us 198.08K
folly_picospin(8thread) 88.02% 6.12us 163.41K
folly_microlock(8thread) 79.78% 6.75us 148.11K
folly_sharedmutex(8thread) 78.25% 6.88us 145.26K
folly_distributedmutex(8thread) 162.74% 3.31us 302.12K
----------------------------------------------------------------------------
std_mutex(16thread) 11.74us 85.16K
google_spin(16thread) 109.91% 10.68us 93.60K
folly_microspin(16thread) 103.93% 11.30us 88.50K
folly_picospin(16thread) 50.36% 23.32us 42.89K
folly_microlock(16thread) 55.85% 21.03us 47.56K
folly_sharedmutex(16thread) 64.27% 18.27us 54.74K
folly_distributedmutex(16thread) 181.32% 6.48us 154.41K
----------------------------------------------------------------------------
std_mutex(32thread) 31.56us 31.68K
google_spin(32thread) 95.17% 33.17us 30.15K
folly_microspin(32thread) 100.60% 31.38us 31.87K
folly_picospin(32thread) 31.30% 100.84us 9.92K
folly_microlock(32thread) 55.04% 57.35us 17.44K
folly_sharedmutex(32thread) 65.09% 48.49us 20.62K
folly_distributedmutex(32thread) 177.39% 17.79us 56.20K
----------------------------------------------------------------------------
std_mutex(64thread) 39.90us 25.06K
google_spin(64thread) 110.92% 35.98us 27.80K
folly_microspin(64thread) 105.98% 37.65us 26.56K
folly_picospin(64thread) 33.03% 120.80us 8.28K
folly_microlock(64thread) 58.02% 68.78us 14.54K
folly_sharedmutex(64thread) 68.43% 58.32us 17.15K
folly_distributedmutex(64thread) 200.38% 19.91us 50.22K
----------------------------------------------------------------------------
std_mutex(128thread) 75.67us 13.21K
google_spin(128thread) 116.14% 65.16us 15.35K
folly_microspin(128thread) 100.82% 75.06us 13.32K
folly_picospin(128thread) 44.99% 168.21us 5.94K
folly_microlock(128thread) 53.93% 140.31us 7.13K
folly_sharedmutex(128thread) 64.37% 117.55us 8.51K
folly_distributedmutex(128thread) 185.71% 40.75us 24.54K
============================================================================
*/