CacheLocalityBenchmark.cpp
/*
 * Copyright 2016-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/concurrency/CacheLocality.h>

#include <memory>
#include <thread>
#include <unordered_map>

#include <glog/logging.h>

#include <folly/Benchmark.h>

using namespace folly;
#define DECLARE_SPREADER_TAG(tag, locality, func)      \
  namespace {                                          \
  template <typename dummy>                            \
  struct tag {};                                       \
  }                                                    \
  namespace folly {                                    \
  template <>                                          \
  const CacheLocality& CacheLocality::system<tag>() {  \
    static auto* inst = new CacheLocality(locality);   \
    return *inst;                                      \
  }                                                    \
  template <>                                          \
  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
    return func;                                       \
  }                                                    \
  template struct AccessSpreader<tag>;                 \
  }
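
// Each DECLARE_SPREADER_TAG use below defines a tag type whose AccessSpreader
// is pinned to the named getcpu fallback, so the contention benchmarks can
// compare the stripe-selection strategies described in the results comment
// further down.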

DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)
DECLARE_SPREADER_TAG(
    PthreadSelfTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<HashingThreadId>::getcpu)

// Allow us to run cachedCurrent() in the benchmark function easily.
namespace {
template <typename dummy>
struct CachedCurrentTag {};
} // namespace
namespace folly {
template <>
size_t AccessSpreader<CachedCurrentTag>::current(size_t numStripes) {
  return AccessSpreader::cachedCurrent(numStripes);
}
} // namespace folly
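
// The specialization above matters because contentionAtWidth() below always
// calls AccessSpreader<Tag>::current(stripes); routing CachedCurrentTag's
// current() to cachedCurrent() lets the same templated harness benchmark the
// cached lookup path (see contentionAtWidthCached()).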

BENCHMARK(AccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}

BENCHMARK(CachedAccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(16);
    folly::doNotOptimizeAway(x);
  }
}

BENCHMARK(BaselineAtomicIncrement, iters) {
  std::atomic<int> value(0);
  for (unsigned long i = 0; i < iters; ++i) {
    ++value;
    folly::doNotOptimizeAway(value);
  }
}

BENCHMARK(CachedAccessSpreaderAtomicIncrement, iters) {
  std::array<std::atomic<int>, 64> values;
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(64);
    ++values[x];
    folly::doNotOptimizeAway(values[x]);
  }
}

// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2GHz. Surprisingly,
// if we don't separate the counters onto unique 128 byte stripes the
// 1_stripe and 2_stripe results are identical, even though the L3 is
// claimed to have 64 byte cache lines.
//
// Getcpu refers to the vdso getcpu implementation. ThreadLocal refers
// to execution using SequentialThreadId, the fallback if the vdso
// getcpu isn't available. PthreadSelf hashes the value returned from
// pthread_self() as a fallback-fallback for systems that don't have
// thread-local support.
//
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so since stripe selection takes 12 nanos the atomic increment in
// the L1 is ~17 nanos. At width 8_stripe_0_work the line is expected
// to ping-pong almost every operation, since the loops have the same
// duration. Widths 4 and 2 have the same behavior, but each tour of the
// cache line is 4 and 8 cores long, respectively. These all suggest a
// lower bound of 60 nanos for intra-chip handoff and increment between
// the L1s.
//
// With 420 nanos of busywork per contended increment, the system can
// hide all of the latency of a tour of length 4, but not quite one of
// length 8. I was a bit surprised at how much worse the non-striped
// version got. It seems that the inter-chip traffic also interferes
// with the L1-only localWork.load(). When the local work is doubled
// to about 1 microsecond we see that the inter-chip contention is still
// very important, but subdivisions on the same chip don't matter.
//
// sudo nice -n -20 buck-out/gen/folly/test/cache_locality_test
//     --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/concurrency/test/CacheLocalityBenchmark.cpp  relative time/iter iters/s
// ============================================================================
// AccessSpreaderUse                                           11.51ns   86.87M
// CachedAccessSpreaderUse                                      1.98ns  490.03M
// BaselineAtomicIncrement                                     10.37ns   96.43M
// CachedAccessSpreaderAtomicIncrement                         11.43ns   87.50M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_0_work)                   993.13ns    1.01M
// contentionAtWidthGetcpu(2_stripe_0_work)                   551.45ns    1.81M
// contentionAtWidthGetcpu(4_stripe_0_work)                   302.36ns    3.31M
// contentionAtWidthGetcpu(8_stripe_0_work)                   156.57ns    6.39M
// contentionAtWidthGetcpu(16_stripe_0_work)                   81.34ns   12.29M
// contentionAtWidthGetcpu(32_stripe_0_work)                   37.90ns   26.39M
// contentionAtWidthGetcpu(64_stripe_0_work)                   36.02ns   27.76M
// contentionAtWidthCached(2_stripe_0_work)                   310.64ns    3.22M
// contentionAtWidthCached(4_stripe_0_work)                   180.41ns    5.54M
// contentionAtWidthCached(8_stripe_0_work)                    87.84ns   11.38M
// contentionAtWidthCached(16_stripe_0_work)                   45.04ns   22.20M
// contentionAtWidthCached(32_stripe_0_work)                   19.92ns   50.20M
// contentionAtWidthCached(64_stripe_0_work)                   19.21ns   52.06M
// contentionAtWidthThreadLocal(2_stripe_0_work)              321.14ns    3.11M
// contentionAtWidthThreadLocal(4_stripe_0_work)              244.41ns    4.09M
// contentionAtWidthThreadLocal(8_stripe_0_work)              103.47ns    9.66M
// contentionAtWidthThreadLocal(16_stripe_0_work)              79.82ns   12.53M
// contentionAtWidthThreadLocal(32_stripe_0_work)              20.41ns   49.01M
// contentionAtWidthThreadLocal(64_stripe_0_work)              22.13ns   45.18M
// contentionAtWidthPthreadSelf(2_stripe_0_work)              373.46ns    2.68M
// contentionAtWidthPthreadSelf(4_stripe_0_work)              208.18ns    4.80M
// contentionAtWidthPthreadSelf(8_stripe_0_work)              105.99ns    9.43M
// contentionAtWidthPthreadSelf(16_stripe_0_work)             105.67ns    9.46M
// contentionAtWidthPthreadSelf(32_stripe_0_work)              76.01ns   13.16M
// contentionAtWidthPthreadSelf(64_stripe_0_work)              76.04ns   13.15M
// atomicIncrBaseline(local_incr_0_work)                       13.43ns   74.47M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_500_work)                   1.76us  567.20K
// contentionAtWidthGetcpu(2_stripe_500_work)                   1.16us  863.67K
// contentionAtWidthGetcpu(4_stripe_500_work)                 604.74ns    1.65M
// contentionAtWidthGetcpu(8_stripe_500_work)                 524.16ns    1.91M
// contentionAtWidthGetcpu(16_stripe_500_work)                478.92ns    2.09M
// contentionAtWidthGetcpu(32_stripe_500_work)                480.64ns    2.08M
// atomicIncrBaseline(local_incr_500_work)                    395.17ns    2.53M
// ----------------------------------------------------------------------------
// contentionAtWidthGetcpu(1_stripe_1000_work)                  2.16us  462.06K
// contentionAtWidthGetcpu(2_stripe_1000_work)                  1.31us  764.80K
// contentionAtWidthGetcpu(4_stripe_1000_work)                895.33ns    1.12M
// contentionAtWidthGetcpu(8_stripe_1000_work)                833.98ns    1.20M
// contentionAtWidthGetcpu(16_stripe_1000_work)               765.10ns    1.31M
// contentionAtWidthGetcpu(32_stripe_1000_work)               646.85ns    1.55M
// atomicIncrBaseline(local_incr_1000_work)                   656.15ns    1.52M
// ============================================================================
template <template <typename> class Tag>
static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // while in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128 byte boundaries for Ivy Bridge. The
  // extra indirection adds 1 or 2 nanos.
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);

  // if we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize. Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.
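
  // Concretely, with numThreads == 32 as set above, integer division gives
  // numThreads / 15 == 2 and numThreads / 5 == 6, so the dummy-thread wrap in
  // the loop below kicks in right after the 2nd and the 6th benchmark threads
  // have been created.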

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

      ready++;
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));

    if (threads.size() == numThreads / 15 ||
        threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread t([&]() {
          auto x = AccessSpreader<Tag>::current(stripes);
          folly::doNotOptimizeAway(x);
        });
        t.join();
      }
    }
  }

  while (ready < numThreads) {
    std::this_thread::yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

static void
atomicIncrBaseline(size_t iters, size_t work, size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}

static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<std::atomic>(iters, stripes, work);
}

static void
contentionAtWidthThreadLocal(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}

static void
contentionAtWidthPthreadSelf(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}

static void contentionAtWidthCached(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<CachedCurrentTag>(iters, stripes, work);
}
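
// BENCHMARK_NAMED_PARAM(fn, name, args...) registers a benchmark reported as
// "fn(name)" and forwards the remaining arguments after the iteration count,
// which is how the stripe and work counts below map onto the
// (iters, stripes, work) parameters above and onto the row names in the
// sample output.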

BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 2_stripe_0_work, 2, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 8_stripe_0_work, 8, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 32_stripe_0_work, 32, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 64_stripe_0_work, 64, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_500_work, 1, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_500_work, 2, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_500_work, 4, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_500_work, 8, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_500_work, 16, 500)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_500_work, 32, 500)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_1000_work, 1, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 2_stripe_1000_work, 2, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_1000_work, 4, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 8_stripe_1000_work, 8, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_1000_work, 16, 1000)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 32_stripe_1000_work, 32, 1000)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}