proxygen
CacheLocalityBenchmark.cpp File Reference
#include <folly/concurrency/CacheLocality.h>
#include <memory>
#include <thread>
#include <unordered_map>
#include <glog/logging.h>
#include <folly/Benchmark.h>


Namespaces

 folly
 

Macros

#define DECLARE_SPREADER_TAG(tag, locality, func)
 

Functions

 BENCHMARK (AccessSpreaderUse, iters)
 
 BENCHMARK (CachedAccessSpreaderUse, iters)
 
 BENCHMARK (BaselineAtomicIncrement, iters)
 
 BENCHMARK (CachedAccessSpreaderAtomicIncrement, iters)
 
template<template< typename > class Tag>
static void contentionAtWidth (size_t iters, size_t stripes, size_t work)
 
static void atomicIncrBaseline (size_t iters, size_t work, size_t numThreads=32)
 
static void contentionAtWidthGetcpu (size_t iters, size_t stripes, size_t work)
 
static void contentionAtWidthThreadLocal (size_t iters, size_t stripes, size_t work)
 
static void contentionAtWidthPthreadSelf (size_t iters, size_t stripes, size_t work)
 
static void contentionAtWidthCached (size_t iters, size_t stripes, size_t work)
 
 BENCHMARK_DRAW_LINE ()
 
int main (int argc, char **argv)
 

Macro Definition Documentation

#define DECLARE_SPREADER_TAG(tag, locality, func)
Value:
namespace { \
template <typename dummy> \
struct tag {}; \
} \
namespace folly { \
template <> \
const CacheLocality& CacheLocality::system<tag>() { \
  static auto* inst = new CacheLocality(locality); \
  return *inst; \
} \
template <> \
Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
  return func; \
} \
template struct AccessSpreader<tag>; \
}

Definition at line 29 of file CacheLocalityBenchmark.cpp.

Referenced by TEST().
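For context, the macro is typically invoked once per tag that is to be benchmarked. The following is a hypothetical invocation sketch; the tag name matches the ThreadLocalTag used by the contention benchmarks below, but the getcpu functor shown here is an assumption, not taken from the documentation above.

// Hypothetical usage sketch (assumes <folly/concurrency/CacheLocality.h> and the
// macro above are in scope). Declares a tag whose AccessSpreader resolves the
// current CPU via a thread-id-based fallback rather than the real getcpu().
DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    folly::FallbackGetcpu<SequentialThreadId<std::atomic>>::getcpu)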

Function Documentation

static void atomicIncrBaseline (size_t iters, size_t work, size_t numThreads = 32)

Definition at line 252 of file CacheLocalityBenchmark.cpp.

References folly::BenchmarkSuspender::dismiss(), folly::doNotOptimizeAway(), i, threads, and folly::fibers::yield().

Referenced by contentionAtWidthCached().

{
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
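The folly::BenchmarkSuspender used above keeps the thread-creation setup out of the measured time: timing is suspended while the suspender is live and resumes at dismiss(). A minimal stand-alone sketch of that idiom follows; the benchmark name is made up and not part of this file.

#include <folly/Benchmark.h>
#include <vector>

// Sketch of the BenchmarkSuspender idiom; exampleWithSetup is hypothetical.
BENCHMARK(exampleWithSetup, iters) {
  folly::BenchmarkSuspender braces; // clock paused while the suspender is active
  std::vector<int> data(1000, 1);   // untimed setup
  braces.dismiss();                 // resume timing for the measured loop
  for (unsigned long i = 0; i < iters; ++i) {
    folly::doNotOptimizeAway(data[i % data.size()]);
  }
}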
BENCHMARK (AccessSpreaderUse, iters)

Definition at line 68 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::current(), folly::doNotOptimizeAway(), and i.

{
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}
BENCHMARK (CachedAccessSpreaderUse, iters)

Definition at line 75 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::cachedCurrent(), folly::doNotOptimizeAway(), and i.

{
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(16);
    folly::doNotOptimizeAway(x);
  }
}
BENCHMARK (BaselineAtomicIncrement, iters)

Definition at line 82 of file CacheLocalityBenchmark.cpp.

References folly::doNotOptimizeAway(), i, and folly::value().

{
  std::atomic<int> value;
  for (unsigned long i = 0; i < iters; ++i) {
    ++value;
    folly::doNotOptimizeAway(value);
  }
}
BENCHMARK (CachedAccessSpreaderAtomicIncrement, iters)

Definition at line 90 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::cachedCurrent(), folly::doNotOptimizeAway(), i, values(), and x.

{
  std::array<std::atomic<int>, 64> values;
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(64);
    ++values[x];
    folly::doNotOptimizeAway(values[x]);
  }
}
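This benchmark exercises the idiom AccessSpreader is built for: pick a stripe from the calling thread's current CPU so that concurrent writers mostly touch different counters. As an application-style illustration only (StripedCounter is not part of folly or of this file), the same idea might look like the sketch below.

#include <folly/concurrency/CacheLocality.h>
#include <array>
#include <atomic>
#include <cstddef>

// Illustrative striped counter; for full effect each stripe would also be
// padded to its own cache line (see contentionAtWidth() below).
class StripedCounter {
 public:
  void add(size_t n) {
    // map the current CPU to one of kStripes slots
    auto stripe = folly::AccessSpreader<>::cachedCurrent(kStripes);
    counters_[stripe].fetch_add(n, std::memory_order_relaxed);
  }
  size_t read() const {
    size_t sum = 0;
    for (const auto& c : counters_) {
      sum += c.load(std::memory_order_relaxed);
    }
    return sum;
  }

 private:
  static constexpr size_t kStripes = 64;
  std::array<std::atomic<size_t>, kStripes> counters_{};
};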
BENCHMARK_DRAW_LINE ( )
template<template< typename > class Tag>
static void contentionAtWidth (size_t iters, size_t stripes, size_t work)

Definition at line 182 of file CacheLocalityBenchmark.cpp.

References folly::AccessSpreader< Atom >::current(), folly::BenchmarkSuspender::dismiss(), folly::doNotOptimizeAway(), i, folly::pushmi::detail::t, threads, and folly::fibers::yield().

{
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // while in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128 byte boundaries for Ivy Bridge. The
  // extra indirection adds 1 or 2 nanos
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);

  // if we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize. Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

      ready++;
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));

    if (threads.size() == numThreads / 15 || threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread t([&]() {
          auto x = AccessSpreader<Tag>::current(stripes);
          folly::doNotOptimizeAway(x);
        });
        t.join();
      }
    }
  }

  while (ready < numThreads) {
    std::this_thread::yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
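The placement-new into a char buffer strided by counterAlignment is simply a way to give each stripe its own 128-byte block, per the comment about contention on 128-byte boundaries. A simpler sketch of an equivalent layout using alignas follows; it is illustrative only and not taken from the benchmark.

#include <atomic>
#include <cstddef>
#include <vector>

// Each PaddedCounter occupies its own 128-byte block, matching the
// counterAlignment used above. Requires C++17 for over-aligned allocation
// through std::vector.
struct alignas(128) PaddedCounter {
  std::atomic<size_t> value{0};
};

std::vector<PaddedCounter> makeCounters(size_t stripes) {
  return std::vector<PaddedCounter>(stripes); // one padded counter per stripe
}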
static void contentionAtWidthCached (size_t iters, size_t stripes, size_t work)

Definition at line 297 of file CacheLocalityBenchmark.cpp.

References atomicIncrBaseline(), folly::BENCHMARK_DRAW_LINE(), BENCHMARK_NAMED_PARAM, contentionAtWidthGetcpu(), contentionAtWidthPthreadSelf(), and contentionAtWidthThreadLocal().

{
  contentionAtWidth<CachedCurrentTag>(iters, stripes, work);
}
static void contentionAtWidthGetcpu (size_t iters, size_t stripes, size_t work)

Definition at line 283 of file CacheLocalityBenchmark.cpp.

Referenced by contentionAtWidthCached().

{
  contentionAtWidth<std::atomic>(iters, stripes, work);
}
static void contentionAtWidthPthreadSelf (size_t iters, size_t stripes, size_t work)

Definition at line 293 of file CacheLocalityBenchmark.cpp.

Referenced by contentionAtWidthCached().

{
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}
static void contentionAtWidthThreadLocal (size_t iters, size_t stripes, size_t work)

Definition at line 288 of file CacheLocalityBenchmark.cpp.

Referenced by contentionAtWidthCached().

{
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}
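The References list for contentionAtWidthCached() mentions BENCHMARK_NAMED_PARAM and BENCHMARK_DRAW_LINE(), which suggests these wrappers are registered as parameterized benchmarks. A hedged sketch of such a registration follows; the parameter names and the (stripes, work) values are assumptions, not copied from CacheLocalityBenchmark.cpp.

#include <folly/Benchmark.h>

// Assumed registration pattern: each named param forwards (stripes, work) to
// the corresponding wrapper above; atomicIncrBaseline takes only (work).
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 16_stripe_0_work, 16, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)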
int main (int argc, char **argv)

Definition at line 345 of file CacheLocalityBenchmark.cpp.

References folly::runBenchmarks().

{
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}