#include <folly/concurrency/CacheLocality.h>

#include <array>
#include <atomic>
#include <cassert>
#include <thread>
#include <unordered_map>
#include <vector>

#include <glog/logging.h>

#include <folly/Benchmark.h>

using namespace folly;
#define DECLARE_SPREADER_TAG(tag, locality, func)      \
  namespace {                                          \
  template <typename dummy>                            \
  struct tag {};                                       \
  }                                                    \
  namespace folly {                                    \
  template <>                                          \
  const CacheLocality& CacheLocality::system<tag>() {  \
    static auto* inst = new CacheLocality(locality);   \
    return *inst;                                      \
  }                                                    \
  template <>                                          \
  Getcpu::Func AccessSpreader<tag>::pickGetcpuFunc() { \
    return func;                                       \
  }                                                    \
  template struct AccessSpreader<tag>;                 \
  }

DECLARE_SPREADER_TAG(
    ThreadLocalTag,
    CacheLocality::system<>(),
    FallbackGetcpu<SequentialThreadId>::getcpu)
DECLARE_SPREADER_TAG(
    PthreadSelfTag,
    CacheLocality::system<>(),
    FallbackGetcpu<HashingThreadId>::getcpu)
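// ThreadLocalTag and PthreadSelfTag pin AccessSpreader to the fallback
// stripe-picking strategies (sequential thread ids and a hash of
// pthread_self, respectively) so their cost can be compared against the
// default getcpu-based spreader in the benchmarks below.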
template <typename dummy>
struct CachedCurrentTag {};
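// Tag for benchmarking the cachedCurrent() code path (used by
// contentionAtWidthCached below).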
BENCHMARK(AccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(x);
  }
}
BENCHMARK(CachedAccessSpreaderUse, iters) {
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(16);
    folly::doNotOptimizeAway(x);
  }
}
BENCHMARK(BaselineAtomicIncrement, iters) {
  std::atomic<int> value;
  for (unsigned long i = 0; i < iters; ++i) {
    ++value;
    folly::doNotOptimizeAway(value);
  }
}
BENCHMARK(CachedAccessSpreaderAtomicIncrement, iters) {
  std::array<std::atomic<int>, 64> values;
  for (unsigned long i = 0; i < iters; ++i) {
    auto x = AccessSpreader<>::cachedCurrent(64);
    ++values[x];
    folly::doNotOptimizeAway(values[x]);
  }
}
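// The four benchmarks above run single-threaded and isolate per-call
// overhead: a plain stripe lookup (current vs. cachedCurrent), a bare
// atomic increment, and an increment spread across 64 counters.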
template <template <typename> class Tag>
static void contentionAtWidth(size_t iters, size_t stripes, size_t work) {
  const size_t counterAlignment = 128;
  const size_t numThreads = 32;

  folly::BenchmarkSuspender braces;

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  std::vector<char> raw(counterAlignment * stripes);
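  // Each stripe gets its own counterAlignment-byte (128-byte) slot inside
  // raw, i.e. two cache lines of padding, so counters in different stripes
  // never share a cache line or an adjacent-line prefetch pair.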
  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      auto counters = std::vector<std::atomic<size_t>*>(stripes);
      for (size_t i = 0; i < stripes; ++i) {
        counters[i] =
            new (raw.data() + counterAlignment * i) std::atomic<size_t>();
      }

      ready++;
      while (!go.load()) {
        std::this_thread::yield();
      }
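      // Hot loop: each iteration increments the stripe that
      // AccessSpreader<Tag> maps this thread to, then performs `work`
      // uncontended local loads to vary the ratio of shared to private work.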
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        ++*(counters[AccessSpreader<Tag>::current(stripes)]);
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));
    if (threads.size() == numThreads / 15 ||
        threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread t([&]() {
          auto x = AccessSpreader<Tag>::current(stripes);
          folly::doNotOptimizeAway(x);
        });
        t.join();
      }
    }
  }
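  // The short-lived dummy threads above deliberately burn thread ids so the
  // SequentialThreadId fallback is not measured in its unrealistically
  // perfect case of consecutive ids mapping one-to-one onto stripes.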
  while (ready < numThreads) {
    std::this_thread::yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
static void atomicIncrBaseline(
    size_t iters, size_t work, size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&]() {
      while (!go.load()) {
        std::this_thread::yield();
      }
      std::atomic<size_t> localCounter(0);
      std::atomic<int> localWork(0);
      for (size_t i = iters; i > 0; --i) {
        localCounter++;
        for (size_t j = work; j > 0; --j) {
          auto x = localWork.load();
          folly::doNotOptimizeAway(x);
        }
      }
    }));
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
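// Thin non-template wrappers so each stripe-picking strategy registers as a
// separately named benchmark.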
static void contentionAtWidthGetcpu(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<std::atomic>(iters, stripes, work);
}
static void contentionAtWidthThreadLocal(
    size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<ThreadLocalTag>(iters, stripes, work);
}
static void contentionAtWidthPthreadSelf(
    size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<PthreadSelfTag>(iters, stripes, work);
}
static void contentionAtWidthCached(size_t iters, size_t stripes, size_t work) {
  contentionAtWidth<CachedCurrentTag>(iters, stripes, work);
}
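// Illustrative registrations (a sketch; the stripe counts, work amounts, and
// parameter names below are assumptions, not taken verbatim from the original
// benchmark list). Each wrapper is registered at a chosen stripe count and
// per-iteration work amount via folly's BENCHMARK_NAMED_PARAM.
BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 1_stripe_0_work, 1, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthGetcpu, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthThreadLocal, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthPthreadSelf, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(contentionAtWidthCached, 4_stripe_0_work, 4, 0)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
// Select a subset at run time with folly's standard benchmark flags,
// e.g. --bm_regex=contention.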
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  folly::runBenchmarks();
  return 0;
}