#include <type_traits>
#include <unordered_map>

// ... (inside struct CacheLocality)

/// Returns the best CacheLocality information available for the current
/// system.  The Atom template parameter exists so that a deterministic
/// implementation can be injected during testing.
template <template <typename> class Atom = std::atomic>
static const CacheLocality& system();

// ...

struct Getcpu {
  /// Function pointer to a function with the same signature as getcpu(2).
  typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);

  /// Returns the VDSO implementation of getcpu(2), if available.
  static Func resolveVdsoFunc();
};
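// Illustrative sketch (not part of the original header): a caller would
// typically resolve the VDSO getcpu once, keep the resulting Getcpu::Func,
// and fall back to a software implementation (see FallbackGetcpu below)
// when resolveVdsoFunc() returns nullptr.
//
//   Getcpu::Func fn = Getcpu::resolveVdsoFunc();
//   unsigned cpu = 0;
//   if (fn != nullptr) {
//     fn(&cpu, nullptr, nullptr); // node and the unused pointer may be null
//   }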
template <template <typename> class Atom>
struct SequentialThreadId {
  /// Returns the thread id assigned to the current thread.
  static unsigned get() {
    auto rv = currentId;
    if (UNLIKELY(rv == 0)) {
      rv = currentId = ++prevId;
    }
    return rv;
  }

 private:
  static Atom<unsigned> prevId;

  static FOLLY_TLS unsigned currentId;
};

template <template <typename> class Atom>
Atom<unsigned> SequentialThreadId<Atom>::prevId(0);

template <template <typename> class Atom>
FOLLY_TLS unsigned SequentialThreadId<Atom>::currentId(0);

// Suppress this instantiation in other translation units; it is
// instantiated in CacheLocality.cpp.
extern template struct SequentialThreadId<std::atomic>;

struct HashingThreadId {
  static unsigned get() {
    return hash::twang_32from64(getCurrentThreadID());
  }
};

/// A fallback mechanism for the access spreader when a fast getcpu(2)
/// implementation cannot be loaded: lazily binds a unique identifier to
/// each thread and reports it as both cpu and node.
template <typename ThreadId>
struct FallbackGetcpu {
  /// Fills the thread id into the cpu and node out params (if non-null).
  static int getcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
    auto id = ThreadId::get();
    if (cpu) {
      *cpu = id;
    }
    if (node) {
      *node = id;
    }
    return 0;
  }
};
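// Illustrative sketch (not part of the original header): either thread-id
// provider above can be plugged into FallbackGetcpu; the result has the
// same signature as getcpu(2), so it can stand in for the VDSO version.
// Which provider backs FallbackGetcpuType depends on whether thread-local
// storage is available.
//
//   using Fallback = FallbackGetcpu<HashingThreadId>;
//   unsigned cpu = 0;
//   unsigned node = 0;
//   Fallback::getcpu(&cpu, &node, nullptr); // cpu == node == per-thread id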
template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
  /// Returns the stripe associated with the current CPU.  The returned
  /// value will be < numStripes.
  static size_t current(size_t numStripes) {
    // widthAndCpuToStripe[0] will actually work okay (all zeros), but
    // something's wrong with the caller.
    assert(numStripes > 0);
    unsigned cpu;
    getcpuFunc(&cpu, nullptr, nullptr);
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpu % kMaxCpus];
  }

#ifdef FOLLY_TLS
  /// Same as current(), but caches the current CPU and refreshes the
  /// cache every so often.
  static size_t cachedCurrent(size_t numStripes) {
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpuCache.cpu()];
  }
#else
  static size_t cachedCurrent(size_t numStripes) {
    return current(numStripes);
  }
#endif

 private:
  /// If there are more cpus than this nothing will crash, but there
  /// might be unnecessary sharing.
  enum { kMaxCpus = 128 };

  typedef uint8_t CompactStripe;

  static_assert(
      (kMaxCpus & (kMaxCpus - 1)) == 0,
      "kMaxCpus should be a power of two so modulo is fast");
  static_assert(
      kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
      "stripeByCpu element type isn't wide enough");

  static Getcpu::Func getcpuFunc;

  /// Maps (width, cpu mod kMaxCpus) to a stripe; the entire array is filled
  /// in so lookups never need a bounds check.
  static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];

  /// Caches the current CPU and refreshes the cache every so often.
  class CpuCache {
   public:
    unsigned cpu() {
      if (UNLIKELY(cachedCpuUses_-- == 0)) {
        unsigned cpu;
        getcpuFunc(&cpu, nullptr, nullptr);
        cachedCpu_ = cpu % kMaxCpus;
        cachedCpuUses_ = kMaxCachedCpuUses - 1;
      }
      return cachedCpu_;
    }

   private:
    static constexpr unsigned kMaxCachedCpuUses = 32;

    unsigned cachedCpu_{0};
    unsigned cachedCpuUses_{0};
  };

  static FOLLY_TLS CpuCache cpuCache;
  /// Returns the best getcpu implementation for Atom.
  static Getcpu::Func pickGetcpuFunc();

  /// Always claims to be on CPU zero, node zero.
  static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
    if (cpu != nullptr) {
      *cpu = 0;
    }
    if (node != nullptr) {
      *node = 0;
    }
    return 0;
  }

  static bool initialized;

  /// Fills in getcpuFunc and the stripe lookup table.
  static bool initialize() {
    getcpuFunc = pickGetcpuFunc();

    auto& cacheLocality = CacheLocality::system<Atom>();
    auto n = cacheLocality.numCpus;
    for (size_t width = 0; width <= kMaxCpus; ++width) {
      auto numStripes = std::max(size_t{1}, width);
      for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
        auto index = cacheLocality.localityIndexByCpu[cpu];
        // As index goes from 0..n, the post-transform value goes from
        // 0..numStripes.
        widthAndCpuToStripe[width][cpu] =
            CompactStripe((index * numStripes) / n);
        assert(widthAndCpuToStripe[width][cpu] < numStripes);
      }
      for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
        widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
      }
    }
    return true;
  }
};
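// Illustrative note (not part of the original header): the table fill above
// maps a locality index to a stripe with (index * numStripes) / n.  For
// example, with n = 8 CPUs and numStripes = 3, locality indexes 0..7 map to
// stripes 0,0,0,1,1,1,2,2: nearby CPUs share a stripe, and the result is
// always < numStripes.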
template <template <typename> class Atom>
Getcpu::Func AccessSpreader<Atom>::getcpuFunc =
    AccessSpreader<Atom>::degenerateGetcpu;

template <template <typename> class Atom>
typename AccessSpreader<Atom>::CompactStripe
    AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};

#ifdef FOLLY_TLS
template <template <typename> class Atom>
FOLLY_TLS
    typename AccessSpreader<Atom>::CpuCache AccessSpreader<Atom>::cpuCache;
#endif

template <template <typename> class Atom>
bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();
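// Illustrative sketch (not part of the original header): AccessSpreader is
// typically used to pick one of several striped shards so that threads on
// nearby CPUs land on the same shard.  kNumShards and the shard type below
// are hypothetical.
//
//   constexpr size_t kNumShards = 8;
//   std::array<std::atomic<uint64_t>, kNumShards> shards;
//
//   void increment() {
//     auto stripe = AccessSpreader<>::current(kNumShards); // < kNumShards
//     shards[stripe].fetch_add(1, std::memory_order_relaxed);
//   }
//
// AccessSpreader<>::cachedCurrent(kNumShards) behaves the same but amortizes
// the getcpu call across several lookups.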
/// A simple freelist allocator.  Hands out blocks of a fixed size sz_ carved
/// from larger slabs, taking a lock on each allocation / deallocation.
class SimpleAllocator {
  std::mutex m_;
  uint8_t* mem_{nullptr};
  uint8_t* end_{nullptr};
  void* freelist_{nullptr};
  size_t sz_;

 public:
  void* allocateHard();

  // Inline fast-paths.
  void* allocate() {
    std::lock_guard<std::mutex> g(m_);
    // Freelist allocation.
    if (freelist_) {
      auto mem = freelist_;
      freelist_ = *static_cast<void**>(freelist_);
      return mem;
    }
    // Bump allocation within the current slab.
    if (mem_) {
      auto mem = mem_;
      mem_ += sz_;
      if (intptr_t(mem_) % 128 == 0) {
        mem_ += sz_; // never hand out pointers that look malloc-aligned
      }
      if (mem_ && (mem_ + sz_ <= end_)) {
        assert(intptr_t(mem) % 128 != 0);
        return mem;
      }
    }
    return allocateHard();
  }

  void deallocate(void* mem) {
    std::lock_guard<std::mutex> g(m_);
    *static_cast<void**>(mem) = freelist_;
    freelist_ = mem;
  }
};
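// Illustrative sketch (not part of the original header): each SimpleAllocator
// serves a single block size, pairing a slab size with that block size, and
// recycles freed blocks through its freelist.
//
//   SimpleAllocator alloc(4096, 32); // 4 KiB slabs, 32-byte blocks
//   void* p = alloc.allocate();
//   alloc.deallocate(p);             // p goes back onto the freelist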
/// An allocator usable with CacheLocality for core-local memory: one
/// Allocator per stripe, each backed by SimpleAllocators for a few small
/// size classes.
template <size_t Stripes>
class CoreRawAllocator {
 public:
  class Allocator {
    static constexpr size_t AllocSize{4096};

    uint8_t sizeClass(size_t size) {
      if (size <= 8) { return 0; }
      else if (size <= 16) { return 1; }
      else if (size <= 32) { return 2; }
      else if (size <= 64) { return 3; }
      else { return 4; } // punt to malloc
    }

    std::array<SimpleAllocator, 4> allocators_{
        {{AllocSize, 8}, {AllocSize, 16}, {AllocSize, 32}, {AllocSize, 64}}};

   public:
    void* allocate(size_t size) {
      auto cl = sizeClass(size);
      if (cl == 4) {
        // Oversized: fall back to a cacheline-aligned malloc.
        auto mem =
            aligned_malloc(size, hardware_destructive_interference_size);
        if (!mem) {
          throw_exception<std::bad_alloc>();
        }
        return mem;
      }
      return allocators_[cl].allocate();
    }

    void deallocate(void* mem, size_t = 0) {
      // SimpleAllocator blocks are never 128-byte aligned, which is how they
      // are told apart from aligned_malloc blocks.
      if (mem && intptr_t(mem) % 128 != 0) {
        auto slab =
            reinterpret_cast<void*>(intptr_t(mem) & ~intptr_t(AllocSize - 1));
        (*static_cast<SimpleAllocator**>(slab))->deallocate(mem);
      } else if (mem) {
        aligned_free(mem);
      }
    }
  };

  Allocator* get(size_t stripe) {
    assert(stripe < Stripes);
    return &allocators_[stripe];
  }

 private:
  Allocator allocators_[Stripes];
};
template <typename T, size_t Stripes>
CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
getCoreAllocator(size_t stripe) {
  // The underlying allocator is a leaky singleton: it cannot be guaranteed
  // to outlive everything allocated from it, so it is never destroyed.
  static Indestructible<CoreRawAllocator<Stripes>> allocator;
  return CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>(
      *allocator->get(stripe));
}
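// Illustrative sketch (not part of the original header): getCoreAllocator
// wraps one per-stripe Allocator in an STL-compatible adaptor, so a standard
// container can keep its storage in memory belonging to a particular stripe.
//
//   auto stripe = AccessSpreader<>::current(32);
//   auto alloc = getCoreAllocator<int, 32>(stripe);
//   std::vector<int, decltype(alloc)> v(alloc);
//   v.push_back(42);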