CacheLocality.cpp
/*
 * Copyright 2013-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/concurrency/CacheLocality.h>

#ifndef _MSC_VER
#define _GNU_SOURCE 1 // for RTLD_NOLOAD
#include <dlfcn.h>
#endif
#include <fstream>

#include <folly/Conv.h>
#include <folly/Exception.h>
#include <folly/FileUtil.h>
#include <folly/Format.h>
#include <folly/ScopeGuard.h>

namespace folly {

///////////// CacheLocality

/// Returns the best real CacheLocality information available
static CacheLocality getSystemLocalityInfo() {
  if (kIsLinux) {
    try {
      return CacheLocality::readFromSysfs();
    } catch (...) {
      // keep trying
    }
  }

  long numCpus = sysconf(_SC_NPROCESSORS_CONF);
  if (numCpus <= 0) {
    // This shouldn't happen, but if it does we should try to keep
    // going. We are probably not going to be able to parse /sys on
    // this box either (although we will try), which means we are going
    // to fall back to the SequentialThreadId splitter. On my 16 core
    // (x hyperthreading) dev box 16 stripes is enough to get pretty good
    // contention avoidance with SequentialThreadId, and there is little
    // improvement from going from 32 to 64. This default gives us some
    // wiggle room.
    numCpus = 32;
  }
  return CacheLocality::uniform(size_t(numCpus));
}
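
// Note: the CacheLocality below is allocated with `new` and never deleted,
// the usual leaked-singleton idiom, so the returned reference stays valid
// even for threads that are still running during static destruction.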
template <>
const CacheLocality& CacheLocality::system<std::atomic>() {
  static auto* cache = new CacheLocality(getSystemLocalityInfo());
  return *cache;
}

// Each level of cache has sharing sets, which are the set of cpus
// that share a common cache at that level. These are available in a
// hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
// for example). They are also available in a human-readable list form,
// as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list. The list
// is a comma-separated list of numbers and ranges, where the ranges are
// a pair of decimal numbers separated by a '-'.
//
// To sort the cpus for optimum locality we don't really need to parse
// the sharing sets, we just need a unique representative from the
// equivalence class. The smallest value works fine, and happens to be
// the first decimal number in the file. We load all of the equivalence
// class information from all of the cpu*/index* directories, order the
// cpus first by increasing last-level cache equivalence class, then by
// the smaller caches. Finally, we break ties with the cpu number itself.
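//
// For example, a shared_cpu_list of "0-3" names cpus 0 through 3, and
// "0,2,4-6" names cpus 0, 2, 4, 5, and 6 (illustrative values, not taken
// from any particular machine).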

/// Returns the first decimal number in the line, or throws a
/// std::runtime_error if the line does not start with a decimal number
/// terminated by ',', '-', '\n', or the end of the string
static size_t parseLeadingNumber(const std::string& line) {
  auto raw = line.c_str();
  char* end;
  unsigned long val = strtoul(raw, &end, 10);
  if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
    throw std::runtime_error(
        to<std::string>("error parsing list '", line, "'").c_str());
  }
  return val;
}
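
// For instance, parseLeadingNumber("2-5\n") yields 2, while
// parseLeadingNumber("cpu0") throws because no leading digits are found.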

CacheLocality CacheLocality::readFromSysfsTree(
    const std::function<std::string(std::string)>& mapping) {
  // number of equivalence classes per level
  std::vector<size_t> numCachesByLevel;

  // the list of cache equivalence classes, where equivalence classes
  // are named by the smallest cpu in the class
  std::vector<std::vector<size_t>> equivClassesByCpu;

  std::vector<size_t> cpus;

  while (true) {
    auto cpu = cpus.size();
    std::vector<size_t> levels;
    for (size_t index = 0;; ++index) {
      auto dir =
          sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
      auto cacheType = mapping(dir + "type");
      auto equivStr = mapping(dir + "shared_cpu_list");
      if (cacheType.size() == 0 || equivStr.size() == 0) {
        // no more caches
        break;
      }
      if (cacheType[0] == 'I') {
        // cacheType is one of "Data", "Instruction", or "Unified";
        // skip the instruction cache
        continue;
      }
      auto equiv = parseLeadingNumber(equivStr);
      auto level = levels.size();
      levels.push_back(equiv);

      if (equiv == cpu) {
        // we only want to count the equiv classes once, so we do it when
        // we first encounter them
        while (numCachesByLevel.size() <= level) {
          numCachesByLevel.push_back(0);
        }
        numCachesByLevel[level]++;
      }
    }

    if (levels.size() == 0) {
      // no levels at all for this cpu, we must be done
      break;
    }
    equivClassesByCpu.emplace_back(std::move(levels));
    cpus.push_back(cpu);
  }

  if (cpus.size() == 0) {
    throw std::runtime_error("unable to load cache sharing info");
  }

  std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
    // sort first by equiv class of cache with highest index,
    // direction doesn't matter. If different cpus have
    // different numbers of caches then this code might produce
    // a sub-optimal ordering, but it won't crash
    auto& lhsEquiv = equivClassesByCpu[lhs];
    auto& rhsEquiv = equivClassesByCpu[rhs];
    for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
         i >= 0;
         --i) {
      auto idx = size_t(i);
      if (lhsEquiv[idx] != rhsEquiv[idx]) {
        return lhsEquiv[idx] < rhsEquiv[idx];
      }
    }

    // break ties deterministically by cpu
    return lhs < rhs;
  });
168 
169  // the cpus are now sorted by locality, with neighboring entries closer
170  // to each other than entries that are far away. For striping we want
171  // the inverse map, since we are starting with the cpu
172  std::vector<size_t> indexes(cpus.size());
173  for (size_t i = 0; i < cpus.size(); ++i) {
174  indexes[cpus[i]] = i;
175  }
176 
177  return CacheLocality{
178  cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
179 }
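
// Because every sysfs read goes through the injected `mapping` function,
// readFromSysfsTree can be exercised without real hardware. A minimal
// sketch, using made-up sysfs contents for a 2-cpu box with one shared
// data cache:
//
//   auto fake = CacheLocality::readFromSysfsTree([](std::string name) {
//     // cpus 0 and 1 each expose a single data cache shared by "0-1";
//     // every other path reads as empty, which terminates the scans
//     bool cpu01 = name.find("cpu0/") != std::string::npos ||
//         name.find("cpu1/") != std::string::npos;
//     if (cpu01 && name.find("index0/type") != std::string::npos) {
//       return std::string("Data");
//     }
//     if (cpu01 && name.find("index0/shared_cpu_list") != std::string::npos) {
//       return std::string("0-1");
//     }
//     return std::string();
//   });
//
// The result describes numCpus == 2 with one cache level.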

CacheLocality CacheLocality::readFromSysfs() {
  return readFromSysfsTree([](std::string name) {
    std::ifstream xi(name.c_str());
    std::string rv;
    std::getline(xi, rv);
    return rv;
  });
}

CacheLocality CacheLocality::uniform(size_t numCpus) {
  CacheLocality rv;

  rv.numCpus = numCpus;

  // one cache shared by all cpus
  rv.numCachesByLevel.push_back(numCpus);

  // no permutations in locality index mapping
  for (size_t cpu = 0; cpu < numCpus; ++cpu) {
    rv.localityIndexByCpu.push_back(cpu);
  }

  return rv;
}
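
// e.g. CacheLocality::uniform(4) yields numCpus == 4, a single cache
// level, and the identity mapping localityIndexByCpu == {0, 1, 2, 3}.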

///////////// Getcpu

Getcpu::Func Getcpu::resolveVdsoFunc() {
#if !FOLLY_HAVE_LINUX_VDSO
  return nullptr;
#else
  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
  if (h == nullptr) {
    return nullptr;
  }

  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
  if (func == nullptr) {
    // technically a null result could either be a failure or a successful
    // lookup of a symbol with the null value, but the second can't actually
    // happen for this symbol. No point holding the handle forever if
    // we don't need the code
    dlclose(h);
  }

  return func;
#endif
}
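
// Usage sketch (Getcpu::Func mirrors the getcpu(2) signature):
//
//   if (auto getcpu = Getcpu::resolveVdsoFunc()) {
//     unsigned cpu;
//     if (getcpu(&cpu, nullptr, nullptr) == 0) {
//       // `cpu` now holds the id of the core this thread is running on
//     }
//   }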

#ifdef FOLLY_TLS
template struct SequentialThreadId<std::atomic>;
#endif

///////////// AccessSpreader

template struct AccessSpreader<std::atomic>;

SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
    : allocSize_{allocSize}, sz_(sz) {}

SimpleAllocator::~SimpleAllocator() {
  std::lock_guard<std::mutex> g(m_);
  for (auto& block : blocks_) {
    folly::aligned_free(block);
  }
}
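
// Each slab below is allocated with aligned_malloc(allocSize_, allocSize_),
// i.e. aligned to its own size, and begins with a pointer back to this
// SimpleAllocator, so the owning allocator can be recovered from any
// pointer handed out of the slab by masking off its low bits. This is the
// slow path; the inline fast path in the header is expected to fall back
// to allocateHard() when the current slab is exhausted.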
void* SimpleAllocator::allocateHard() {
  // Allocate a new slab.
  mem_ = static_cast<uint8_t*>(aligned_malloc(allocSize_, allocSize_));
  if (!mem_) {
    throw_exception<std::bad_alloc>();
  }
  end_ = mem_ + allocSize_;
  blocks_.push_back(mem_);

  // Install a pointer to ourselves as the allocator.
  *reinterpret_cast<SimpleAllocator**>(mem_) = this;
  static_assert(max_align_v >= sizeof(SimpleAllocator*), "alignment too small");
  mem_ += std::min(sz_, max_align_v);

  // New allocation.
  auto mem = mem_;
  mem_ += sz_;
  assert(intptr_t(mem) % 128 != 0);
  return mem;
}

} // namespace folly