Rcu.h
/*
 * Copyright 2017-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <atomic>
#include <functional>
#include <limits>

#include <folly/Indestructible.h>
#include <folly/Optional.h>

// Implementation of proposed RCU C++ API
// http://open-std.org/JTC1/SC22/WG21/docs/papers/2017/p0566r3.pdf

// Overview

// This file provides a low-overhead mechanism to guarantee ordering
// between operations on shared data. In the simplest usage pattern,
// readers enter a critical section, view some state, and leave the
// critical section, while writers modify shared state and then defer
// some cleanup operations. Proper use of these classes will guarantee
// that a cleanup operation that is deferred during a reader critical
// section will not be executed until after that critical section is
// over.

// Example

// As an example, suppose we have some configuration data that gets
// periodically updated. We need to protect ourselves on *every* read
// path, even if updates are only vanishingly rare, because we don't know
// if some writer will come along and yank the data out from under us.
//
// Here's how that might look without deferral:
//
//   void doSomethingWith(IPAddress host);
//
//   folly::SharedMutex sm;
//   ConfigData* globalConfigData;
//
//   void reader() {
//     while (true) {
//       sm.lock_shared();
//       IPAddress curManagementServer = globalConfigData->managementServerIP;
//       sm.unlock_shared();
//       doSomethingWith(curManagementServer);
//     }
//   }
//
//   void writer() {
//     while (true) {
//       std::this_thread::sleep_for(std::chrono::seconds(60));
//       ConfigData* oldConfigData = globalConfigData;
//       ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//       sm.lock();
//       globalConfigData = newConfigData;
//       sm.unlock();
//       delete oldConfigData;
//     }
//   }
//
// The readers have to lock_shared and unlock_shared every iteration, even
// during the overwhelming majority of the time in which there's no writer
// present. These functions are surprisingly expensive; there's around 30ns of
// overhead per critical section on my machine.
//
// Let's get rid of the locking. The readers and writer may proceed
// concurrently. Here's how this would look:
//
//   void doSomethingWith(IPAddress host);
//
//   std::atomic<ConfigData*> globalConfigData;
//
//   void reader() {
//     while (true) {
//       ConfigData* configData = globalConfigData.load();
//       IPAddress curManagementServer = configData->managementServerIP;
//       doSomethingWith(curManagementServer);
//     }
//   }
//
//   void writer() {
//     while (true) {
//       std::this_thread::sleep_for(std::chrono::seconds(60));
//       ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//       globalConfigData.store(newConfigData);
//       // We can't delete the old ConfigData; we don't know when the readers
//       // will be done with it! I guess we have to leak it...
//     }
//   }
//
// This works and is fast, but we don't ever reclaim the memory we
// allocated for the copy of the data. We need a way for the writer to
// know that no readers observed the old value of the pointer and are
// still using it. Tweaking this slightly, we want a way for the
// writer to say "I want some operation (deleting the old ConfigData)
// to happen, but only after I know that all readers that started
// before I requested the action have finished." The classes in this
// namespace allow this. Here's how we'd express this idea:
//
//   void doSomethingWith(IPAddress host);
//   std::atomic<ConfigData*> globalConfigData;
//
//
//   void reader() {
//     while (true) {
//       IPAddress curManagementServer;
//       {
//         // We're about to do some reads we want to protect; if we read a
//         // pointer, we need to make sure that if the writer comes along and
//         // updates it, the writer's cleanup operation won't happen until we're
//         // done accessing the pointed-to data. We get a guard on the default
//         // domain; as long as it exists, no function subsequently passed to
//         // call()/retire() on that domain will execute.
//         rcu_reader guard;
//         ConfigData* configData = globalConfigData.load();
//         // We created a guard before we read globalConfigData; we know that the
//         // pointer will remain valid until the guard is destroyed.
//         curManagementServer = configData->managementServerIP;
//         // Guard is released here; retired objects may be freed.
//       }
//       doSomethingWith(curManagementServer);
//     }
//   }
//
//   void writer() {
//     while (true) {
//       std::this_thread::sleep_for(std::chrono::seconds(60));
//       ConfigData* oldConfigData = globalConfigData.load();
//       ConfigData* newConfigData = loadConfigDataFromRemoteServer();
//       globalConfigData.store(newConfigData);
//       rcu_retire(oldConfigData);
//       // alternatively, in a blocking manner:
//       //   synchronize_rcu();
//       //   delete oldConfigData;
//     }
//   }
//
// This gets us close to the speed of the second solution, without
// leaking memory. An rcu_reader costs about 4 ns, faster than the
// lock_shared() / unlock_shared() pair in the more traditional
// mutex-based approach from our first attempt, and the writer
// never blocks the readers.

// Notes

// This implementation provides an rcu_domain, and supplies a default
// global domain per the proposed standard API.
//
// rcu_domain:
//   A "universe" of deferred execution. Each rcu_domain has an
//   executor on which deferred functions may execute. Readers obtain
//   tokens from an rcu_domain and release them back to it.
//   rcu_domains should in general be completely separated; it's never
//   correct to pass a token from one domain to another, and
//   rcu_reader objects created on one domain do not prevent functions
//   deferred on other domains from running. It's intended that most
//   callers should only ever use the default, global domain.
//
//   Creation of a domain takes a template tag argument, which
//   defaults to RcuTag. To access different domains, you have to pass a
//   different tag. The global domain is preferred for almost all
//   purposes, unless a different executor is required.
//
//   You should use a custom rcu_domain if you can't avoid sleeping
//   during reader critical sections (because you don't want to block
//   all other users of the domain while you sleep), or you want to change
//   the default executor type.
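//
//   As a minimal sketch of that case (MyRcuTag and Widget are illustrative
//   names, not part of this header): define a tag type, construct a domain
//   for it, and pass that domain explicitly to readers and to rcu_retire():
//
//     struct MyRcuTag;
//     folly::rcu_domain<MyRcuTag> myDomain;  // no explicit executor
//
//     void reader() {
//       folly::rcu_reader_domain<MyRcuTag> guard(&myDomain);
//       // ... reads protected against retirement on myDomain ...
//     }
//
//     void writer(Widget* old) {
//       folly::rcu_retire(old, std::default_delete<Widget>(), &myDomain);
//     }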

// API correctness limitations:
//  - Exceptions:
//    In short, nothing about this is exception safe. Retired functions should
//    not throw exceptions from their destructors, move constructors, or call
//    operators.
//
// Performance limitations:
//  - Blocking:
//    A blocked reader will block invocation of deferred functions until it
//    becomes unblocked. Sleeping while holding an rcu_reader can have bad
//    performance consequences.
//
// API questions you might have:
//  - Nested critical sections:
//    These are fine. The following is explicitly allowed by the standard, up
//    to a nesting level of 100:
//      rcu_reader reader1;
//      doSomeWork();
//      rcu_reader reader2;
//      doSomeMoreWork();
//  - Restrictions on retire()d functions:
//    Any operation is safe from within a retired function's execution: you
//    can retire additional objects or add new call()s to the domain.
//  - rcu_domain destruction:
//    Destruction of a domain assumes prior synchronization: any remaining
//    call() and retire() callbacks are immediately handed to the executor.

// Overheads

// Deferral latency is as low as is practical: the overhead is running
// several global memory barriers on the machine to ensure all readers are gone.
//
// Currently the MPMCQueue used for deferred calls is the bottleneck; a more
// specialized queue could be used if available, since only a single consumer
// reads the queue and could splice all of the items to the executor at once.
//
// synchronize_rcu() call latency is on the order of 10ms. Multiple
// separate threads can share a synchronized period and should scale.
//
// rcu_retire() is a queue push on the order of 150 ns; however, the current
// implementation may synchronize if the retire queue is full, resulting in
// tail latencies of ~10ms.
//
// rcu_reader creation/destruction is ~4ns. By comparison, a
// folly::SharedMutex::lock_shared + unlock_shared pair is ~26ns.

// Hazard pointers vs. RCU:
//
// Hazard pointers protect pointers, generally malloc()d pieces of memory, and
// each hazptr only protects one such block.
//
// RCU protects critical sections: *all* memory is protected as long
// as the critical section is active.
//
// For example, this has implications for linked lists: If you wish to
// free an entire linked list, you simply rcu_retire() each node with
// RCU: readers will see either an entirely valid list, or no list at
// all.
//
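// A minimal sketch of that pattern (Node, next, and head are hypothetical
// names, not part of this header):
//
//   std::atomic<Node*> head;
//
//   void removeAll() {
//     Node* n = head.exchange(nullptr);
//     while (n) {
//       Node* next = n->next;
//       folly::rcu_retire(n);  // n is freed only after concurrent readers finish
//       n = next;
//     }
//   }
//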
// Conversely, with hazptrs, lists are generally walked with
// hand-over-hand hazptrs. Only one node is protected at a time. So
// anywhere in the middle of the list, hazptrs may read NULL and throw
// away all current work. Alternatively, reference counting can be used
// (as if each node were a shared_ptr<node>), so that readers will always see
// *the remaining part* of the list as valid; however, parts before the current
// hazptr may be freed.
//
// So roughly: RCU is simple, but an all-or-nothing affair. A single rcu_reader
// can block all reclamation. Hazptrs will reclaim exactly as much as possible,
// at the cost of extra work writing traversal code.
//
// Reproduced from
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0461r1.pdf
//
//                                  Reference Counting   RCU             Hazptr
//
//   Unreclaimed objects            Bounded              Unbounded       Bounded
//
//   Contention among readers       High                 None            None
//
//   Traversal forward progress     lock-free            wait-free       lock-free
//
//   Reclamation forward progress   lock-free            blocking        wait-free
//
//   Traversal speed                atomic               no-overhead     folly's is
//                                                                       no-overhead
//
//   Reference acquisition          unconditional        unconditional   conditional
//
//   Automatic reclamation          yes                  no              no
//
//   Purpose of domains             N/A                  isolate slow    configuration
//                                                       readers

namespace folly {

struct RcuTag;

template <typename Tag = RcuTag>
class rcu_domain;

// Opaque token used to match up lock_shared() and unlock_shared()
// pairs.
class rcu_token {
 public:
  rcu_token(uint64_t epoch) : epoch_(epoch) {}
  ~rcu_token() = default;

  rcu_token(const rcu_token&) = delete;
  rcu_token(rcu_token&& other) = default;
  rcu_token& operator=(const rcu_token& other) = delete;
  rcu_token& operator=(rcu_token&& other) = default;

 private:
  template <typename Tag>
  friend class rcu_domain;
  uint64_t epoch_;
};

// For most usages, rcu_domain is unnecessary, and you can use
// rcu_reader and rcu_retire/synchronize_rcu directly.
template <typename Tag>
class rcu_domain {
  using list_head = typename detail::ThreadCachedLists<Tag>::ListHead;
  using list_node = typename detail::ThreadCachedLists<Tag>::Node;

 public:
  /*
   * If an executor is passed, it is used to run calls and delete
   * retired objects. (A usage sketch follows this class.)
   */
  rcu_domain(Executor* executor = nullptr) noexcept;

  rcu_domain(const rcu_domain&) = delete;
  rcu_domain(rcu_domain&&) = delete;
  rcu_domain& operator=(const rcu_domain&) = delete;
  rcu_domain& operator=(rcu_domain&&) = delete;
  ~rcu_domain();

  // Reader locks: Prevent any calls from occurring, retired memory
  // from being freed, and synchronize() calls from completing until
  // all preceding lock_shared() sections are finished.

  // Note: can potentially allocate on a thread's first use.
  FOLLY_ALWAYS_INLINE rcu_token lock_shared();
  FOLLY_ALWAYS_INLINE void unlock_shared(rcu_token&&);

  // Call a function after concurrent critical sections have finished.
  // Does not block unless the queue is full, in which case it may block
  // waiting for the scheduler thread; it generally does not wait for full
  // synchronization.
  template <typename T>
  void call(T&& cbin);
  void retire(list_node* node) noexcept;

  // Ensure concurrent critical sections have finished.
  // Always waits for full synchronization.
  // The read lock *must not* be held.
  void synchronize() noexcept;

 private:
  // Global epoch.
  std::atomic<uint64_t> version_{0};
  // Future epochs being driven by threads in synchronize.
  std::atomic<uint64_t> work_{0};
  // Matches version_, for waking up threads in synchronize that are
  // sharing an epoch.
  detail::TurnSequencer<std::atomic> turn_;
  // Only one thread can drive half_sync.
  std::mutex syncMutex_;
  // Currently half_sync takes ~16ms due to heavy barriers.
  // Ensure that if used from a single thread, max GC time
  // is limited to 1% of total CPU time.
  static constexpr uint64_t syncTimePeriod_{1600 * 2 /* full sync is 2x */};
  std::atomic<uint64_t> syncTime_{0};
  // call()s waiting to move through two epochs.
  detail::ThreadCachedLists<Tag> q_;
  // Executor that callbacks will eventually be run on.
  Executor* executor_{nullptr};
  static bool singleton_; // Ensure uniqueness per-tag.

  // Queues for callbacks waiting to go through two epochs.
  list_head queues_[2]{};

  // Move queues through one epoch (half of a full synchronize()).
  // Will block waiting for readers to exit if blocking is true.
  // blocking must *not* be true if the current thread holds the read
  // lock, or half_sync will deadlock.
  //
  // Returns a list of callbacks ready to run in cbs.
  void half_sync(bool blocking, list_head& cbs);
};
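
// A construction and raw-token usage sketch (MyDomainTag, pool, and the use
// of CPUThreadPoolExecutor are illustrative assumptions, not part of this
// header): a domain with its own tag and an explicit executor, read with
// explicit lock_shared()/unlock_shared() tokens.
//
//   struct MyDomainTag;
//   folly::CPUThreadPoolExecutor pool{1};
//   folly::rcu_domain<MyDomainTag> domain{&pool};
//
//   void rawReader() {
//     folly::rcu_token token = domain.lock_shared();
//     // ... protected reads ...
//     domain.unlock_shared(std::move(token));
//   }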

extern folly::Indestructible<rcu_domain<RcuTag>*> rcu_default_domain_;

inline rcu_domain<RcuTag>* rcu_default_domain() {
  return *rcu_default_domain_;
}

// Main reader guard class.
template <typename Tag = RcuTag>
class rcu_reader_domain {
 public:
  FOLLY_ALWAYS_INLINE rcu_reader_domain(
      rcu_domain<Tag>* domain = rcu_default_domain()) noexcept
      : epoch_(domain->lock_shared()), domain_(domain) {}
  rcu_reader_domain(
      std::defer_lock_t,
      rcu_domain<Tag>* domain = rcu_default_domain()) noexcept
      : domain_(domain) {}
  rcu_reader_domain(const rcu_reader_domain&) = delete;
  rcu_reader_domain(rcu_reader_domain&& other) noexcept
      : epoch_(std::move(other.epoch_)),
        domain_(std::exchange(other.domain_, nullptr)) {}
  rcu_reader_domain& operator=(const rcu_reader_domain&) = delete;
  rcu_reader_domain& operator=(rcu_reader_domain&& other) noexcept {
    if (epoch_.has_value()) {
      domain_->unlock_shared(std::move(epoch_.value()));
    }
    epoch_ = std::move(other.epoch_);
    domain_ = std::move(other.domain_);
    return *this;
  }

  FOLLY_ALWAYS_INLINE ~rcu_reader_domain() noexcept {
    if (epoch_.has_value()) {
      domain_->unlock_shared(std::move(epoch_.value()));
    }
  }

  void swap(rcu_reader_domain& other) noexcept {
    std::swap(epoch_, other.epoch_);
    std::swap(domain_, other.domain_);
  }

  FOLLY_ALWAYS_INLINE void lock() noexcept {
    DCHECK(!epoch_.has_value());
    epoch_ = domain_->lock_shared();
  }

  FOLLY_ALWAYS_INLINE void unlock() noexcept {
    DCHECK(epoch_.has_value());
    domain_->unlock_shared(std::move(epoch_.value()));
  }

 private:
  folly::Optional<rcu_token> epoch_;
  rcu_domain<Tag>* domain_;
};

using rcu_reader = rcu_reader_domain<RcuTag>;

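// A minimal sketch of deferred locking via the std::defer_lock constructor:
// the guard starts outside any critical section and enters one on lock().
//
//   folly::rcu_reader guard(std::defer_lock);
//   // ... not yet in a critical section ...
//   guard.lock();
//   // ... protected reads ...
//   // the guard's destructor ends the critical section
//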
template <typename Tag = RcuTag>
inline void swap(
    rcu_reader_domain<Tag>& a,
    rcu_reader_domain<Tag>& b) noexcept {
  a.swap(b);
}

template <typename Tag = RcuTag>
inline void synchronize_rcu(
    rcu_domain<Tag>* domain = rcu_default_domain()) noexcept {
  domain->synchronize();
}

template <typename Tag = RcuTag>
inline void rcu_barrier(
    rcu_domain<Tag>* domain = rcu_default_domain()) noexcept {
  domain->synchronize();
}

// Free-function retire. Always allocates.
template <
    typename T,
    typename D = std::default_delete<T>,
    typename Tag = RcuTag>
void rcu_retire(
    T* p,
    D d = {},
    rcu_domain<Tag>* domain = rcu_default_domain()) {
  domain->call([p, del = std::move(d)]() { del(p); });
}

// Base class for rcu objects. retire() will use preallocated storage
// from rcu_obj_base, vs. rcu_retire() which always allocates.
template <
    typename T,
    typename D = std::default_delete<T>,
    typename Tag = RcuTag>
class rcu_obj_base : detail::ThreadCachedListsBase::Node {
 public:
  void retire(D d = {}, rcu_domain<Tag>* domain = rcu_default_domain()) {
    // This implementation assumes folly::Function has enough
    // inline storage for D; otherwise, it allocates.
    this->cb_ = [this, d = std::move(d)]() { d(static_cast<T*>(this)); };
    domain->retire(this);
  }
};
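
// A minimal usage sketch (MyConfig and global are hypothetical names, not
// part of this header): inherit from rcu_obj_base to get allocation-free
// retirement of the object itself.
//
//   class MyConfig : public folly::rcu_obj_base<MyConfig> {
//     // ... data members ...
//   };
//
//   std::atomic<MyConfig*> global;
//
//   void update(MyConfig* next) {
//     MyConfig* old = global.exchange(next);
//     if (old) {
//       old->retire();  // deleted after all current readers are done
//     }
//   }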

} // namespace folly
