proxygen
RendezvousHash.cpp
/*
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree. An additional grant
 * of patent rights can be found in the PATENTS file in the same directory.
 *
 */
#include <proxygen/lib/utils/RendezvousHash.h>

#include <folly/hash/Hash.h>
#include <map>
#include <algorithm>
#include <vector>
#include <limits>
#include <math.h> /* pow */

namespace proxygen {

void RendezvousHash::build(
    std::vector<std::pair<std::string, uint64_t>>& nodes) {
  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
    std::string key = it->first;
    uint64_t weight = it->second;
    weights_.emplace_back(computeHash(key.c_str(), key.size()), weight);
  }
}

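/*
 * A minimal usage sketch (illustrative, not part of the original source;
 * the node names and weights are the ones used in the comment below):
 *
 *   std::vector<std::pair<std::string, uint64_t>> nodes = {
 *       {"ash4c07", 100}, {"frc1c12", 400}, {"prn1c11", 500}};
 *   RendezvousHash hasher;
 *   hasher.build(nodes);  // stores a {hash(name), weight} pair per node
 */
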
/*
 * The RendezvousHash algorithm goes like this:
 * Assume we have 3 clusters with names and weights:
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * | n="ash4c07"  | n="frc1c12"  | n="prn1c11"  |
 * | w=100        | w=400        | w=500        |
 * ==============================================
 * To prepare, we calculate a hash for each cluster based on its name:
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * | n="ash4c07"  | n="frc1c12"  | n="prn1c11"  |
 * | w=100        | w=400        | w=500        |
 * | h=hash(n)    | h=hash(n)    | h=hash(n)    |
 * ==============================================
 *
 * When a key comes in, we have to decide which cluster to assign it to,
 * e.g., k = 10240.
 *
 * For each cluster, we calculate a combined hash from the sum of the key
 * and the cluster's hash:
 *
 *      ==============================================
 *      | Cluster1     | Cluster2     | Cluster3     |
 *      ...
 *  k   | h=hash(n)    | h=hash(n)    | h=hash(n)    |
 *  |   ==============================================
 *  |                  |
 *  +--------+---------+
 *           |
 *     ch=hash(h + k)
 *           |
 *           v
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * | n="ash4c07"  | n="frc1c12"  | n="prn1c11"  |
 * | w=100        | w=400        | w=500        |
 * | h=hash(n)    | h=hash(n)    | h=hash(n)    |
 * | ch=hash(h+k) | ch=hash(h+k) | ch=hash(h+k) |
 * ==============================================
 *
 * ch is now a uniformly distributed random variable from 0 to max_int;
 * we scale it to a random variable from 0 to 1 by dividing it by max_int:
 *
 * scaledHash = ch / max_int
 *
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * ...
 * | ch=hash(h+k) | ch=hash(h+k) | ch=hash(h+k) |
 * ==============================================
 *           |
 *     sh=ch/max_int
 *           |
 *           v
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * ...
 * | ch=hash(h+k) | ch=hash(h+k) | ch=hash(h+k) |
 * |sh=ch/max_int |sh=ch/max_int |sh=ch/max_int |
 * ==============================================
 *
 * We also need to respect the weights, so we scale sh once more with a
 * function of the cluster's weight:
 *
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * ...
 * |sh=ch/max_int |sh=ch/max_int |sh=ch/max_int |
 * ==============================================
 *           |
 *    sw = pow(sh, 1/w)
 *           |
 *           v
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * ...
 * |sh=ch/max_int |sh=ch/max_int |sh=ch/max_int |
 * |sw=pow(sh,1/w)|sw=pow(sh,1/w)|sw=pow(sh,1/w)|
 * ==============================================
 *
 * Finally, the cluster with the largest sw is the one we map k to:
 * ==============================================
 * | Cluster1     | Cluster2     | Cluster3     |
 * ...
 * |sw=pow(sh,1/w)|sw=pow(sh,1/w)|sw=pow(sh,1/w)|
 * ==============================================
 *           |
 *        max(sw)
 *           |
 *           v
 *        Cluster
 *
 */
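/*
 * A small worked example of the last two steps (numbers invented for
 * illustration): suppose that for some key k the scaled hashes come out as
 *
 *   Cluster1: sh=0.91, w=100  ->  sw = pow(0.91, 1/100) ~ 0.99906
 *   Cluster2: sh=0.30, w=400  ->  sw = pow(0.30, 1/400) ~ 0.99700
 *   Cluster3: sh=0.62, w=500  ->  sw = pow(0.62, 1/500) ~ 0.99904
 *
 * max(sw) maps this particular k to Cluster1; across many keys, the
 * pow(sh, 1/w) transform hands out keys in proportion to the weights.
 */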
size_t RendezvousHash::get(const uint64_t key, const size_t rank) const {
  return getNthByWeightedHash(key, rank, nullptr);
}

/*
 * Calculate the hash scaled by weight for every node and return the node
 * at the given rank; optionally collect the top-N node ids in returnRankIds.
 */
size_t RendezvousHash::getNthByWeightedHash(
    const uint64_t key,
    const size_t rank,
    std::vector<size_t>* returnRankIds) const {
  size_t modRank = rank % weights_.size();
  // Optimization: when modRank == 0 we only need the element with the
  // maximum scaled weight, so track maxWeightIndex instead of populating
  // the scaledWeights array.
  double maxWeight = -1;
  int maxWeightIndex = 0;

  std::vector<std::pair<double, size_t>> scaledWeights;
  if (modRank != 0) {
    scaledWeights.reserve(weights_.size());
  }
  FOR_EACH_ENUMERATE(i, entry, weights_) {
    // combine the key with the node's hash
    double combinedHash = computeHash(entry->first + key);
    double scaledHash =
        (double)combinedHash / std::numeric_limits<uint64_t>::max();
    double scaledWeight = 0;
    if (entry->second != 0) {
      scaledWeight = pow(scaledHash, (double)1 / entry->second);
    }
    if (modRank == 0) {
      if (scaledWeight > maxWeight) {
        maxWeight = scaledWeight;
        maxWeightIndex = i;
      }
    } else {
      scaledWeights.emplace_back(scaledWeight, i);
    }
  }

  size_t rankIndex;
  if (modRank == 0) {
    rankIndex = maxWeightIndex;
  } else {
    std::nth_element(scaledWeights.begin(),
                     scaledWeights.begin() + modRank,
                     scaledWeights.end(),
                     std::greater<std::pair<double, size_t>>());
    rankIndex = scaledWeights[modRank].second;
  }

  if (returnRankIds) {
    if (modRank == 0) {
      returnRankIds->push_back(rankIndex);
    } else {
      returnRankIds->reserve(modRank);
      for (size_t i = 0; i < modRank; i++) {
        returnRankIds->push_back(scaledWeights[i].second);
      }
    }
  }

  return rankIndex;
}

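/*
 * Usage sketch (hypothetical key value): with the hasher built as above,
 *
 *   size_t primary  = hasher.get(10240);     // node with the largest sw
 *   size_t fallback = hasher.get(10240, 1);  // second-ranked node, e.g.
 *                                            // to retry after a failure
 *
 * Since rank is reduced modulo weights_.size(), out-of-range ranks wrap
 * around instead of falling off the end of the node list.
 */
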
/*
 * Returns a consistent hash selection of N elements from the array.
 *
 * This type of selection only obeys the probability distribution
 * when all weights are identical.
 */
std::vector<size_t> RendezvousHash::selectNUnweighted(
    const uint64_t key, const size_t rank) const {
  std::vector<size_t> selection;
  // shortcut if rank is equal to or larger than the array size
  if (rank >= weights_.size()) {
    selection = std::vector<size_t>(weights_.size());
    std::generate(
        selection.begin(), selection.end(), [n = 0]() mutable { return n++; });
    return selection;
  }

  getNthByWeightedHash(key, rank, &selection);
  return selection;
}
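/*
 * Usage sketch (hypothetical): picking three distinct nodes for a key,
 * e.g. to place replicas.
 *
 *   std::vector<size_t> replicas = hasher.selectNUnweighted(10240, 3);
 *   // replicas now holds the indices of the top-3 nodes for this key;
 *   // a rank >= weights_.size() simply returns every index 0..size-1.
 */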

uint64_t RendezvousHash::computeHash(const char* data, size_t len) const {
  return folly::hash::fnv64_buf(data, len);
}

uint64_t RendezvousHash::computeHash(const uint64_t i) const {
  return folly::hash::twang_mix64(i);
}

double RendezvousHash::getMaxErrorRate() const {
  return 0;
}

} // namespace proxygen