proxygen
TDigest.h
Go to the documentation of this file.
1 /*
2  * Copyright 2012-present Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cmath>
20 #include <vector>
21 
22 #include <folly/Range.h>
23 #include <folly/Utility.h>
24 
25 namespace folly {
26 
27 /*
28  * TDigests are a biased quantile estimator designed to estimate the values of
29  * the quantiles of streaming data with high accuracy and low memory,
30  * particularly for quantiles at the tails (p0.1, p1, p99, p99.9). See
31  * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf
32  * for an explanation of what the purpose of TDigests is, and how they work.
33  *
34  * There is a notable difference between the implementation here and the
35  * implementation in the paper. In the paper, the recommended scaling function
36  * for bucketing centroids is an arcsin function. The arcsin function provides
37  * high accuracy for low memory, but comes at a relatively high compute cost.
38  * A good choice algorithm has the following properties:
39  * - The value of the function k(0, delta) = 0, and k(1, delta) = delta.
40  * This is a requirement for any t-digest function.
41  * - The limit of the derivative of the function dk/dq at 0 is inf, and at
42  * 1 is inf. This provides bias to improve accuracy at the tails.
43  * - For any q <= 0.5, dk/dq(q) = dk/dq(1-q). This ensures that the accuracy
44  * of upper and lower quantiles are equivalent.
45  * As such, TDigest uses a sqrt function with these properties, which is faster
46  * than arcsin. There is a small, but relatively negligible impact to accuracy
47  * at the tail. In empirical tests, accuracy of the sqrt approach has been
48  * adequate.
49  */
50 class TDigest {
51  public:
52  class Centroid {
53  public:
54  explicit Centroid(double mean = 0.0, double weight = 1.0)
55  : mean_(mean), weight_(weight) {
56  DCHECK_GT(weight, 0);
57  }
58 
59  inline double mean() const {
60  return mean_;
61  }
62 
63  inline double weight() const {
64  return weight_;
65  }
66 
67  /*
68  * Adds the sum/weight to this centroid, and returns the new sum.
69  */
70  inline double add(double sum, double weight);
71 
72  inline bool operator<(const Centroid& other) const {
73  return mean() < other.mean();
74  }
75 
76  private:
77  double mean_;
78  double weight_;
79  };
80 
81  explicit TDigest(size_t maxSize = 100)
82  : maxSize_(maxSize), sum_(0.0), count_(0.0), max_(NAN), min_(NAN) {}
83 
84  explicit TDigest(
85  std::vector<Centroid> centroids,
86  double sum,
87  double count,
88  double max_val,
89  double min_val,
90  size_t maxSize = 100);
91 
92  /*
93  * Returns a new TDigest constructed with values merged from the current
94  * digest and the given sortedValues.
95  */
96  TDigest merge(presorted_t, Range<const double*> sortedValues) const;
97  TDigest merge(Range<const double*> unsortedValues) const;
98 
99  /*
100  * Returns a new TDigest constructed with values merged from the given
101  * digests.
102  */
103  static TDigest merge(Range<const TDigest*> digests);
104 
105  /*
106  * Estimates the value of the given quantile.
107  */
108  double estimateQuantile(double q) const;
109 
110  double mean() const {
111  return count_ ? sum_ / count_ : 0;
112  }
113 
114  double sum() const {
115  return sum_;
116  }
117 
118  double count() const {
119  return count_;
120  }
121 
122  double min() const {
123  return min_;
124  }
125 
126  double max() const {
127  return max_;
128  }
129 
130  bool empty() const {
131  return centroids_.empty();
132  }
133 
134  const std::vector<Centroid>& getCentroids() const {
135  return centroids_;
136  }
137 
138  size_t maxSize() const {
139  return maxSize_;
140  }
141 
142  private:
143  std::vector<Centroid> centroids_;
144  size_t maxSize_;
145  double sum_;
146  double count_;
147  double max_;
148  double min_;
149 };
150 
151 } // namespace folly
double weight() const
Definition: TDigest.h:63
double sum_
Definition: TDigest.h:145
double mean() const
Definition: TDigest.h:59
const std::vector< Centroid > & getCentroids() const
Definition: TDigest.h:134
double estimateQuantile(double q) const
Definition: TDigest.cpp:312
—— Concurrent Priority Queue Implementation ——
Definition: AtomicBitSet.h:29
double sum() const
Definition: TDigest.h:114
double count() const
Definition: TDigest.h:118
double add(double sum, double weight)
Definition: TDigest.cpp:369
bool operator<(const Centroid &other) const
Definition: TDigest.h:72
size_t maxSize() const
Definition: TDigest.h:138
size_t maxSize_
Definition: TDigest.h:144
std::vector< Centroid > centroids_
Definition: TDigest.h:143
TDigest(size_t maxSize=100)
Definition: TDigest.h:81
double mean() const
Definition: TDigest.h:110
Centroid(double mean=0.0, double weight=1.0)
Definition: TDigest.h:54
TDigest merge(presorted_t, Range< const double * > sortedValues) const
Definition: TDigest.cpp:126
double min_
Definition: TDigest.h:148
bool empty() const
Definition: TDigest.h:130
double count_
Definition: TDigest.h:146
double max() const
Definition: TDigest.h:126
double min() const
Definition: TDigest.h:122
double max_
Definition: TDigest.h:147