tesseract  3.05.02
detlinefit.cpp
Go to the documentation of this file.
1 // File: detlinefit.cpp
3 // Description: Deterministic least median squares line fitting.
4 // Author: Ray Smith
5 // Created: Thu Feb 28 14:45:01 PDT 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "detlinefit.h"
21 #include "statistc.h"
22 #include "ndminx.h"
23 #include "tprintf.h"
24 
25 namespace tesseract {
26 
27 // The number of points to consider at each end.
28 const int kNumEndPoints = 3;
29 // The minimum number of points at which to switch to number of points
30 // for badly fitted lines.
31 // To ensure a sensible error metric, kMinPointsForErrorCount should be at
32 // least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
33 // ComputeUpperQuartileError.
34 const int kMinPointsForErrorCount = 16;
35 // The maximum real distance to use before switching to number of
36 // mis-fitted points, which will get square-rooted for true distance.
37 const int kMaxRealDistance = 2.0;
38 
39 DetLineFit::DetLineFit() : square_length_(0.0) {
40 }
41 
43 }
44 
45 // Delete all Added points.
47  pts_.clear();
48  distances_.clear();
49 }
50 
51 // Add a new point. Takes a copy - the pt doesn't need to stay in scope.
52 void DetLineFit::Add(const ICOORD& pt) {
53  pts_.push_back(PointWidth(pt, 0));
54 }
55 // Associates a half-width with the given point if a point overlaps the
56 // previous point by more than half the width, and its distance is further
57 // than the previous point, then the more distant point is ignored in the
58 // distance calculation. Useful for ignoring i dots and other diacritics.
59 void DetLineFit::Add(const ICOORD& pt, int halfwidth) {
60  pts_.push_back(PointWidth(pt, halfwidth));
61 }
62 
63 // Fits a line to the points, ignoring the skip_first initial points and the
64 // skip_last final points, returning the fitted line as a pair of points,
65 // and the upper quartile error.
66 double DetLineFit::Fit(int skip_first, int skip_last,
67  ICOORD* pt1, ICOORD* pt2) {
68  // Do something sensible with no points.
69  if (pts_.empty()) {
70  pt1->set_x(0);
71  pt1->set_y(0);
72  *pt2 = *pt1;
73  return 0.0;
74  }
75  // Count the points and find the first and last kNumEndPoints.
76  int pt_count = pts_.size();
77  ICOORD* starts[kNumEndPoints];
78  if (skip_first >= pt_count) skip_first = pt_count - 1;
79  int start_count = 0;
80  int end_i = MIN(skip_first + kNumEndPoints, pt_count);
81  for (int i = skip_first; i < end_i; ++i) {
82  starts[start_count++] = &pts_[i].pt;
83  }
84  ICOORD* ends[kNumEndPoints];
85  if (skip_last >= pt_count) skip_last = pt_count - 1;
86  int end_count = 0;
87  end_i = MAX(0, pt_count - kNumEndPoints - skip_last);
88  for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
89  ends[end_count++] = &pts_[i].pt;
90  }
91  // 1 or 2 points need special treatment.
92  if (pt_count <= 2) {
93  *pt1 = *starts[0];
94  if (pt_count > 1)
95  *pt2 = *ends[0];
96  else
97  *pt2 = *pt1;
98  return 0.0;
99  }
100  // Although with between 2 and 2*kNumEndPoints-1 points, there will be
101  // overlap in the starts, ends sets, this is OK and taken care of by the
102  // if (*start != *end) test below, which also tests for equal input points.
103  double best_uq = -1.0;
104  // Iterate each pair of points and find the best fitting line.
105  for (int i = 0; i < start_count; ++i) {
106  ICOORD* start = starts[i];
107  for (int j = 0; j < end_count; ++j) {
108  ICOORD* end = ends[j];
109  if (*start != *end) {
110  ComputeDistances(*start, *end);
111  // Compute the upper quartile error from the line.
112  double dist = EvaluateLineFit();
113  if (dist < best_uq || best_uq < 0.0) {
114  best_uq = dist;
115  *pt1 = *start;
116  *pt2 = *end;
117  }
118  }
119  }
120  }
121  // Finally compute the square root to return the true distance.
122  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
123 }
124 
125 // Constrained fit with a supplied direction vector. Finds the best line_pt,
126 // that is one of the supplied points having the median cross product with
127 // direction, ignoring points that have a cross product outside of the range
128 // [min_dist, max_dist]. Returns the resulting error metric using the same
129 // reduced set of points.
130 // *Makes use of floating point arithmetic*
132  double min_dist, double max_dist,
133  bool debug, ICOORD* line_pt) {
134  ComputeConstrainedDistances(direction, min_dist, max_dist);
135  // Do something sensible with no points or computed distances.
136  if (pts_.empty() || distances_.empty()) {
137  line_pt->set_x(0);
138  line_pt->set_y(0);
139  return 0.0;
140  }
141  int median_index = distances_.choose_nth_item(distances_.size() / 2);
142  *line_pt = distances_[median_index].data;
143  if (debug) {
144  tprintf("Constrained fit to dir %g, %g = %d, %d :%d distances:\n",
145  direction.x(), direction.y(),
146  line_pt->x(), line_pt->y(), distances_.size());
147  for (int i = 0; i < distances_.size(); ++i) {
148  tprintf("%d: %d, %d -> %g\n", i, distances_[i].data.x(),
149  distances_[i].data.y(), distances_[i].key);
150  }
151  tprintf("Result = %d\n", median_index);
152  }
153  // Center distances on the fitted point.
154  double dist_origin = direction * *line_pt;
155  for (int i = 0; i < distances_.size(); ++i) {
156  distances_[i].key -= dist_origin;
157  }
158  return sqrt(EvaluateLineFit());
159 }
160 
161 // Returns true if there were enough points at the last call to Fit or
162 // ConstrainedFit for the fitted points to be used on a badly fitted line.
164  return distances_.size() >= kMinPointsForErrorCount;
165 }
166 
167 // Backwards compatible fit returning a gradient and constant.
168 // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
169 // function in preference to the LMS class.
170 double DetLineFit::Fit(float* m, float* c) {
171  ICOORD start, end;
172  double error = Fit(&start, &end);
173  if (end.x() != start.x()) {
174  *m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());
175  *c = start.y() - *m * start.x();
176  } else {
177  *m = 0.0f;
178  *c = 0.0f;
179  }
180  return error;
181 }
182 
183 // Backwards compatible constrained fit with a supplied gradient.
184 // Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
185 // to avoid potential difficulties with infinite gradients.
186 double DetLineFit::ConstrainedFit(double m, float* c) {
187  // Do something sensible with no points.
188  if (pts_.empty()) {
189  *c = 0.0f;
190  return 0.0;
191  }
192  double cos = 1.0 / sqrt(1.0 + m * m);
193  FCOORD direction(cos, m * cos);
194  ICOORD line_pt;
195  double error = ConstrainedFit(direction, -MAX_FLOAT32, MAX_FLOAT32, false,
196  &line_pt);
197  *c = line_pt.y() - line_pt.x() * m;
198  return error;
199 }
200 
201 // Computes and returns the squared evaluation metric for a line fit.
202 double DetLineFit::EvaluateLineFit() {
203  // Compute the upper quartile error from the line.
204  double dist = ComputeUpperQuartileError();
205  if (distances_.size() >= kMinPointsForErrorCount &&
207  // Use the number of mis-fitted points as the error metric, as this
208  // gives a better measure of fit for badly fitted lines where more
209  // than a quarter are badly fitted.
210  double threshold = kMaxRealDistance * sqrt(square_length_);
211  dist = NumberOfMisfittedPoints(threshold);
212  }
213  return dist;
214 }
215 
216 // Computes the absolute error distances of the points from the line,
217 // and returns the squared upper-quartile error distance.
218 double DetLineFit::ComputeUpperQuartileError() {
219  int num_errors = distances_.size();
220  if (num_errors == 0) return 0.0;
221  // Get the absolute values of the errors.
222  for (int i = 0; i < num_errors; ++i) {
223  if (distances_[i].key < 0) distances_[i].key = -distances_[i].key;
224  }
225  // Now get the upper quartile distance.
226  int index = distances_.choose_nth_item(3 * num_errors / 4);
227  double dist = distances_[index].key;
228  // The true distance is the square root of the dist squared / square_length.
229  // Don't bother with the square root. Just return the square distance.
230  return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0;
231 }
232 
233 // Returns the number of sample points that have an error more than threshold.
234 int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
235  int num_misfits = 0;
236  int num_dists = distances_.size();
237  // Get the absolute values of the errors.
238  for (int i = 0; i < num_dists; ++i) {
239  if (distances_[i].key > threshold)
240  ++num_misfits;
241  }
242  return num_misfits;
243 }
244 
245 // Computes all the cross product distances of the points from the line,
246 // storing the actual (signed) cross products in distances.
247 // Ignores distances of points that are further away than the previous point,
248 // and overlaps the previous point by at least half.
249 void DetLineFit::ComputeDistances(const ICOORD& start, const ICOORD& end) {
250  distances_.truncate(0);
251  ICOORD line_vector = end;
252  line_vector -= start;
253  square_length_ = line_vector.sqlength();
254  int line_length = IntCastRounded(sqrt(square_length_));
255  // Compute the distance of each point from the line.
256  int prev_abs_dist = 0;
257  int prev_dot = 0;
258  for (int i = 0; i < pts_.size(); ++i) {
259  ICOORD pt_vector = pts_[i].pt;
260  pt_vector -= start;
261  int dot = line_vector % pt_vector;
262  // Compute |line_vector||pt_vector|sin(angle between)
263  int dist = line_vector * pt_vector;
264  int abs_dist = dist < 0 ? -dist : dist;
265  if (abs_dist > prev_abs_dist && i > 0) {
266  // Ignore this point if it overlaps the previous one.
267  int separation = abs(dot - prev_dot);
268  if (separation < line_length * pts_[i].halfwidth ||
269  separation < line_length * pts_[i - 1].halfwidth)
270  continue;
271  }
272  distances_.push_back(DistPointPair(dist, pts_[i].pt));
273  prev_abs_dist = abs_dist;
274  prev_dot = dot;
275  }
276 }
277 
278 // Computes all the cross product distances of the points perpendicular to
279 // the given direction, ignoring distances outside of the give distance range,
280 // storing the actual (signed) cross products in distances_.
281 void DetLineFit::ComputeConstrainedDistances(const FCOORD& direction,
282  double min_dist, double max_dist) {
283  distances_.truncate(0);
284  square_length_ = direction.sqlength();
285  // Compute the distance of each point from the line.
286  for (int i = 0; i < pts_.size(); ++i) {
287  FCOORD pt_vector = pts_[i].pt;
288  // Compute |line_vector||pt_vector|sin(angle between)
289  double dist = direction * pt_vector;
290  if (min_dist <= dist && dist <= max_dist)
291  distances_.push_back(DistPointPair(dist, pts_[i].pt));
292  }
293 }
294 
295 } // namespace tesseract.
int IntCastRounded(double x)
Definition: helpers.h:172
integer coordinate
Definition: points.h:30
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:52
#define MIN(x, y)
Definition: ndminx.h:28
double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug, ICOORD *line_pt)
Definition: detlinefit.cpp:131
int push_back(T object)
#define MAX_FLOAT32
Definition: host.h:57
const int kMaxRealDistance
Definition: detlinefit.cpp:37
bool SufficientPointsForIndependentFit() const
Definition: detlinefit.cpp:163
inT16 x() const
access function
Definition: points.h:52
float sqlength() const
find sq length
Definition: points.h:73
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
#define MAX(x, y)
Definition: ndminx.h:24
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
Definition: points.h:189
const int kNumEndPoints
Definition: detlinefit.cpp:28
void set_y(inT16 yin)
rewrite function
Definition: points.h:65
void set_x(inT16 xin)
rewrite function
Definition: points.h:61
bool empty() const
Definition: genericvector.h:84
const int kMinPointsForErrorCount
Definition: detlinefit.cpp:34
inT16 y() const
access_function
Definition: points.h:56