tesseract  3.05.02
OL_BUCKETS Class Reference

#include <edgblob.h>

Public Member Functions

 ~OL_BUCKETS ()
 
C_OUTLINE_LIST * start_scan ()
 
C_OUTLINE_LIST * scan_next ()
 
OL_BUCKETS::OL_BUCKETS

Construct an array of buckets for associating outlines into blobs.

 OL_BUCKETS (ICOORD bleft, ICOORD tright)
 
OL_BUCKETS::operator(

Return a pointer to a list of C_OUTLINEs corresponding to the given pixel coordinates.

C_OUTLINE_LIST * operator() (inT16 x, inT16 y)
 
OL_BUCKETS::count_children

Find number of descendants of this outline.

inT32 count_children (C_OUTLINE *outline, inT32 max_count)
 
OL_BUCKETS::outline_complexity

This is the new version of count_child.

The goal of this function is to determine if an outline and its interiors could be part of a character blob. This is done by computing a "complexity" index for the outline, which is the return value of this function, and checking it against a threshold. The max_count is used for short-circuiting the recursion and forcing a rejection that guarantees to fail the threshold test. The complexity F for outline X with N children X[i] is F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild so each layer of nesting increases complexity exponentially. An outline can be rejected as a text blob candidate if its complexity is too high, has too many children(likely a container), or has too many layers of nested inner loops. This has the side-effect of flattening out boxed or reversed video text regions.

inT32 outline_complexity (C_OUTLINE *outline, inT32 max_count, inT16 depth)
 
OL_BUCKETS::extract_children

Find number of descendants of this outline.

void extract_children (C_OUTLINE *outline, C_OUTLINE_IT *it)
 

Detailed Description

Definition at line 31 of file edgblob.h.

Constructor & Destructor Documentation

◆ OL_BUCKETS()

OL_BUCKETS::OL_BUCKETS ( ICOORD  bleft,
ICOORD  tright 
)

Definition at line 68 of file edgblob.cpp.

70  : bl(bleft), tr(tright) {
71  bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
72  bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
73  // make array
74  buckets = new C_OUTLINE_LIST[bxdim * bydim];
75  index = 0;
76 }
inT16 x() const
access function
Definition: points.h:52
#define BUCKETSIZE
Definition: edgblob.h:29
inT16 y() const
access_function
Definition: points.h:56

◆ ~OL_BUCKETS()

OL_BUCKETS::~OL_BUCKETS ( )
inline

Definition at line 38 of file edgblob.h.

38  { //cleanup
39  delete[]buckets;
40  }

Member Function Documentation

◆ count_children()

inT32 OL_BUCKETS::count_children ( C_OUTLINE outline,
inT32  max_count 
)

Definition at line 183 of file edgblob.cpp.

186  {
187  BOOL8 parent_box; // could it be boxy
188  inT16 xmin, xmax; // coord limits
189  inT16 ymin, ymax;
190  inT16 xindex, yindex; // current bucket
191  C_OUTLINE *child; // current child
192  inT32 child_count; // no of children
193  inT32 grandchild_count; // no of grandchildren
194  inT32 parent_area; // potential box
195  FLOAT32 max_parent_area; // potential box
196  inT32 child_area; // current child
197  inT32 child_length; // current child
198  TBOX olbox;
199  C_OUTLINE_IT child_it; // search iterator
200 
201  olbox = outline->bounding_box();
202  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
203  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
204  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
205  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
206  child_count = 0;
207  grandchild_count = 0;
208  parent_area = 0;
209  max_parent_area = 0;
210  parent_box = TRUE;
211  for (yindex = ymin; yindex <= ymax; yindex++) {
212  for (xindex = xmin; xindex <= xmax; xindex++) {
213  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
214  if (child_it.empty())
215  continue;
216  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
217  child_it.forward()) {
218  child = child_it.data();
219  if (child != outline && *child < *outline) {
220  child_count++;
221  if (child_count <= max_count) {
222  int max_grand =(max_count - child_count) /
224  if (max_grand > 0)
225  grandchild_count += count_children(child, max_grand) *
227  else
228  grandchild_count += count_children(child, 1);
229  }
230  if (child_count + grandchild_count > max_count) {
231  if (edges_debug)
232  tprintf("Discarding parent with child count=%d, gc=%d\n",
233  child_count,grandchild_count);
234  return child_count + grandchild_count;
235  }
236  if (parent_area == 0) {
237  parent_area = outline->outer_area();
238  if (parent_area < 0)
239  parent_area = -parent_area;
240  max_parent_area = outline->bounding_box().area() * edges_boxarea;
241  if (parent_area < max_parent_area)
242  parent_box = FALSE;
243  }
244  if (parent_box &&
245  (!edges_children_fix ||
246  child->bounding_box().height() > edges_min_nonhole)) {
247  child_area = child->outer_area();
248  if (child_area < 0)
249  child_area = -child_area;
250  if (edges_children_fix) {
251  if (parent_area - child_area < max_parent_area) {
252  parent_box = FALSE;
253  continue;
254  }
255  if (grandchild_count > 0) {
256  if (edges_debug)
257  tprintf("Discarding parent of area %d, child area=%d, max%g "
258  "with gc=%d\n",
259  parent_area, child_area, max_parent_area,
260  grandchild_count);
261  return max_count + 1;
262  }
263  child_length = child->pathlength();
264  if (child_length * child_length >
265  child_area * edges_patharea_ratio) {
266  if (edges_debug)
267  tprintf("Discarding parent of area %d, child area=%d, max%g "
268  "with child length=%d\n",
269  parent_area, child_area, max_parent_area,
270  child_length);
271  return max_count + 1;
272  }
273  }
274  if (child_area < child->bounding_box().area() * edges_childarea) {
275  if (edges_debug)
276  tprintf("Discarding parent of area %d, child area=%d, max%g "
277  "with child rect=%d\n",
278  parent_area, child_area, max_parent_area,
279  child->bounding_box().area());
280  return max_count + 1;
281  }
282  }
283  }
284  }
285  }
286  }
287  return child_count + grandchild_count;
288 }
#define TRUE
Definition: capi.h:45
EXTERN double edges_boxarea
Definition: edgblob.cpp:60
EXTERN int edges_children_per_grandchild
Definition: edgblob.cpp:48
short inT16
Definition: host.h:33
inT32 outer_area() const
Definition: coutln.cpp:308
EXTERN bool edges_debug
Definition: edgblob.cpp:44
EXTERN double edges_childarea
Definition: edgblob.cpp:58
inT32 pathlength() const
Definition: coutln.h:133
unsigned char BOOL8
Definition: host.h:46
inT16 bottom() const
Definition: rect.h:61
EXTERN int edges_min_nonhole
Definition: edgblob.cpp:54
EXTERN int edges_patharea_ratio
Definition: edgblob.cpp:56
inT32 count_children(C_OUTLINE *outline, inT32 max_count)
Definition: edgblob.cpp:183
#define FALSE
Definition: capi.h:46
inT16 x() const
access function
Definition: points.h:52
float FLOAT32
Definition: host.h:44
inT16 left() const
Definition: rect.h:68
const TBOX & bounding_box() const
Definition: coutln.h:111
inT32 area() const
Definition: rect.h:118
inT16 height() const
Definition: rect.h:104
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
inT16 top() const
Definition: rect.h:54
#define BUCKETSIZE
Definition: edgblob.h:29
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
EXTERN bool edges_children_fix
Definition: edgblob.cpp:52
inT16 y() const
access_function
Definition: points.h:56

◆ extract_children()

void OL_BUCKETS::extract_children ( C_OUTLINE outline,
C_OUTLINE_IT *  it 
)

Definition at line 299 of file edgblob.cpp.

302  {
303  inT16 xmin, xmax; // coord limits
304  inT16 ymin, ymax;
305  inT16 xindex, yindex; // current bucket
306  TBOX olbox;
307  C_OUTLINE_IT child_it; // search iterator
308 
309  olbox = outline->bounding_box();
310  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
311  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
312  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
313  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
314  for (yindex = ymin; yindex <= ymax; yindex++) {
315  for (xindex = xmin; xindex <= xmax; xindex++) {
316  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
317  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
318  child_it.forward()) {
319  if (*child_it.data() < *outline) {
320  it->add_after_then_move(child_it.extract());
321  }
322  }
323  }
324  }
325 }
short inT16
Definition: host.h:33
inT16 bottom() const
Definition: rect.h:61
inT16 x() const
access function
Definition: points.h:52
inT16 left() const
Definition: rect.h:68
const TBOX & bounding_box() const
Definition: coutln.h:111
inT16 top() const
Definition: rect.h:54
#define BUCKETSIZE
Definition: edgblob.h:29
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 y() const
access_function
Definition: points.h:56

◆ operator()()

C_OUTLINE_LIST * OL_BUCKETS::operator() ( inT16  x,
inT16  y 
)

Definition at line 87 of file edgblob.cpp.

89  {
90  return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
91 }
inT16 x() const
access function
Definition: points.h:52
#define BUCKETSIZE
Definition: edgblob.h:29
inT16 y() const
access_function
Definition: points.h:56

◆ outline_complexity()

inT32 OL_BUCKETS::outline_complexity ( C_OUTLINE outline,
inT32  max_count,
inT16  depth 
)

Definition at line 114 of file edgblob.cpp.

118  {
119  inT16 xmin, xmax; // coord limits
120  inT16 ymin, ymax;
121  inT16 xindex, yindex; // current bucket
122  C_OUTLINE *child; // current child
123  inT32 child_count; // no of children
124  inT32 grandchild_count; // no of grandchildren
125  C_OUTLINE_IT child_it; // search iterator
126 
127  TBOX olbox = outline->bounding_box();
128  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
129  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
130  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
131  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
132  child_count = 0;
133  grandchild_count = 0;
134  if (++depth > edges_max_children_layers) // nested loops are too deep
135  return max_count + depth;
136 
137  for (yindex = ymin; yindex <= ymax; yindex++) {
138  for (xindex = xmin; xindex <= xmax; xindex++) {
139  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
140  if (child_it.empty())
141  continue;
142  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
143  child_it.forward()) {
144  child = child_it.data();
145  if (child == outline || !(*child < *outline))
146  continue;
147  child_count++;
148 
149  if (child_count > edges_max_children_per_outline) { // too fragmented
150  if (edges_debug)
151  tprintf("Discard outline on child_count=%d > "
152  "max_children_per_outline=%d\n",
153  child_count,
154  static_cast<inT32>(edges_max_children_per_outline));
155  return max_count + child_count;
156  }
157 
158  // Compute the "complexity" of each child recursively
159  inT32 remaining_count = max_count - child_count - grandchild_count;
160  if (remaining_count > 0)
161  grandchild_count += edges_children_per_grandchild *
162  outline_complexity(child, remaining_count, depth);
163  if (child_count + grandchild_count > max_count) { // too complex
164  if (edges_debug)
165  tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
166  "> max_count=%d\n",
167  child_count, grandchild_count, max_count);
168  return child_count + grandchild_count;
169  }
170  }
171  }
172  }
173  return child_count + grandchild_count;
174 }
EXTERN int edges_children_per_grandchild
Definition: edgblob.cpp:48
short inT16
Definition: host.h:33
EXTERN bool edges_debug
Definition: edgblob.cpp:44
EXTERN int edges_max_children_layers
Definition: edgblob.cpp:42
inT32 outline_complexity(C_OUTLINE *outline, inT32 max_count, inT16 depth)
Definition: edgblob.cpp:114
inT16 bottom() const
Definition: rect.h:61
inT16 x() const
access function
Definition: points.h:52
inT16 left() const
Definition: rect.h:68
const TBOX & bounding_box() const
Definition: coutln.h:111
int inT32
Definition: host.h:35
#define tprintf(...)
Definition: tprintf.h:31
inT16 top() const
Definition: rect.h:54
EXTERN int edges_max_children_per_outline
Definition: edgblob.cpp:40
#define BUCKETSIZE
Definition: edgblob.h:29
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
inT16 y() const
access_function
Definition: points.h:56

◆ scan_next()

C_OUTLINE_LIST* OL_BUCKETS::scan_next ( )
inline

Definition at line 51 of file edgblob.h.

51  {
52  for (; buckets[index].empty () && index < bxdim * bydim - 1; index++);
53  return &buckets[index];
54  }

◆ start_scan()

C_OUTLINE_LIST* OL_BUCKETS::start_scan ( )
inline

Definition at line 45 of file edgblob.h.

45  {
46  for (index = 0; buckets[index].empty () && index < bxdim * bydim - 1;
47  index++);
48  return &buckets[index];
49  }

The documentation for this class was generated from the following files: