tesseract  3.05.02
gap_map.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 #include "statistc.h"
11 #include "gap_map.h"
12 
13 #define EXTERN
14 EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables");
16 "Use large space at start and end of rows");
18 "Ensure gaps not less than 2quanta wide");
19 EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier");
20 
21 /*************************************************************************
22  * A block gap map is a quantised histogram of whitespace regions in the
23  * block. It is a vertical projection of wide gaps WITHIN lines
24  *
25  * The map is held as an array of counts of rows which have a wide gap
26  * covering that region of the row. Each bucket in the map represents a width
27  * of about half an xheight - (The median of the xhts in the rows is used.)
28  *
29  * The block is considered RECTANGULAR - delimited by the left and right
30  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
31  * counted.
32  *
33  *************************************************************************/
34 
35 GAPMAP::GAPMAP( //Constructor
36  TO_BLOCK *block //block
37  ) {
38  TO_ROW_IT row_it; //row iterator
39  TO_ROW *row; //current row
40  BLOBNBOX_IT blob_it; //iterator
41  TBOX blob_box;
42  TBOX prev_blob_box;
43  inT16 gap_width;
44  inT16 start_of_row;
45  inT16 end_of_row;
46  STATS xht_stats (0, 128);
47  inT16 min_quantum;
48  inT16 max_quantum;
49  inT16 i;
50 
51  row_it.set_to_list (block->get_rows ());
52  /*
53  Find left and right extremes and bucket size
54  */
55  map = NULL;
56  min_left = MAX_INT16;
57  max_right = -MAX_INT16;
58  total_rows = 0;
59  any_tabs = FALSE;
60  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
61  row = row_it.data ();
62  if (!row->blob_list ()->empty ()) {
63  total_rows++;
64  xht_stats.add ((inT16) floor (row->xheight + 0.5), 1);
65  blob_it.set_to_list (row->blob_list ());
66  start_of_row = blob_it.data ()->bounding_box ().left ();
67  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
68  if (min_left > start_of_row)
69  min_left = start_of_row;
70  if (max_right < end_of_row)
71  max_right = end_of_row;
72  }
73  }
74  if ((total_rows < 3) || (min_left >= max_right)) {
75  total_rows = 0;
76  min_left = max_right = 0;
77  return;
78  }
79  bucket_size = (inT16) floor (xht_stats.median () + 0.5) / 2;
80  map_max = (max_right - min_left) / bucket_size;
81  map = (inT16 *) alloc_mem ((map_max + 1) * sizeof (inT16));
82  for (i = 0; i <= map_max; i++)
83  map[i] = 0;
84 
85  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
86  row = row_it.data ();
87  if (!row->blob_list ()->empty ()) {
88  blob_it.set_to_list (row->blob_list ());
89  blob_it.mark_cycle_pt ();
90  blob_box = box_next (&blob_it);
91  prev_blob_box = blob_box;
92  if (gapmap_use_ends) {
93  /* Leading space */
94  gap_width = blob_box.left () - min_left;
95  if ((gap_width > gapmap_big_gaps * row->xheight)
96  && gap_width > 2) {
97  max_quantum = (blob_box.left () - min_left) / bucket_size;
98  if (max_quantum > map_max) max_quantum = map_max;
99  for (i = 0; i <= max_quantum; i++)
100  map[i]++;
101  }
102  }
103  while (!blob_it.cycled_list ()) {
104  blob_box = box_next (&blob_it);
105  gap_width = blob_box.left () - prev_blob_box.right ();
106  if ((gap_width > gapmap_big_gaps * row->xheight)
107  && gap_width > 2) {
108  min_quantum =
109  (prev_blob_box.right () - min_left) / bucket_size;
110  max_quantum = (blob_box.left () - min_left) / bucket_size;
111  if (max_quantum > map_max) max_quantum = map_max;
112  for (i = min_quantum; i <= max_quantum; i++)
113  map[i]++;
114  }
115  prev_blob_box = blob_box;
116  }
117  if (gapmap_use_ends) {
118  /* Trailing space */
119  gap_width = max_right - prev_blob_box.right ();
120  if ((gap_width > gapmap_big_gaps * row->xheight)
121  && gap_width > 2) {
122  min_quantum =
123  (prev_blob_box.right () - min_left) / bucket_size;
124  if (min_quantum < 0) min_quantum = 0;
125  for (i = min_quantum; i <= map_max; i++)
126  map[i]++;
127  }
128  }
129  }
130  }
131  for (i = 0; i <= map_max; i++) {
132  if (map[i] > total_rows / 2) {
134  (((i == 0) &&
135  (map[i + 1] <= total_rows / 2)) ||
136  ((i == map_max) &&
137  (map[i - 1] <= total_rows / 2)) ||
138  ((i > 0) &&
139  (i < map_max) &&
140  (map[i - 1] <= total_rows / 2) &&
141  (map[i + 1] <= total_rows / 2)))) {
142  map[i] = 0; //prevent isolated quantum
143  }
144  else
145  any_tabs = TRUE;
146  }
147  }
148  if (gapmap_debug && any_tabs)
149  tprintf ("Table found\n");
150 }
151 
152 
153 /*************************************************************************
154  * GAPMAP::table_gap()
155  * Is there a bucket in the specified range where more than half the rows in the
156  * block have a wide gap?
157  *************************************************************************/
158 
159 BOOL8 GAPMAP::table_gap( //Is gap a table?
160  inT16 left, //From here
161  inT16 right //To here
162  ) {
163  inT16 min_quantum;
164  inT16 max_quantum;
165  inT16 i;
166  BOOL8 tab_found = FALSE;
167 
168  if (!any_tabs)
169  return FALSE;
170 
171  min_quantum = (left - min_left) / bucket_size;
172  max_quantum = (right - min_left) / bucket_size;
173  // Clip to the bounds of the array. In some circumstances (big blob followed
174  // by small blob) max_quantum can exceed the map_max bounds, but we clip
175  // here instead, as it provides better long-term safety.
176  if (min_quantum < 0) min_quantum = 0;
177  if (max_quantum > map_max) max_quantum = map_max;
178  for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
179  if (map[i] > total_rows / 2)
180  tab_found = TRUE;
181  return tab_found;
182 }
#define TRUE
Definition: capi.h:45
EXTERN bool gapmap_debug
Definition: gap_map.cpp:14
short inT16
Definition: host.h:33
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
BOOL8 table_gap(inT16 left, inT16 right)
Definition: gap_map.cpp:159
unsigned char BOOL8
Definition: host.h:46
EXTERN bool gapmap_use_ends
Definition: gap_map.cpp:16
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
EXTERN bool gapmap_no_isolated_quanta
Definition: gap_map.cpp:18
#define MAX_INT16
Definition: host.h:52
#define FALSE
Definition: capi.h:46
void * alloc_mem(inT32 count)
Definition: memry.cpp:47
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:631
EXTERN double gapmap_big_gaps
Definition: gap_map.cpp:19
inT16 left() const
Definition: rect.h:68
TO_ROW_LIST * get_rows()
Definition: blobbox.h:700
#define EXTERN
Definition: gap_map.cpp:13
#define tprintf(...)
Definition: tprintf.h:31
float xheight
Definition: blobbox.h:653
double median() const
Definition: statistc.cpp:239
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
GAPMAP(TO_BLOCK *block)
Definition: gap_map.cpp:35
#define double_VAR(name, val, comment)
Definition: params.h:286
#define BOOL_VAR(name, val, comment)
Definition: params.h:280
Definition: statistc.h:33