tesseract  3.05.02
tablerecog.h
Go to the documentation of this file.
1 // File: tablerecog.h
3 // Description: Functions to detect structure of tables.
4 // Author: Nicholas Beato
5 // Created: Aug 17, 2010
6 //
7 // (C) Copyright 2010, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TABLERECOG_H_
21 #define TABLERECOG_H_
22 
23 #include "colpartitiongrid.h"
24 #include "genericvector.h"
25 
26 namespace tesseract {
27 
28 // There are 2 classes in this file. They have 2 different purposes.
29 // - StructuredTable contains the methods to find the structure given
30 // a specific bounding box and grow that structure.
31 // - TableRecognizer contains the methods to adjust the possible positions
32 // of a table without worrying about structure.
33 //
34 // To use these classes, the assumption is that the TableFinder will
35 // have a guess of the location of a table (or possibly over/undersegmented
36 // tables). The TableRecognizer is responsible for finding the table boundaries
37 // at a high level. The StructuredTable class is responsible for determining
38 // the structure of the table and trying to maximize its bounds while retaining
39 // the structure.
40 // (The latter part is not implemented yet, but that was the goal).
41 //
42 // While on the boundary discussion, keep in mind that this is a first pass.
43 // There should eventually be some things like internal structure checks,
44 // and, more importantly, surrounding text flow checks.
45 //
46 
47 // Usage:
48 // The StructuredTable class contains methods to query a potential table.
49 // It has functions to find structure, count rows, find ColPartitions that
50 // intersect gridlines, etc. It is not meant to blindly find a table. It
51 // is meant to start with a known table location and enhance it.
52 // Usage:
53 // ColPartitionGrid text_grid, line_grid; // init
54 // TBOX table_box; // known location of table location
55 //
56 // StructuredTable table;
57 // table.Init(); // construction code
58 // table.set_text_grid(/* text */); // These 2 grids can be the same!
59 // table.set_line_grid(/* lines */);
60 // table.set_min_text_height(10); // Filter vertical and tall text.
61 // // IMPORTANT! The table needs to be told where it is!
62 // table.set_bounding_box(table_box); // Set initial table location.
63 // if (table.FindWhitespacedStructure()) {
64 // // process table
65 // table.column_count(); // number of columns
66 // table.row_count(); // number of rows
67 // table.cells_count(); // number of cells
68 // table.bounding_box(); // updated bounding box
69 // // etc.
70 // }
71 //
73  public:
76 
77  // Initialization code. Must be called after the constructor.
78  void Init();
79 
80  // Sets the grids used by the table. These can be changed between
81  // calls to Recognize. They are treated as read-only data.
82  void set_text_grid(ColPartitionGrid* text);
83  void set_line_grid(ColPartitionGrid* lines);
84  // Filters text partitions that are ridiculously tall to prevent
85  // merging rows.
86  void set_max_text_height(int height);
87 
88  // Basic accessors. Some are treated as attributes despite having indirect
89  // representation.
90  bool is_lined() const;
91  int row_count() const;
92  int column_count() const;
93  int cell_count() const;
94  void set_bounding_box(const TBOX& box);
95  const TBOX& bounding_box() const;
96  int median_cell_height();
97  int median_cell_width();
98  int row_height(int row) const;
99  int column_width(int column) const;
100  int space_above() const;
101  int space_below() const;
102 
103  // Given enough horizontal and vertical lines in a region, create this table
104  // based on the structure given by the lines. Return true if it worked out.
105  // Code assumes the lines exist. It is the caller's responsibility to check
106  // for lines and find an appropriate bounding box.
107  bool FindLinedStructure();
108 
109  // The main subroutine for finding generic table structure. The function
110  // finds the grid structure in the given box. Returns true if a good grid
111  // exists, implying that "this" table is valid.
113 
117 
118  // Returns true if inserting part into the table does not cause any
119  // cell merges.
120  bool DoesPartitionFit(const ColPartition& part) const;
121  // Checks if a sub-table has multiple data cells filled.
122  int CountFilledCells();
123  int CountFilledCellsInRow(int row);
124  int CountFilledCellsInColumn(int column);
125  int CountFilledCells(int row_start, int row_end,
126  int column_start, int column_end);
127 
128  // Makes sure that at least one cell in a row has substantial area filled.
129  // This can filter out large whitespace caused by growing tables too far
130  // and page numbers.
131  // (currently bugged for some reason).
132  bool VerifyRowFilled(int row);
133  // Finds the filled area in a cell.
134  double CalculateCellFilledPercentage(int row, int column);
135 
136  // Debug display, draws the table in the given color. If the table is not
137  // valid, the table and "best" grid lines are still drawn in the given color.
138  void Display(ScrollView* window, ScrollView::Color color);
139 
140  protected:
141  // Clear the structure information.
142  void ClearStructure();
143 
147 
148  // Verifies the lines do not intersect partitions. This happens when
149  // the lines are in column boundaries and extend the full page. As a result,
150  // the grid lines go through column text. The condition is detectable.
151  bool VerifyLinedTableCells();
152 
156 
157  // This is the function to change if you want to filter resulting tables
158  // better. Right now it just checks for a minimum cell count and such.
159  // You could add things like maximum number of ColPartitions per cell or
160  // similar.
161  bool VerifyWhitespacedTable();
162  // Find the columns of a table using whitespace.
163  void FindWhitespacedColumns();
164  // Find the rows of a table using whitespace.
165  void FindWhitespacedRows();
166 
170 
171  // Calculates the whitespace around the table using the table boundary and
172  // the supplied grids (set_text_grid and set_line_grid).
173  void CalculateMargins();
174  // Update the table margins with the supplied grid. This is
175  // only called by calculate margins to use multiple grid sources.
176  void UpdateMargins(ColPartitionGrid* grid);
177  int FindVerticalMargin(ColPartitionGrid* grid, int start_x,
178  bool decrease) const;
179  int FindHorizontalMargin(ColPartitionGrid* grid, int start_y,
180  bool decrease) const;
181  // Calculates stats on the table, namely the median cell height and width.
182  void CalculateStats();
183 
187 
188  // Given a whitespaced table, this looks for bordering lines that might
189  // be page layout boxes around the table. It is necessary to get the margins
190  // correct on the table. If the lines are not joined, the margins will be
191  // the distance to the line, which is not right.
192  void AbsorbNearbyLines();
193 
194  // Nice utility function for finding partition gaps. You feed it a sorted
195  // list of all of the mins/maxes of the partitions in the table, and it gives
196  // you the gaps (middle). This works for both vertical and horizontal
197  // gaps.
198  //
199  // If you want to allow slight overlap in the division and the partitions,
200  // just scale down the partitions before inserting them in the list.
201  // Likewise, you can force at least some space between partitions.
202  // This trick is how the horizontal partitions are done (since the page
203  // skew could make it hard to find splits in the text).
204  //
205  // As a result, "0 distance" between closest partitions causes a gap.
206  // This is not a programmatic assumption. It is intentional and simplifies
207  // things.
208  //
209  // "max_merged" indicates both the minimum number of stacked partitions
210  // to cause a cell (add 1 to it), and the maximum number of partitions that
211  // a grid line can intersect. For example, if max_merged is 0, then lines
212  // are inserted wherever space exists between partitions. If it is 2,
213  // lines may intersect 2 partitions at most, but you also need at least
214  // 2 partitions to generate a line.
215  static void FindCellSplitLocations(const GenericVector<int>& min_list,
216  const GenericVector<int>& max_list,
217  int max_merged,
218  GenericVector<int>* locations);
219 
223 
224  // Counts the number of ColPartitions that intersect vertical cell
225  // division at this x value. Used by VerifyLinedTable.
226  int CountVerticalIntersections(int x);
227  int CountHorizontalIntersections(int y);
228 
229  // Counts how many text partitions are in this box.
230  int CountPartitions(const TBOX& box);
231 
235 
236  // Input data, used as read only data to make decisions.
237  ColPartitionGrid* text_grid_; // Text ColPartitions
238  ColPartitionGrid* line_grid_; // Line ColPartitions
239  // Table structure.
240  // bounding box is a convenient external representation.
241  // cell_x_ and cell_y_ indicate the grid lines.
242  TBOX bounding_box_; // Bounding box
243  GenericVectorEqEq<int> cell_x_; // Locations of vertical divisions (sorted)
244  GenericVectorEqEq<int> cell_y_; // Locations of horizontal divisions (sorted)
245  bool is_lined_; // Is the table backed up by a line structure
246  // Table margins, set via CalculateMargins
253  // Filters, used to prevent awkward partitions from destroying structure.
255 };
256 
258  public:
259  TableRecognizer();
261 
262  // Initialization code. Must be called after the constructor.
263  void Init();
264 
268 
269  // Sets the grids used by the table. These can be changed between
270  // calls to Recognize. They are treated as read-only data.
271  void set_text_grid(ColPartitionGrid* text);
272  void set_line_grid(ColPartitionGrid* lines);
273  // Sets some additional constraints on the table.
274  void set_min_height(int height);
275  void set_min_width(int width);
276  // Filters text partitions that are ridiculously tall to prevent
277  // merging rows. Note that "filters" refers to allowing horizontal
278  // cells to slice through them on the premise that they were
279  // merged text rows during previous layout.
280  void set_max_text_height(int height);
281 
282  // Given a guess location, the RecognizeTable function will try to find a
283  // structured grid in the area. On success, it will return a new
284  // StructuredTable (and assumes you will delete it). Otherwise,
285  // NULL is returned.
286  //
287  // Keep in mind, this may "overgrow" or "undergrow" the size of guess.
288  // Ideally, there is a either a one-to-one correspondence between
289  // the guess and table or no table at all. This is not the best of
290  // assumptions right now, but was made to try to keep things simple in
291  // the first pass.
292  //
293  // If a line structure is available on the page in the given region,
294  // the table will use the linear structure as it is.
295  // Otherwise, it will try to maximize the whitespace around it while keeping
296  // a grid structure. This is somewhat working.
297  //
298  // Since the combination of adjustments can get high, effort was
299  // originally made to keep the number of adjustments linear in the number
300  // of partitions. The underlying structure finding code used to be
301  // much more complex. I don't know how necessary this constraint is anymore.
302  // The evaluation of a possible table is kept within O(nlogn) in the size of
303  // the table (where size is the number of partitions in the table).
304  // As a result, the algorithm is capable of O(n^2 log n). Depending
305  // on the grid search size, it may be higher.
306  //
307  // Last note: it is possible to just try all partition boundaries at a high
308  // level O(n^4) and do a verification scheme (at least O(nlogn)). If there
309  // area 200 partitions on a page, this could be too costly. Effort could go
310  // into pruning the search, but I opted for something quicker. I'm confident
311  // that the independent adjustments can get similar results and keep the
312  // complextiy down. However, the other approach could work without using
313  // TableFinder at all if it is fast enough. It comes down to properly
314  // deciding what is a table. The code currently relies on TableFinder's
315  // guess to the location of a table for that.
316  StructuredTable* RecognizeTable(const TBOX& guess_box);
317 
318  protected:
322 
323  // Returns true if the given box has a lined table within it. The
324  // table argument will be updated with the table if the table exists.
325  bool RecognizeLinedTable(const TBOX& guess_box, StructuredTable* table);
326  // Returns true if the given box has a large number of horizontal and
327  // vertical lines present. If so, we assume the extent of these lines
328  // uniquely defines a table and find that table via SolveLinedTable.
329  bool HasSignificantLines(const TBOX& guess);
330 
331  // Given enough horizontal and vertical lines in a region, find a bounding
332  // box that encloses all of them (as well as newly introduced lines).
333  // The bounding box is the smallest box that encloses the lines in guess
334  // without having any lines sticking out of it.
335  // bounding_box is an in/out parameter.
336  // On input, it in the extents of the box to search.
337  // On output, it is the resulting bounding box.
338  bool FindLinesBoundingBox(TBOX* bounding_box);
339  // Iteration in above search.
340  // bounding_box is an in/out parameter.
341  // On input, it in the extents of the box to search.
342  // On output, it is the resulting bounding box.
343  bool FindLinesBoundingBoxIteration(TBOX* bounding_box);
344 
348 
349  // Returns true if the given box has a whitespaced table within it. The
350  // table argument will be updated if the table exists. Also note
351  // that this method will fail if the guess_box center is not
352  // mostly within the table.
353  bool RecognizeWhitespacedTable(const TBOX& guess_box, StructuredTable* table);
354 
355  // Finds the location of a horizontal split relative to y.
356  // This function is mostly unused now. If the SolveWhitespacedTable
357  // changes much, it can be removed. Note, it isn't really as reliable
358  // as I thought. I went with alternatives for most of the other uses.
359  int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);
360 
361  // Indicates that a table row is weak. This means that it has
362  // many missing data cells or very large cell heights compared.
363  // to the rest of the table.
364  static bool IsWeakTableRow(StructuredTable* table, int row);
365 
366  // Input data, used as read only data to make decisions.
367  ColPartitionGrid* text_grid_; // Text ColPartitions
368  ColPartitionGrid* line_grid_; // Line ColPartitions
369  // Table constraints, a "good" table must satisfy these.
372  // Filters, used to prevent awkward partitions from destroying structure.
373  int max_text_height_; // Horizontal lines may intersect taller text.
374 };
375 
376 } // namespace tesseract
377 
378 #endif /* TABLERECOG_H_ */
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:723
bool FindLinesBoundingBox(TBOX *bounding_box)
Definition: tablerecog.cpp:815
const TBOX & bounding_box() const
Definition: tablerecog.cpp:110
ColPartitionGrid * line_grid_
Definition: tablerecog.h:368
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:89
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:290
ColPartitionGrid * text_grid_
Definition: tablerecog.h:367
bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:875
bool FindLinesBoundingBoxIteration(TBOX *bounding_box)
Definition: tablerecog.cpp:837
void set_min_height(int height)
Definition: tablerecog.cpp:726
void set_max_text_height(int height)
Definition: tablerecog.cpp:92
void set_min_width(int width)
Definition: tablerecog.cpp:729
bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:759
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:639
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:720
bool DoesPartitionFit(const ColPartition &part) const
Definition: tablerecog.cpp:212
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:502
void set_bounding_box(const TBOX &box)
Definition: tablerecog.cpp:107
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:736
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:485
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
bool HasSignificantLines(const TBOX &guess)
Definition: tablerecog.cpp:776
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom)
int column_width(int column) const
Definition: tablerecog.cpp:123
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:86
bool VerifyRowFilled(int row)
Definition: tablerecog.cpp:256
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:593
int CountFilledCellsInColumn(int column)
Definition: tablerecog.cpp:230
int row_height(int row) const
Definition: tablerecog.cpp:119
void set_max_text_height(int height)
Definition: tablerecog.cpp:732
int CountFilledCellsInRow(int row)
Definition: tablerecog.cpp:227
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:663
static bool IsWeakTableRow(StructuredTable *table, int row)
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
Definition: rect.h:30
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:689
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:475
double CalculateCellFilledPercentage(int row, int column)
Definition: tablerecog.cpp:267