tesseract  3.05.02
textord.h
Go to the documentation of this file.
1 // File: textord.h
3 // Description: The Textord class definition gathers text line and word
4 // finding functionality.
5 // Author: Ray Smith
6 // Created: Fri Mar 13 14:29:01 PDT 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_TEXTORD_TEXTORD_H__
22 #define TESSERACT_TEXTORD_TEXTORD_H__
23 
24 #include "ccstruct.h"
25 #include "bbgrid.h"
26 #include "blobbox.h"
27 #include "gap_map.h"
28 #include "publictypes.h" // For PageSegMode.
29 
30 class FCOORD;
31 class BLOCK_LIST;
32 class PAGE_RES;
33 class TO_BLOCK;
34 class TO_BLOCK_LIST;
35 class ScrollView;
36 
37 namespace tesseract {
38 
39 // A simple class that can be used by BBGrid to hold a word and an expanded
40 // bounding box that makes it easy to find words to attach diacritics to.
// The word is only borrowed; the padded box is computed once, at construction.
41 class WordWithBox {
42  public:
  // Creates a holder whose word is NULL. true_bounding_box() and RejBlobs()
  // dereference word_ unconditionally, so they must not be called on an
  // object built this way.
43  WordWithBox() : word_(NULL) {}
  // Wraps word and caches its bounding box. word is dereferenced in the
  // initializer list, so it must be non-NULL.
44  explicit WordWithBox(WERD *word)
45  : word_(word), bounding_box_(word->bounding_box()) {
  // Grow the cached box by its own height, passed as both xpad and ypad.
46  int height = bounding_box_.height();
47  bounding_box_.pad(height, height);
48  }
49 
  // Returns the cached, padded bounding box (not the word's own box).
50  const TBOX &bounding_box() const { return bounding_box_; }
51  // Returns the bounding box of only the good blobs, as reported by the word
  // itself on each call (returned by value, not cached here).
52  TBOX true_bounding_box() const { return word_->true_bounding_box(); }
  // Forwards to rej_cblob_list() on the borrowed word.
53  C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); }
  // Read-only access to the borrowed word; NULL if default-constructed.
54  const WERD *word() const { return word_; }
55 
56  private:
57  // Borrowed pointer to a real word somewhere that must outlive this class.
58  WERD *word_;
59  // Cached expanded bounding box of the word, padded all round by its height.
60  TBOX bounding_box_;
61 };
62 
63 // Make it usable by BBGrid.
// NOTE(review): CLISTIZEH presumably declares the WordWithBox_CLIST and
// WordWithBox_C_IT types named in the typedefs below - confirm in clst.h.
64 CLISTIZEH(WordWithBox)
// Grid and grid-search instantiations over WordWithBox elements.
65 typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordGrid;
66 typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordSearch;
67 
// Gathers text line and word finding functionality. The "xxx.cpp ////" banners
// below group the declarations by source file (presumably the file in which
// each group is defined).
68 class Textord {
69  public:
  // NOTE(review): ccstruct is presumably kept in ccstruct_ (see the private
  // section) and not owned by this class - confirm in the constructor body.
70  explicit Textord(CCStruct* ccstruct);
71  ~Textord();
72 
73  // Make the textlines and words inside each block.
74  // binary_pix is mandatory and is the binarized input after line removal.
75  // grey_pix is optional, but if present must match the binary_pix in size,
76  // and must be a *real* grey image instead of binary_pix * 255.
77  // thresholds_pix is expected to be present iff grey_pix is present and
78  // can be an integer factor reduction of the grey_pix. It represents the
79  // thresholds that were used to create the binary_pix from the grey_pix.
80  // diacritic_blobs contains small confusing components that should be added
81  // to the appropriate word(s) in case they are really diacritics.
  // NOTE(review): pageseg_mode, reskew, width, height, use_box_bottoms, blocks
  // and to_blocks are not described here; see the definition for their roles.
82  void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width,
83  int height, Pix *binary_pix, Pix *thresholds_pix,
84  Pix *grey_pix, bool use_box_bottoms,
85  BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,
86  TO_BLOCK_LIST *to_blocks);
87 
88  // If we were supposed to return only a single textline, and there is more
89  // than one, clean up and leave only the best.
90  void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES* page_res);
91 
  // Getter and setter for the use_cjk_fp_model_ flag.
92  bool use_cjk_fp_model() const {
93  return use_cjk_fp_model_;
94  }
95  void set_use_cjk_fp_model(bool flag) {
96  use_cjk_fp_model_ = flag;
97  }
98 
99  // tospace.cpp ///////////////////////////////////////////
100  void to_spacing(
101  ICOORD page_tr, // top right of page
102  TO_BLOCK_LIST *blocks // blocks on page
103  );
104  ROW *make_prop_words(TO_ROW *row, // row to make
105  FCOORD rotation // for drawing
106  );
107  ROW *make_blob_words(TO_ROW *row, // row to make
108  FCOORD rotation // for drawing
109  );
110  // tordmain.cpp ///////////////////////////////////////////
111  void find_components(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
112  void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on);
113 
114  private:
115  // For underlying memory management and other utilities.
116  CCStruct* ccstruct_;
117 
118  // The size of the input image.
119  ICOORD page_tr_;
120 
  // Flag exposed through use_cjk_fp_model() / set_use_cjk_fp_model().
121  bool use_cjk_fp_model_;
122 
123  // makerow.cpp ///////////////////////////////////////////
124  // Make the textlines inside each block.
125  void MakeRows(PageSegMode pageseg_mode, const FCOORD& skew,
126  int width, int height, TO_BLOCK_LIST* to_blocks);
127  // Make the textlines inside a single block.
128  void MakeBlockRows(int min_spacing, int max_spacing,
129  const FCOORD& skew, TO_BLOCK* block,
130  ScrollView* win);
131 
  // The next three members are public although they sit inside the otherwise
  // private makerow.cpp group; access reverts to private right after them.
132  public:
133  void compute_block_xheight(TO_BLOCK *block, float gradient);
134  void compute_row_xheight(TO_ROW *row, // row to do
135  const FCOORD& rotation,
136  float gradient, // global skew
137  int block_line_size);
138  void make_spline_rows(TO_BLOCK *block, // block to do
139  float gradient, // gradient to fit
140  BOOL8 testing_on);
141  private:
  // NOTE(review): the "correct orientation" comment below does not obviously
  // describe a parameter named testing_on - confirm against the definition.
143  void make_old_baselines(TO_BLOCK *block, // block to do
144  BOOL8 testing_on, // correct orientation
145  float gradient);
146  void correlate_lines(TO_BLOCK *block, float gradient);
147  void correlate_neighbours(TO_BLOCK *block, // block rows are in.
148  TO_ROW **rows, // rows of block.
149  int rowcount); // no of rows to do.
150  int correlate_with_stats(TO_ROW **rows, // rows of block.
151  int rowcount, // no of rows to do.
152  TO_BLOCK* block);
153  void find_textlines(TO_BLOCK *block, // block row is in
154  TO_ROW *row, // row to do
155  int degree, // required approximation
156  QSPLINE *spline); // starting spline
157  // tospace.cpp ///////////////////////////////////////////
158  //DEBUG USE ONLY
159  void block_spacing_stats(TO_BLOCK *block,
160  GAPMAP *gapmap,
161  BOOL8 &old_text_ord_proportional,
162  // resulting estimate
163  inT16 &block_space_gap_width,
164  // resulting estimate
165  inT16 &block_non_space_gap_width
166  );
167  void row_spacing_stats(TO_ROW *row,
168  GAPMAP *gapmap,
169  inT16 block_idx,
170  inT16 row_idx,
171  // estimate for block
172  inT16 block_space_gap_width,
173  // estimate for block
174  inT16 block_non_space_gap_width
175  );
176  void old_to_method(TO_ROW *row,
177  STATS *all_gap_stats,
178  STATS *space_gap_stats,
179  STATS *small_gap_stats,
180  inT16 block_space_gap_width,
181  // estimate for block
182  inT16 block_non_space_gap_width
183  );
184  BOOL8 isolated_row_stats(TO_ROW *row,
185  GAPMAP *gapmap,
186  STATS *all_gap_stats,
187  BOOL8 suspected_table,
188  inT16 block_idx,
189  inT16 row_idx);
190  inT16 stats_count_under(STATS *stats, inT16 threshold);
191  void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats);
192  BOOL8 make_a_word_break(TO_ROW *row, // row being made
193  TBOX blob_box, // for next_blob
194  inT16 prev_gap,
195  TBOX prev_blob_box,
196  inT16 real_current_gap,
197  inT16 within_xht_current_gap,
198  TBOX next_blob_box,
199  inT16 next_gap,
200  uinT8 &blanks, // how many blanks? (presumably an output - confirm)
201  BOOL8 &fuzzy_sp,
202  BOOL8 &fuzzy_non,
203  BOOL8& prev_gap_was_a_space,
204  BOOL8& break_at_next_gap);
205  BOOL8 narrow_blob(TO_ROW *row, TBOX blob_box);
206  BOOL8 wide_blob(TO_ROW *row, TBOX blob_box);
207  BOOL8 suspected_punct_blob(TO_ROW *row, TBOX box);
208  void peek_at_next_gap(TO_ROW *row,
209  BLOBNBOX_IT box_it,
210  TBOX &next_blob_box,
211  inT16 &next_gap,
212  inT16 &next_within_xht_gap);
213  void mark_gap(TBOX blob, // blob following gap
214  inT16 rule, // heuristic id
215  inT16 prev_gap,
216  inT16 prev_blob_width,
217  inT16 current_gap,
218  inT16 next_blob_width,
219  inT16 next_gap);
220  float find_mean_blob_spacing(WERD *word);
221  BOOL8 ignore_big_gap(TO_ROW *row,
222  inT32 row_length,
223  GAPMAP *gapmap,
224  inT16 left,
225  inT16 right);
226  // get bounding box
227  TBOX reduced_box_next(TO_ROW *row, // current row
228  BLOBNBOX_IT *it // iterator to blobs
229  );
230  TBOX reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, inT16 *left_above_xht);
231  // tordmain.cpp ///////////////////////////////////////////
232  float filter_noise_blobs(BLOBNBOX_LIST *src_list,
233  BLOBNBOX_LIST *noise_list,
234  BLOBNBOX_LIST *small_list,
235  BLOBNBOX_LIST *large_list);
236  // Fixes the block so it obeys all the rules:
237  // Must have at least one ROW.
238  // Must have at least one WERD.
239  // WERDs contain a fake blob.
240  void cleanup_nontext_block(BLOCK* block);
241  void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);
242  BOOL8 clean_noise_from_row(ROW *row);
243  void clean_noise_from_words(ROW *row);
244  // Remove outlines that are a tiny fraction in either width or height
245  // of the word height.
246  void clean_small_noise_from_words(ROW *row);
247  // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
248  // TransferDiacriticsToWords to copy the diacritic blobs to the most
249  // appropriate words in the group of blocks. Source blobs are not touched.
250  void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
251  BLOCK_LIST* blocks);
252  // Places a copy of blobs that are near a word (after applying rotation to the
253  // blob) in the most appropriate word, unless there is doubt, in which case a
254  // blob can end up in two words. Source blobs are not touched.
255  void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs,
256  const FCOORD &rotation, WordGrid *word_grid);
257 
258  public:
  // Parameters declared with the BOOL_VAR_H / INT_VAR_H / double_VAR_H
  // macros, each taking (name, val, comment).
  // NOTE(review): val is presumably the default value and comment the
  // description shown to users - confirm against params.h. The comment strings
  // are runtime text, so any wording fix must be made deliberately in code.
259  // makerow.cpp ///////////////////////////////////////////
260  BOOL_VAR_H(textord_single_height_mode, false,
261  "Script has no xheight, so use a single mode for horizontal text");
262  // tospace.cpp ///////////////////////////////////////////
263  BOOL_VAR_H(tosp_old_to_method, false, "Space stats use prechopping?");
264  BOOL_VAR_H(tosp_old_to_constrain_sp_kn, false,
265  "Constrain relative values of inter and intra-word gaps for "
266  "old_to_method.");
267  BOOL_VAR_H(tosp_only_use_prop_rows, true,
268  "Block stats to use fixed pitch rows?");
269  BOOL_VAR_H(tosp_force_wordbreak_on_punct, false,
270  "Force word breaks on punct to break long lines in non-space "
271  "delimited langs");
  // NOTE(review): this description is identical to the one given for
  // tosp_old_to_method above - possibly a copy/paste; confirm which is right.
272  BOOL_VAR_H(tosp_use_pre_chopping, false,
273  "Space stats use prechopping?");
274  BOOL_VAR_H(tosp_old_to_bug_fix, false,
275  "Fix suspected bug in old code");
  // NOTE(review): the next four parameters share one description string, so
  // the text alone does not distinguish them.
276  BOOL_VAR_H(tosp_block_use_cert_spaces, true,
277  "Only stat OBVIOUS spaces");
278  BOOL_VAR_H(tosp_row_use_cert_spaces, true,
279  "Only stat OBVIOUS spaces");
280  BOOL_VAR_H(tosp_narrow_blobs_not_cert, true,
281  "Only stat OBVIOUS spaces");
282  BOOL_VAR_H(tosp_row_use_cert_spaces1, true,
283  "Only stat OBVIOUS spaces");
284  BOOL_VAR_H(tosp_recovery_isolated_row_stats, true,
285  "Use row alone when inadequate cert spaces");
286  BOOL_VAR_H(tosp_only_small_gaps_for_kern, false, "Better guess");
287  BOOL_VAR_H(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?");
288  BOOL_VAR_H(tosp_fuzzy_limit_all, true,
289  "Don't restrict kn->sp fuzzy limit to tables");
290  BOOL_VAR_H(tosp_stats_use_xht_gaps, true,
291  "Use within xht gap for wd breaks");
292  BOOL_VAR_H(tosp_use_xht_gaps, true,
293  "Use within xht gap for wd breaks");
294  BOOL_VAR_H(tosp_only_use_xht_gaps, false,
295  "Only use within xht gap for wd breaks");
296  BOOL_VAR_H(tosp_rule_9_test_punct, false,
297  "Don't chng kn to space next to punct");
298  BOOL_VAR_H(tosp_flip_fuzz_kn_to_sp, true, "Default flip");
299  BOOL_VAR_H(tosp_flip_fuzz_sp_to_kn, true, "Default flip");
300  BOOL_VAR_H(tosp_improve_thresh, false,
301  "Enable improvement heuristic");
302  INT_VAR_H(tosp_debug_level, 0, "Debug data");
303  INT_VAR_H(tosp_enough_space_samples_for_median, 3,
304  "or should we use mean");
305  INT_VAR_H(tosp_redo_kern_limit, 10,
306  "No.samples reqd to reestimate for row");
307  INT_VAR_H(tosp_few_samples, 40,
308  "No.gaps reqd with 1 large gap to treat as a table");
309  INT_VAR_H(tosp_short_row, 20,
310  "No.gaps reqd with few cert spaces to use certs");
311  INT_VAR_H(tosp_sanity_method, 1, "How to avoid being silly");
312  double_VAR_H(tosp_old_sp_kn_th_factor, 2.0,
313  "Factor for defining space threshold in terms of space and "
314  "kern sizes");
315  double_VAR_H(tosp_threshold_bias1, 0,
316  "how far between kern and space?");
317  double_VAR_H(tosp_threshold_bias2, 0,
318  "how far between kern and space?");
319  double_VAR_H(tosp_narrow_fraction, 0.3,
320  "Fract of xheight for narrow");
321  double_VAR_H(tosp_narrow_aspect_ratio, 0.48,
322  "narrow if w/h less than this");
323  double_VAR_H(tosp_wide_fraction, 0.52, "Fract of xheight for wide");
  // NOTE(review): the description says "less than", the same direction as the
  // narrow variant above - verify the intended comparison in wide_blob().
324  double_VAR_H(tosp_wide_aspect_ratio, 0.0,
325  "wide if w/h less than this");
326  double_VAR_H(tosp_fuzzy_space_factor, 0.6,
327  "Fract of xheight for fuzz sp");
328  double_VAR_H(tosp_fuzzy_space_factor1, 0.5,
329  "Fract of xheight for fuzz sp");
330  double_VAR_H(tosp_fuzzy_space_factor2, 0.72,
331  "Fract of xheight for fuzz sp");
332  double_VAR_H(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern");
333  double_VAR_H(tosp_kern_gap_factor1, 2.0,
334  "gap ratio to flip kern->sp");
335  double_VAR_H(tosp_kern_gap_factor2, 1.3,
336  "gap ratio to flip kern->sp");
337  double_VAR_H(tosp_kern_gap_factor3, 2.5,
338  "gap ratio to flip kern->sp");
339  double_VAR_H(tosp_ignore_big_gaps, -1, "xht multiplier");
340  double_VAR_H(tosp_ignore_very_big_gaps, 3.5, "xht multiplier");
341  double_VAR_H(tosp_rep_space, 1.6, "rep gap multiplier for space");
342  double_VAR_H(tosp_enough_small_gaps, 0.65,
343  "Fract of kerns reqd for isolated row stats");
344  double_VAR_H(tosp_table_kn_sp_ratio, 2.25,
345  "Min difference of kn & sp in table");
346  double_VAR_H(tosp_table_xht_sp_ratio, 0.33,
347  "Expect spaces bigger than this");
348  double_VAR_H(tosp_table_fuzzy_kn_sp_ratio, 3.0,
349  "Fuzzy if less than this");
350  double_VAR_H(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg");
351  double_VAR_H(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg");
352  double_VAR_H(tosp_min_sane_kn_sp, 1.5,
353  "Don't trust spaces less than this time kn");
354  double_VAR_H(tosp_init_guess_kn_mult, 2.2,
355  "Thresh guess - mult kn by this");
356  double_VAR_H(tosp_init_guess_xht_mult, 0.28,
357  "Thresh guess - mult xht by this");
358  double_VAR_H(tosp_max_sane_kn_thresh, 5.0,
359  "Multiplier on kn to limit thresh");
360  double_VAR_H(tosp_flip_caution, 0.0,
361  "Don't autoflip kn to sp when large separation");
362  double_VAR_H(tosp_large_kerning, 0.19,
363  "Limit use of xht gap with large kns");
364  double_VAR_H(tosp_dont_fool_with_small_kerns, -1,
365  "Limit use of xht gap with odd small kns");
366  double_VAR_H(tosp_near_lh_edge, 0,
367  "Don't reduce box if the top left is non blank");
368  double_VAR_H(tosp_silly_kn_sp_gap, 0.2,
369  "Don't let sp minus kn get too small");
370  double_VAR_H(tosp_pass_wide_fuzz_sp_to_context, 0.75,
371  "How wide fuzzies need context");
372  // tordmain.cpp ///////////////////////////////////////////
373  BOOL_VAR_H(textord_no_rejects, false, "Don't remove noise blobs");
374  BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
375  BOOL_VAR_H(textord_show_boxes, false, "Display boxes");
376  INT_VAR_H(textord_max_noise_size, 7, "Pixel size of noise");
377  INT_VAR_H(textord_baseline_debug, 0, "Baseline debug level");
378  double_VAR_H(textord_blob_size_bigile, 95, "Percentile for large blobs");
379  double_VAR_H(textord_noise_area_ratio, 0.7,
380  "Fraction of bounding box for noise");
381  double_VAR_H(textord_blob_size_smallile, 20, "Percentile for small blobs");
382  double_VAR_H(textord_initialx_ile, 0.75, "Ile of sizes for xheight guess");
383  double_VAR_H(textord_initialasc_ile, 0.90, "Ile of sizes for xheight guess");
384  INT_VAR_H(textord_noise_sizefraction, 10, "Fraction of size for maxima");
385  double_VAR_H(textord_noise_sizelimit, 0.5, "Fraction of x for big t count");
386  INT_VAR_H(textord_noise_translimit, 16, "Transitions for normal blob");
387  double_VAR_H(textord_noise_normratio, 2.0, "Dot to norm ratio for deletion");
388  BOOL_VAR_H(textord_noise_rejwords, true, "Reject noise-like words");
389  BOOL_VAR_H(textord_noise_rejrows, true, "Reject noise-like rows");
390  double_VAR_H(textord_noise_syfract, 0.2, "xh fract error for norm blobs");
391  double_VAR_H(textord_noise_sxfract, 0.4,
392  "xh fract width error for norm blobs");
393  double_VAR_H(textord_noise_hfract, 1.0/64,
394  "Height fraction to discard outlines as speckle noise");
395  INT_VAR_H(textord_noise_sncount, 1, "super norm blobs to save row");
396  double_VAR_H(textord_noise_rowratio, 6.0, "Dot to norm ratio for deletion");
  // NOTE(review): uses the FALSE macro where every other BOOL_VAR_H entry in
  // this class uses the false keyword.
397  BOOL_VAR_H(textord_noise_debug, FALSE, "Debug row garbage detector");
398  double_VAR_H(textord_blshift_maxshift, 0.00, "Max baseline shift");
399  double_VAR_H(textord_blshift_xfraction, 9.99, "Min size of baseline shift");
400 };
401 } // namespace tesseract.
402 
403 #endif // TESSERACT_TEXTORD_TEXTORD_H__
TBOX true_bounding_box() const
Definition: werd.cpp:181
bool use_cjk_fp_model() const
Definition: textord.h:92
#define CLISTIZEH(CLASSNAME)
Definition: clst.h:901
TBOX true_bounding_box() const
Definition: textord.h:52
short inT16
Definition: host.h:33
integer coordinate
Definition: points.h:30
#define INT_VAR_H(name, val, comment)
Definition: params.h:265
unsigned char uinT8
Definition: host.h:32
unsigned char BOOL8
Definition: host.h:46
const TBOX & bounding_box() const
Definition: textord.h:50
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:268
Definition: werd.h:60
#define FALSE
Definition: capi.h:46
inT16 height() const
Definition: rect.h:104
int inT32
Definition: host.h:35
Definition: ocrrow.h:32
Definition: points.h:189
Definition: ocrblock.h:30
WordWithBox(WERD *word)
Definition: textord.h:44
void pad(int xpad, int ypad)
Definition: rect.h:127
C_BLOB_LIST * RejBlobs() const
Definition: textord.h:53
const WERD * word() const
Definition: textord.h:54
Definition: gap_map.h:15
Definition: rect.h:30
Definition: statistc.h:33
#define double_VAR_H(name, val, comment)
Definition: params.h:274
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95