tesseract  3.05.02
tordmain.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tordmain.cpp (Formerly textordp.c)
3  * Description: C++ top level textord code.
4  * Author: Ray Smith
5  * Created: Tue Jul 28 17:12:33 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #ifdef __UNIX__
25 #include <assert.h>
26 #endif
27 #include "stderr.h"
28 #include "globaloc.h"
29 #include "blread.h"
30 #include "blobbox.h"
31 #include "ccstruct.h"
32 #include "edgblob.h"
33 #include "drawtord.h"
34 #include "makerow.h"
35 #include "wordseg.h"
36 #include "textord.h"
37 #include "tordmain.h"
38 
39 #include "allheaders.h"
40 
41 #undef EXTERN
42 #define EXTERN
43 
44 #define MAX_NEAREST_DIST 600 //for block skew stats
45 
46 namespace tesseract {
47 
48 CLISTIZE(WordWithBox)
49 
50 /**********************************************************************
51  * SetBlobStrokeWidth
52  *
53  * Set the horizontal and vertical stroke widths in the blob.
54  **********************************************************************/
55 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
56  // Cut the blob rectangle into a Pix.
57  int pix_height = pixGetHeight(pix);
58  const TBOX& box = blob->bounding_box();
59  int width = box.width();
60  int height = box.height();
61  Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
62  width, height);
63  Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
64  boxDestroy(&blob_pix_box);
65  Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
66  pixDestroy(&pix_blob);
67  // Compute the stroke widths.
68  uinT32* data = pixGetData(dist_pix);
69  int wpl = pixGetWpl(dist_pix);
70  // Horizontal width of stroke.
71  STATS h_stats(0, width + 1);
72  for (int y = 0; y < height; ++y) {
73  uinT32* pixels = data + y*wpl;
74  int prev_pixel = 0;
75  int pixel = GET_DATA_BYTE(pixels, 0);
76  for (int x = 1; x < width; ++x) {
77  int next_pixel = GET_DATA_BYTE(pixels, x);
78  // We are looking for a pixel that is equal to its vertical neighbours,
79  // yet greater than its left neighbour.
80  if (prev_pixel < pixel &&
81  (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
82  (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
83  if (pixel > next_pixel) {
84  // Single local max, so an odd width.
85  h_stats.add(pixel * 2 - 1, 1);
86  } else if (pixel == next_pixel && x + 1 < width &&
87  pixel > GET_DATA_BYTE(pixels, x + 1)) {
88  // Double local max, so an even width.
89  h_stats.add(pixel * 2, 1);
90  }
91  }
92  prev_pixel = pixel;
93  pixel = next_pixel;
94  }
95  }
96  // Vertical width of stroke.
97  STATS v_stats(0, height + 1);
98  for (int x = 0; x < width; ++x) {
99  int prev_pixel = 0;
100  int pixel = GET_DATA_BYTE(data, x);
101  for (int y = 1; y < height; ++y) {
102  uinT32* pixels = data + y*wpl;
103  int next_pixel = GET_DATA_BYTE(pixels, x);
104  // We are looking for a pixel that is equal to its horizontal neighbours,
105  // yet greater than its upper neighbour.
106  if (prev_pixel < pixel &&
107  (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
108  (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
109  if (pixel > next_pixel) {
110  // Single local max, so an odd width.
111  v_stats.add(pixel * 2 - 1, 1);
112  } else if (pixel == next_pixel && y + 1 < height &&
113  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
114  // Double local max, so an even width.
115  v_stats.add(pixel * 2, 1);
116  }
117  }
118  prev_pixel = pixel;
119  pixel = next_pixel;
120  }
121  }
122  pixDestroy(&dist_pix);
123  // Store the horizontal and vertical width in the blob, keeping both
124  // widths if there is enough information, otherwse only the one with
125  // the most samples.
126  // If there are insufficent samples, store zero, rather than using
127  // 2*area/perimeter, as the numbers that gives do not match the numbers
128  // from the distance method.
129  if (h_stats.get_total() >= (width + height) / 4) {
130  blob->set_horz_stroke_width(h_stats.ile(0.5f));
131  if (v_stats.get_total() >= (width + height) / 4)
132  blob->set_vert_stroke_width(v_stats.ile(0.5f));
133  else
134  blob->set_vert_stroke_width(0.0f);
135  } else {
136  if (v_stats.get_total() >= (width + height) / 4 ||
137  v_stats.get_total() > h_stats.get_total()) {
138  blob->set_horz_stroke_width(0.0f);
139  blob->set_vert_stroke_width(v_stats.ile(0.5f));
140  } else {
141  blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
142  : 0.0f);
143  blob->set_vert_stroke_width(0.0f);
144  }
145  }
146 }
147 
148 /**********************************************************************
149  * assign_blobs_to_blocks2
150  *
151  * Make a list of TO_BLOCKs for portrait and landscape orientation.
152  **********************************************************************/
153 
// NOTE(review): listing line 154, which opens this function's signature, is
// absent from this extract. The call at listing line 225 passes
// (pix, blocks, to_blocks) and the body reads pix, so the missing line
// presumably declares the function name and a leading Pix* pix parameter --
// confirm against the upstream file.
//
// For every BLOCK in blocks a new TO_BLOCK is created and appended to
// port_blocks. The block's blob_list() entries are extracted, wrapped in
// BLOBNBOXes with their stroke widths set, and placed in the TO_BLOCK's blobs
// list; the block's reject_blobs() entries get the same treatment and go to
// the TO_BLOCK's noise_blobs list.
155  BLOCK_LIST *blocks, // blocks to process
156  TO_BLOCK_LIST *port_blocks) { // output list
157  BLOCK *block; // current block
158  BLOBNBOX *newblob; // created blob
159  C_BLOB *blob; // current blob
160  BLOCK_IT block_it = blocks;
161  C_BLOB_IT blob_it; // iterator
162  BLOBNBOX_IT port_box_it; // iterator
163  // destination iterator
164  TO_BLOCK_IT port_block_it = port_blocks;
165  TO_BLOCK *port_block; // created block
166 
167  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
168  block = block_it.data();
169  port_block = new TO_BLOCK(block);
170 
171  // Convert the good outlines to block->blob_list
172  port_box_it.set_to_list(&port_block->blobs);
173  blob_it.set_to_list(block->blob_list());
174  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// extract() removes the C_BLOB from the source list before it is wrapped.
175  blob = blob_it.extract();
176  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
177  SetBlobStrokeWidth(pix, newblob);
178  port_box_it.add_after_then_move(newblob);
179  }
180 
181  // Put the rejected outlines in block->noise_blobs, which allows them to
182  // be reconsidered and sorted back into rows and recover outlines mistakenly
183  // rejected.
184  port_box_it.set_to_list(&port_block->noise_blobs);
185  blob_it.set_to_list(block->reject_blobs());
186  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
187  blob = blob_it.extract();
188  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
189  SetBlobStrokeWidth(pix, newblob);
190  port_box_it.add_after_then_move(newblob);
191  }
192 
193  port_block_it.add_after_then_move(port_block);
194  }
195 }
196 
197 /**********************************************************************
198  * find_components
199  *
200  * Find the C_OUTLINEs of the connected components in each block, put them
201  * in C_BLOBs, and filter them by size, putting the different size
202  * grades on different lists in the matching TO_BLOCK in to_blocks.
203  **********************************************************************/
204 
// Top-level component finder: extracts edges for every text block, converts
// the resulting blobs into TO_BLOCKs appended to to_blocks, then grades the
// blobs by size via filter_blobs().
//   pix:       page image; its width and height must each fit in MAX_INT16.
//   blocks:    page blocks to process.
//   to_blocks: receives one TO_BLOCK per entry of blocks.
// If the image is too large a message is printed and nothing else is done,
// so to_blocks is left as it was passed in.
205 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
206  TO_BLOCK_LIST *to_blocks) {
207  int width = pixGetWidth(pix);
208  int height = pixGetHeight(pix);
209  if (width > MAX_INT16 || height > MAX_INT16) {
210  tprintf("Input image too large! (%d, %d)\n", width, height);
211  return; // Can't handle it.
212  }
213 
// NOTE(review): listing line 214 is absent from this extract.
215 
216  BLOCK_IT block_it(blocks); // iterator
217  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
218  block_it.forward()) {
219  BLOCK* block = block_it.data();
// Edges are extracted only for blocks with no poly_block or a text poly_block.
220  if (block->poly_block() == NULL || block->poly_block()->IsText()) {
221  extract_edges(pix, block);
222  }
223  }
224 
225  assign_blobs_to_blocks2(pix, blocks, to_blocks);
// The page top-right corner is only used by filter_blobs to size the debug window.
226  ICOORD page_tr(width, height);
227  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
228 }
229 
230 /**********************************************************************
231  * filter_blobs
232  *
233  * Sort the blobs into sizes in all the blocks for later work.
234  **********************************************************************/
235 
// Grades the blobs of every TO_BLOCK in blocks: filter_noise_blobs() splits
// each block's blobs into the blobs/noise/small/large lists and its return
// value becomes the block's line_size (forced to 1 if it comes back as 0).
//   page_tr:    top-right corner of the page, passed to create_to_win().
//   blocks:     TO_BLOCKs to process in place.
//   testing_on: enables the optional debug plotting.
236 void Textord::filter_blobs(ICOORD page_tr, // top right
237  TO_BLOCK_LIST *blocks, // output list
238  BOOL8 testing_on) { // for plotting
239  TO_BLOCK_IT block_it = blocks; // destination iterator
240  TO_BLOCK *block; // created block
241 
242  #ifndef GRAPHICS_DISABLED
243  if (to_win != NULL)
244  to_win->Clear();
245  #endif // GRAPHICS_DISABLED
246 
247  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
248  block_it.forward()) {
249  block = block_it.data();
250  block->line_size = filter_noise_blobs(&block->blobs,
251  &block->noise_blobs,
252  &block->small_blobs,
253  &block->large_blobs);
// A zero line size is replaced by 1 before it is used as a multiplier below.
254  if (block->line_size == 0) block->line_size = 1;
255  block->line_spacing = block->line_size *
// NOTE(review): listing lines 256-261 (the rest of this expression and any
// following statements) are absent from this extract.
262 
263  #ifndef GRAPHICS_DISABLED
264  if (textord_show_blobs && testing_on) {
265  if (to_win == NULL)
266  create_to_win(page_tr);
267  block->plot_graded_blobs(to_win);
268  }
269  if (textord_show_boxes && testing_on) {
270  if (to_win == NULL)
271  create_to_win(page_tr);
// NOTE(review): listing lines 272-275 are absent from this extract.
276  }
277  #endif // GRAPHICS_DISABLED
278  }
279 }
280 
281 /**********************************************************************
282  * filter_noise_blobs
283  *
284  * Move small blobs to a separate list.
285  **********************************************************************/
286 
// Redistributes the blobs of src_list between src_list, noise_list,
// small_list and large_list according to their size, and returns a size
// estimate derived from the height statistics of the blobs that remain in
// src_list.
// NOTE(review): listing lines 309, 312, 320-323 and 349 are absent from this
// extract, so the exact noise/small conditions, the max_y formula and any
// adjustment applied to max_height before the final comparison cannot be
// read from here.
287 float Textord::filter_noise_blobs(
288  BLOBNBOX_LIST *src_list, // original list
289  BLOBNBOX_LIST *noise_list, // noise list
290  BLOBNBOX_LIST *small_list, // small blobs
291  BLOBNBOX_LIST *large_list) { // large blobs
292  inT16 height; //height of blob
293  inT16 width; //of blob
294  BLOBNBOX *blob; //current blob
295  float initial_x; //first guess
296  BLOBNBOX_IT src_it = src_list; //iterators
297  BLOBNBOX_IT noise_it = noise_list;
298  BLOBNBOX_IT small_it = small_list;
299  BLOBNBOX_IT large_it = large_list;
300  STATS size_stats (0, MAX_NEAREST_DIST);
301  //blob heights
302  float min_y; //size limits
303  float max_y;
304  float max_x;
305  float max_height; //of good blobs
306 
// Pass 1: move obvious noise and small blobs out of src_list.
307  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
308  blob = src_it.data();
310  noise_it.add_after_then_move(src_it.extract());
311  else if (blob->enclosed_area() >= blob->bounding_box().height()
313  small_it.add_after_then_move(src_it.extract());
314  }
// Pass 2: histogram the heights of what is left to get a first size guess.
315  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
316  size_stats.add(src_it.data()->bounding_box().height(), 1);
317  }
318  initial_x = size_stats.ile(textord_initialx_ile);
319  max_y = ceil(initial_x *
324  min_y = floor (initial_x / 2);
325  max_x = ceil (initial_x * textord_width_limit);
// Pass 3: re-grade the small list against the new limits; blobs that are tall
// enough go back to src_list, over-tall ones go to large_list.
326  small_it.move_to_first ();
327  for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
328  small_it.forward ()) {
329  height = small_it.data()->bounding_box().height();
330  if (height > max_y)
331  large_it.add_after_then_move(small_it.extract ());
332  else if (height >= min_y)
333  src_it.add_after_then_move(small_it.extract ());
334  }
// Pass 4: re-grade src_list itself and histogram the heights of the keepers.
335  size_stats.clear ();
336  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
337  height = src_it.data ()->bounding_box ().height ();
338  width = src_it.data ()->bounding_box ().width ();
339  if (height < min_y)
340  small_it.add_after_then_move (src_it.extract ());
341  else if (height > max_y || width > max_x)
342  large_it.add_after_then_move (src_it.extract ());
343  else
344  size_stats.add (height, 1);
345  }
346  max_height = size_stats.ile (textord_initialasc_ile);
347  // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
348  // max_y,min_y,initial_x,max_height);
// The returned estimate is the larger of initial_x and max_height.
350  if (max_height > initial_x)
351  initial_x = max_height;
352  // tprintf(" ret=%g\n",initial_x);
353  return initial_x;
354 }
355 
356 // Fixes the block so it obeys all the rules:
357 // Must have at least one ROW.
358 // Must have at least one WERD.
359 // WERDs contain a fake blob.
360 void Textord::cleanup_nontext_block(BLOCK* block) {
361  // Non-text blocks must contain at least one row.
362  ROW_IT row_it(block->row_list());
363  if (row_it.empty()) {
364  const TBOX& box = block->bounding_box();
365  float height = box.height();
366  inT32 xstarts[2] = {box.left(), box.right()};
367  double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
368  ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
369  height / 4.0f, 0, 1);
370  row_it.add_after_then_move(row);
371  }
372  // Each row must contain at least one word.
373  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
374  ROW* row = row_it.data();
375  WERD_IT w_it(row->word_list());
376  if (w_it.empty()) {
377  // Make a fake blob to put in the word.
378  TBOX box = block->row_list()->singleton() ? block->bounding_box()
379  : row->bounding_box();
380  C_BLOB* blob = C_BLOB::FakeBlob(box);
381  C_BLOB_LIST blobs;
382  C_BLOB_IT blob_it(&blobs);
383  blob_it.add_after_then_move(blob);
384  WERD* word = new WERD(&blobs, 0, NULL);
385  w_it.add_after_then_move(word);
386  }
387  // Each word must contain a fake blob.
388  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
389  WERD* word = w_it.data();
390  // Just assert that this is true, as it would be useful to find
391  // out why it isn't.
392  ASSERT_HOST(!word->cblob_list()->empty());
393  }
394  row->recalc_bounding_box();
395  }
396 }
397 
398 /**********************************************************************
399  * cleanup_blocks
400  *
401  * Delete empty blocks, rows from the page.
402  **********************************************************************/
403 
// Walks every block of the page. Non-text blocks are repaired with
// cleanup_nontext_block(). For text blocks, when clean_noise is set, small
// noise is removed from each row's words and rows that are empty (or judged
// to be noise while textord_noise_rejrows is on) are deleted; text blocks
// left with no rows are then deleted from blocks.
// NOTE(review): listing lines 432, 435-436, 447 and 450 are absent from this
// extract; they presumably hold the guards for the tprintf calls and the
// statement controlled by the textord_blshift_maxshift test -- confirm
// against the upstream file.
404 void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
405  BLOCK_IT block_it = blocks; //iterator
406  ROW_IT row_it; //row iterator
407 
408  int num_rows = 0;
409  int num_rows_all = 0;
410  int num_blocks = 0;
411  int num_blocks_all = 0;
412  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
413  block_it.forward()) {
414  BLOCK* block = block_it.data();
415  if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
416  cleanup_nontext_block(block);
417  continue;
418  }
// Row counters are per block; block counters accumulate over the page.
419  num_rows = 0;
420  num_rows_all = 0;
421  if (clean_noise) {
422  row_it.set_to_list(block->row_list());
423  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
424  ROW* row = row_it.data();
425  ++num_rows_all;
426  clean_small_noise_from_words(row);
// A row is dropped if it is empty, or if row rejection is enabled and
// clean_noise_from_row() judges the (non-empty) row to be noise.
427  if ((textord_noise_rejrows && !row->word_list()->empty() &&
428  clean_noise_from_row(row)) ||
429  row->word_list()->empty()) {
430  delete row_it.extract(); // lose empty row.
431  } else {
433  clean_noise_from_words(row_it.data());
434  if (textord_blshift_maxshift >= 0)
437  ++num_rows;
438  }
439  }
440  }
441  if (block->row_list()->empty()) {
442  delete block_it.extract(); // Lose empty text blocks.
443  } else {
444  ++num_blocks;
445  }
446  ++num_blocks_all;
448  tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
449  }
451  tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
452 }
453 
454 
455 /**********************************************************************
456  * clean_noise_from_row
457  *
458  * Return TRUE if the row looks like garbage (mostly dot-like outlines), so that the caller can delete it.
459  **********************************************************************/
460 
// Decides whether a row is noise. Nothing is moved or deleted here; the
// function only counts outlines/blobs and returns a verdict:
//   dot_count        - outlines smaller than textord_noise_sizelimit x-heights,
//                      plus 2 for each over-tall blob that is not the first
//                      blob of the first word.
//   norm_count       - mid-sized blobs with fewer than textord_noise_translimit
//                      transitions.
//   super_norm_count - outlines with children whose width and height are both
//                      close to the x-height, plus one per blob of a
//                      W_DONT_CHOP word.
// Returns TRUE when super_norm_count < textord_noise_sncount and
// dot_count exceeds both 2 and norm_count * textord_noise_rowratio.
461 BOOL8 Textord::clean_noise_from_row( //remove empties
462  ROW *row //row to clean
463  ) {
464  BOOL8 testing_on;
465  TBOX blob_box; //bounding box
466  C_BLOB *blob; //current blob
467  C_OUTLINE *outline; //current outline
468  WERD *word; //current word
469  inT32 blob_size; //biggest size
470  inT32 trans_count = 0; //no of transitions
471  inT32 trans_threshold; //noise tolerance
472  inT32 dot_count; //small objects
473  inT32 norm_count; //normal objects
474  inT32 super_norm_count; //real char-like
475  //words of row
476  WERD_IT word_it = row->word_list ();
477  C_BLOB_IT blob_it; //blob iterator
478  C_OUTLINE_IT out_it; //outline iterator
479 
// NOTE(review): listing lines 480-481 (the start of the condition that sets
// testing_on) are absent from this extract.
482  && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
483  testing_on = TRUE;
484  else
485  testing_on = FALSE;
486  dot_count = 0;
487  norm_count = 0;
488  super_norm_count = 0;
489  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
490  word = word_it.data (); //current word
491  //blobs in word
492  blob_it.set_to_list (word->cblob_list ());
493  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
494  blob_it.forward ()) {
495  blob = blob_it.data ();
496  if (!word->flag (W_DONT_CHOP)) {
497  //get outlines
498  out_it.set_to_list (blob->out_list ());
499  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
500  out_it.forward ()) {
501  outline = out_it.data ();
502  blob_box = outline->bounding_box ();
// blob_size is the larger of the outline's width and height.
503  blob_size =
504  blob_box.width () >
505  blob_box.height ()? blob_box.width () : blob_box.
506  height();
507  if (blob_size < textord_noise_sizelimit * row->x_height ())
508  dot_count++; //count small outlines
509  if (!outline->child ()->empty ()
510  && blob_box.height () <
511  (1 + textord_noise_syfract) * row->x_height ()
512  && blob_box.height () >
513  (1 - textord_noise_syfract) * row->x_height ()
514  && blob_box.width () <
515  (1 + textord_noise_sxfract) * row->x_height ()
516  && blob_box.width () >
517  (1 - textord_noise_sxfract) * row->x_height ())
518  super_norm_count++; //count x-height sized outlines with children
519  }
520  }
521  else
522  super_norm_count++;
523  blob_box = blob->bounding_box ();
524  blob_size =
525  blob_box.width () >
526  blob_box.height ()? blob_box.width () : blob_box.height ();
527  if (blob_size >= textord_noise_sizelimit * row->x_height ()
528  && blob_size < row->x_height () * 2) {
529  trans_threshold = blob_size / textord_noise_sizefraction;
530  trans_count = blob->count_transitions (trans_threshold);
531  if (trans_count < textord_noise_translimit)
532  norm_count++;
533  }
// An over-tall blob counts as two dots unless it is the very first blob of
// the row.
534  else if (blob_box.height () > row->x_height () * 2
535  && (!word_it.at_first () || !blob_it.at_first ()))
536  dot_count += 2;
537  if (testing_on) {
538  tprintf
539  ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
540  blob_box.left (), blob_box.bottom (), blob_box.right (),
541  blob_box.top (), blob->out_list ()->length (), trans_count,
542  blob_box.bottom () - row->base_line (blob_box.left ()));
543  }
544  }
545  }
// NOTE(review): the REJECTED/ACCEPTED text printed below is computed with
// textord_noise_normratio and ignores super_norm_count, whereas the value
// actually returned uses textord_noise_rowratio and textord_noise_sncount,
// so the debug verdict can disagree with the return value. blob_box here is
// the box of the last blob visited in the loops above.
546  if (textord_noise_debug) {
547  tprintf ("Row ending at (%d,%g):",
548  blob_box.right (), row->base_line (blob_box.right ()));
549  tprintf (" R=%g, dc=%d, nc=%d, %s\n",
550  norm_count > 0 ? (float) dot_count / norm_count : 9999,
551  dot_count, norm_count,
552  dot_count > norm_count * textord_noise_normratio
553  && dot_count > 2 ? "REJECTED" : "ACCEPTED");
554  }
555  return super_norm_count < textord_noise_sncount
556  && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
557 }
558 
559 /**********************************************************************
560  * clean_noise_from_words
561  *
562  * Move blobs of words from rows of garbage into the reject blobs list.
563  **********************************************************************/
564 
// Grades every word of the row as good (0), doubtful (1) or dud (2) from the
// ratio of dot-like outlines to normal-looking blobs, then revisits the words
// graded 2, and those graded 1 when duds outnumber the remaining words.
// Does nothing if the row has no words or textord_no_rejects is set.
// NOTE(review): listing line 669, the statement that acts on a selected word,
// is absent from this extract; the surrounding comment says small blobs are
// moved to the word's reject list (presumably via WERD::CleanNoise) --
// confirm against the upstream file.
565 void Textord::clean_noise_from_words( //remove empties
566  ROW *row //row to clean
567  ) {
568  TBOX blob_box; //bounding box
569  inT8 *word_dud; //was it chucked
570  C_BLOB *blob; //current blob
571  C_OUTLINE *outline; //current outline
572  WERD *word; //current word
573  inT32 blob_size; //biggest size
574  inT32 trans_count; //no of transitions
575  inT32 trans_threshold; //noise tolerance
576  inT32 dot_count; //small objects
577  inT32 norm_count; //normal objects
578  inT32 dud_words; //number discarded
579  inT32 ok_words; //number remaining
580  inT32 word_index; //current word
581  //words of row
582  WERD_IT word_it = row->word_list ();
583  C_BLOB_IT blob_it; //blob iterator
584  C_OUTLINE_IT out_it; //outline iterator
585 
// ok_words temporarily holds the word count, used to size word_dud.
586  ok_words = word_it.length ();
587  if (ok_words == 0 || textord_no_rejects)
588  return;
589  word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
590  dud_words = 0;
591  ok_words = 0;
592  word_index = 0;
// First pass: grade each word into word_dud[word_index].
593  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
594  word = word_it.data (); //current word
595  dot_count = 0;
596  norm_count = 0;
597  //blobs in word
598  blob_it.set_to_list (word->cblob_list ());
599  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
600  blob_it.forward ()) {
601  blob = blob_it.data ();
602  if (!word->flag (W_DONT_CHOP)) {
603  //get outlines
604  out_it.set_to_list (blob->out_list ());
605  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
606  out_it.forward ()) {
607  outline = out_it.data ();
608  blob_box = outline->bounding_box ();
// blob_size is the larger of the outline's width and height.
609  blob_size =
610  blob_box.width () >
611  blob_box.height ()? blob_box.width () : blob_box.
612  height();
613  if (blob_size < textord_noise_sizelimit * row->x_height ())
614  dot_count++; //count small outlines
615  if (!outline->child ()->empty ()
616  && blob_box.height () <
617  (1 + textord_noise_syfract) * row->x_height ()
618  && blob_box.height () >
619  (1 - textord_noise_syfract) * row->x_height ()
620  && blob_box.width () <
621  (1 + textord_noise_sxfract) * row->x_height ()
622  && blob_box.width () >
623  (1 - textord_noise_sxfract) * row->x_height ())
624  norm_count++; //count x-height sized outlines with children
625  }
626  }
627  else
628  norm_count++;
629  blob_box = blob->bounding_box ();
630  blob_size =
631  blob_box.width () >
632  blob_box.height ()? blob_box.width () : blob_box.height ();
633  if (blob_size >= textord_noise_sizelimit * row->x_height ()
634  && blob_size < row->x_height () * 2) {
635  trans_threshold = blob_size / textord_noise_sizefraction;
636  trans_count = blob->count_transitions (trans_threshold);
637  if (trans_count < textord_noise_translimit)
638  norm_count++;
639  }
// An over-tall blob counts as two dots unless it is the very first blob of
// the row.
640  else if (blob_box.height () > row->x_height () * 2
641  && (!word_it.at_first () || !blob_it.at_first ()))
642  dot_count += 2;
643  }
// Words flagged W_REP_CHAR, or with at most two dots, are always graded 0.
644  if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
645  if (dot_count > norm_count * textord_noise_normratio * 2)
646  word_dud[word_index] = 2;
647  else if (dot_count > norm_count * textord_noise_normratio)
648  word_dud[word_index] = 1;
649  else
650  word_dud[word_index] = 0;
651  } else {
652  word_dud[word_index] = 0;
653  }
654  if (word_dud[word_index] == 2)
655  dud_words++;
656  else
657  ok_words++;
658  word_index++;
659  }
660 
// Second pass: act on the duds, and on doubtful words when duds are the
// majority.
661  word_index = 0;
662  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
663  if (word_dud[word_index] == 2
664  || (word_dud[word_index] == 1 && dud_words > ok_words)) {
665  word = word_it.data(); // Current word.
666  // Previously we threw away the entire word.
667  // Now just aggressively throw all small blobs into the reject list, where
668  // the classifier can decide whether they are actually needed.
670  }
671  word_index++;
672  }
673  free_mem(word_dud);
674 }
675 
676 // Remove outlines that are a tiny fraction in either width or height
677 // of the word height.
678 void Textord::clean_small_noise_from_words(ROW *row) {
679  WERD_IT word_it(row->word_list());
680  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
681  WERD* word = word_it.data();
682  int min_size = static_cast<int>(
683  textord_noise_hfract * word->bounding_box().height() + 0.5);
684  C_BLOB_IT blob_it(word->cblob_list());
685  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
686  C_BLOB* blob = blob_it.data();
687  C_OUTLINE_IT out_it(blob->out_list());
688  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
689  C_OUTLINE* outline = out_it.data();
690  outline->RemoveSmallRecursive(min_size, &out_it);
691  }
692  if (blob->out_list()->empty()) {
693  delete blob_it.extract();
694  }
695  }
696  if (word->cblob_list()->empty()) {
697  if (!word_it.at_last()) {
698  // The next word is no longer a fuzzy non space if it was before,
699  // since the word before is about to be deleted.
700  WERD* next_word = word_it.data_relative(1);
701  if (next_word->flag(W_FUZZY_NON)) {
702  next_word->set_flag(W_FUZZY_NON, false);
703  }
704  }
705  delete word_it.extract();
706  }
707  }
708 }
709 
710 // Local struct to hold a group of blocks.
// Groups blocks that share a rotation, together with the union of their
// bounding boxes and the smallest x-height among them.
// NOTE(review): listing lines 721, 723 and 729 are absent from this extract;
// judging by the initializer lists and the remaining comments they hold the
// declarations of the bounding_box, rotation and blocks members -- confirm
// against the upstream file.
711 struct BlockGroup {
// Default: identity rotation (1, 0), zero angle, x-height of 1.
712  BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
// Starts a group from a single block, taking all properties from it.
713  explicit BlockGroup(BLOCK* block)
714  : bounding_box(block->bounding_box()),
715  rotation(block->re_rotation()),
716  angle(block->re_rotation().angle()),
717  min_xheight(block->x_height()) {
718  blocks.push_back(block);
719  }
720  // Union of block bounding boxes.
722  // Common rotation of the blocks.
724  // Angle of rotation.
725  float angle;
726  // Min xheight of the blocks.
727  float min_xheight;
728  // Collection of borrowed pointers to the blocks in the group.
730 };
731 
732 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
733 // TransferDiacriticsToWords to copy the diacritic blobs to the most
734 // appropriate words in the group of blocks. Source blobs are not touched.
// Partitions the text blocks into groups whose re_rotation angles agree to
// within kMaxAngleDiff, then, per group, loads every word of every row into a
// WordGrid and hands the grid to TransferDiacriticsToWords() together with
// the group's rotation with its y component negated.
// NOTE(review): listing line 740, which presumably declares the `groups`
// container used below, is absent from this extract.
735 void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
736  BLOCK_LIST* blocks) {
737  // Angle difference larger than this is too much to consider equal.
738  // They should only be in multiples of M_PI/2 anyway.
739  const double kMaxAngleDiff = 0.01; // About 0.6 degrees.
741  BLOCK_IT bk_it(blocks);
742  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
743  BLOCK* block = bk_it.data();
// Non-text blocks take no part in the grouping.
744  if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
745  continue;
746  }
747  // Linear search of the groups to find a matching rotation.
748  float block_angle = block->re_rotation().angle();
749  int best_g = 0;
750  float best_angle_diff = MAX_FLOAT32;
751  for (int g = 0; g < groups.size(); ++g) {
752  double angle_diff = fabs(block_angle - groups[g]->angle);
// Differences above pi are measured the short way round the circle.
753  if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
754  if (angle_diff < best_angle_diff) {
755  best_angle_diff = angle_diff;
756  best_g = g;
757  }
758  }
// No group is close enough (or there are none yet): start a new one.
759  if (best_angle_diff > kMaxAngleDiff) {
760  groups.push_back(new BlockGroup(block));
761  } else {
762  groups[best_g]->blocks.push_back(block);
763  groups[best_g]->bounding_box += block->bounding_box();
764  float x_height = block->x_height();
765  if (x_height < groups[best_g]->min_xheight)
766  groups[best_g]->min_xheight = x_height;
767  }
768  }
769  // Now process each group of blocks.
770  PointerVector<WordWithBox> word_ptrs;
771  for (int g = 0; g < groups.size(); ++g) {
772  const BlockGroup* group = groups[g];
773  if (group->bounding_box.null_box()) continue;
// The grid cell size is the group's smallest x-height.
774  WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
775  group->bounding_box.topright());
776  for (int b = 0; b < group->blocks.size(); ++b) {
777  ROW_IT row_it(group->blocks[b]->row_list());
778  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
779  ROW* row = row_it.data();
780  // Put the words of the row into the grid.
781  WERD_IT w_it(row->word_list());
782  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
783  WERD* word = w_it.data();
784  WordWithBox* box_word = new WordWithBox(word);
785  word_grid.InsertBBox(true, true, box_word);
786  // Save the pointer where it will be auto-deleted.
787  word_ptrs.push_back(box_word);
788  }
789  }
790  }
791  FCOORD rotation = group->rotation;
792  // Make it a forward rotation that will transform blob coords to block.
793  rotation.set_y(-rotation.y());
794  TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
795  }
796 }
797 
798 // Places a copy of blobs that are near a word (after applying rotation to the
799 // blob) in the most appropriate word, unless there is doubt, in which case a
800 // blob can end up in two words. Source blobs are not touched.
801 void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
802  const FCOORD& rotation,
803  WordGrid* word_grid) {
804  WordSearch ws(word_grid);
805  BLOBNBOX_IT b_it(diacritic_blobs);
806  // Apply rotation to each blob before finding the nearest words. The rotation
807  // allows us to only consider above/below placement and not left/right on
808  // vertical text, because all text is horizontal here.
809  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
810  BLOBNBOX* blobnbox = b_it.data();
811  TBOX blob_box = blobnbox->bounding_box();
812  blob_box.rotate(rotation);
813  ws.StartRectSearch(blob_box);
814  // Above/below refer to word position relative to diacritic. Since some
815  // scripts eg Kannada/Telugu habitually put diacritics below words, and
816  // others eg Thai/Vietnamese/Latin put most diacritics above words, try
817  // for both if there isn't much in it.
818  WordWithBox* best_above_word = NULL;
819  WordWithBox* best_below_word = NULL;
820  int best_above_distance = 0;
821  int best_below_distance = 0;
822  for (WordWithBox* word = ws.NextRectSearch(); word != NULL;
823  word = ws.NextRectSearch()) {
824  if (word->word()->flag(W_REP_CHAR)) continue;
825  TBOX word_box = word->true_bounding_box();
826  int x_distance = blob_box.x_gap(word_box);
827  int y_distance = blob_box.y_gap(word_box);
828  if (x_distance > 0) {
829  // Arbitrarily divide x-distance by 2 if there is a major y overlap,
830  // and the word is to the left of the diacritic. If the
831  // diacritic is a dropped broken character between two words, this will
832  // help send all the pieces to a single word, instead of splitting them
833  // over the 2 words.
834  if (word_box.major_y_overlap(blob_box) &&
835  blob_box.left() > word_box.right()) {
836  x_distance /= 2;
837  }
838  y_distance += x_distance;
839  }
840  if (word_box.y_middle() > blob_box.y_middle() &&
841  (best_above_word == NULL || y_distance < best_above_distance)) {
842  best_above_word = word;
843  best_above_distance = y_distance;
844  }
845  if (word_box.y_middle() <= blob_box.y_middle() &&
846  (best_below_word == NULL || y_distance < best_below_distance)) {
847  best_below_word = word;
848  best_below_distance = y_distance;
849  }
850  }
851  bool above_good =
852  best_above_word != NULL &&
853  (best_below_word == NULL ||
854  best_above_distance < best_below_distance + blob_box.height());
855  bool below_good =
856  best_below_word != NULL && best_below_word != best_above_word &&
857  (best_above_word == NULL ||
858  best_below_distance < best_above_distance + blob_box.height());
859  if (below_good) {
860  C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
861  copied_blob->rotate(rotation);
862  // Put the blob into the word's reject blobs list.
863  C_BLOB_IT blob_it(best_below_word->RejBlobs());
864  blob_it.add_to_end(copied_blob);
865  }
866  if (above_good) {
867  C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
868  copied_blob->rotate(rotation);
869  // Put the blob into the word's reject blobs list.
870  C_BLOB_IT blob_it(best_above_word->RejBlobs());
871  blob_it.add_to_end(copied_blob);
872  }
873  }
874 }
875 
876 } // tesseract
877 
878 /**********************************************************************
879  * tweak_row_baseline
880  *
881  * Shift baseline to fit the blobs more accurately where they are
882  * close enough.
883  **********************************************************************/
884 
// NOTE(review): listing line 885, which opens this function's signature, is
// absent from this extract. The index entry at the end of the listing gives
// the full signature as
//   void tweak_row_baseline(ROW *row, double blshift_maxshift,
//                           double blshift_xfraction)
//
// Rebuilds row->baseline as a new QSPLINE. For each blob whose bottom is
// within blshift_maxshift x-heights of the current baseline and whose height
// exceeds blshift_xfraction x-heights, a flat segment at the blob's bottom is
// emitted; for the other blobs the segments of the existing baseline are
// copied across. Does nothing if the row has no blobs.
886  double blshift_maxshift,
887  double blshift_xfraction) {
888  TBOX blob_box; //bounding box
889  C_BLOB *blob; //current blob
890  WERD *word; //current word
891  inT32 blob_count; //no of blobs
892  inT32 src_index; //source segment
893  inT32 dest_index; //destination segment
894  inT32 *xstarts; //spline segments
895  double *coeffs; //spline coeffs
896  float ydiff; //baseline error
897  float x_centre; //centre of blob
898  //words of row
899  WERD_IT word_it = row->word_list ();
900  C_BLOB_IT blob_it; //blob iterator
901 
902  blob_count = 0;
903  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
904  word = word_it.data (); //current word
905  //get total blobs
906  blob_count += word->cblob_list ()->length ();
907  }
908  if (blob_count == 0)
909  return;
// The output arrays are sized for one segment per blob plus every existing
// baseline segment; xstarts needs one more entry than there are segments.
910  xstarts =
911  (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
912  sizeof (inT32));
913  coeffs =
914  (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
915  sizeof (double));
916 
917  src_index = 0;
918  dest_index = 0;
919  xstarts[0] = row->baseline.xcoords[0];
920  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
921  word = word_it.data (); //current word
922  //blobs in word
923  blob_it.set_to_list (word->cblob_list ());
924  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
925  blob_it.forward ()) {
926  blob = blob_it.data ();
927  blob_box = blob->bounding_box ();
928  x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
// ydiff becomes the absolute baseline error as a fraction of the x-height.
929  ydiff = blob_box.bottom () - row->base_line (x_centre);
930  if (ydiff < 0)
931  ydiff = -ydiff / row->x_height ();
932  else
933  ydiff = ydiff / row->x_height ();
934  if (ydiff < blshift_maxshift
935  && blob_box.height () / row->x_height () > blshift_xfraction) {
936  if (xstarts[dest_index] >= x_centre)
937  xstarts[dest_index] = blob_box.left ();
// Flat (constant) segment sitting on the bottom of the blob.
938  coeffs[dest_index * 3] = 0;
939  coeffs[dest_index * 3 + 1] = 0;
940  coeffs[dest_index * 3 + 2] = blob_box.bottom ();
941  //shift it
942  dest_index++;
943  xstarts[dest_index] = blob_box.right () + 1;
944  }
945  else {
946  if (xstarts[dest_index] <= x_centre) {
// Copy across every source segment that ends at or before the blob centre.
947  while (row->baseline.xcoords[src_index + 1] <= x_centre
948  && src_index < row->baseline.segments - 1) {
949  if (row->baseline.xcoords[src_index + 1] >
950  xstarts[dest_index]) {
951  coeffs[dest_index * 3] =
952  row->baseline.quadratics[src_index].a;
953  coeffs[dest_index * 3 + 1] =
954  row->baseline.quadratics[src_index].b;
955  coeffs[dest_index * 3 + 2] =
956  row->baseline.quadratics[src_index].c;
957  dest_index++;
958  xstarts[dest_index] =
959  row->baseline.xcoords[src_index + 1];
960  }
961  src_index++;
962  }
// Then copy the source segment that contains the blob centre.
963  coeffs[dest_index * 3] =
964  row->baseline.quadratics[src_index].a;
965  coeffs[dest_index * 3 + 1] =
966  row->baseline.quadratics[src_index].b;
967  coeffs[dest_index * 3 + 2] =
968  row->baseline.quadratics[src_index].c;
969  dest_index++;
970  xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
971  }
972  }
973  }
974  }
// Skip source segments already covered by the output, then copy the rest.
975  while (src_index < row->baseline.segments
976  && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
977  src_index++;
978  while (src_index < row->baseline.segments) {
979  coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
980  coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
981  coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
982  dest_index++;
983  src_index++;
984  xstarts[dest_index] = row->baseline.xcoords[src_index];
985  }
986  //turn to spline
987  row->baseline = QSPLINE (dest_index, xstarts, coeffs);
988  free_mem(xstarts);
989  free_mem(coeffs);
990 }
int textord_max_noise_size
Definition: textord.h:376
double textord_min_linesize
Definition: makerow.cpp:83
const TBOX & bounding_box() const
Definition: blobbox.h:215
void rotate(const FCOORD &vec)
Definition: rect.h:189
TBOX true_bounding_box() const
Definition: werd.cpp:181
bool textord_test_landscape
Definition: makerow.cpp:50
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:47
bool textord_show_boxes
Definition: textord.h:375
bool textord_noise_rejrows
Definition: textord.h:389
void rotate(const FCOORD &rotation)
Definition: stepblob.cpp:387
TBOX bounding_box() const
Definition: ocrrow.h:85
#define TRUE
Definition: capi.h:45
short inT16
Definition: host.h:33
bool textord_show_blobs
Definition: textord.h:374
float b
Definition: quadratc.h:59
ICOORD
integer coordinate
Definition: points.h:30
int textord_test_x
Definition: makerow.cpp:62
C_BLOB * cblob() const
Definition: blobbox.h:253
float base_line(float xpos) const
Definition: ocrrow.h:56
void SetBlobStrokeWidth(Pix *pix, BLOBNBOX *blob)
Definition: tordmain.cpp:55
void CleanNoise(float size_threshold)
Definition: werd.cpp:506
int textord_noise_translimit
Definition: textord.h:386
float line_spacing
Definition: blobbox.h:775
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordGrid
Definition: textord.h:65
int textord_noise_sizefraction
Definition: textord.h:384
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
inT32 count_transitions(inT32 threshold)
Definition: stepblob.cpp:330
inT16 width() const
Definition: rect.h:111
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
float line_size
Definition: blobbox.h:781
TBOX bounding_box() const
Definition: stepblob.cpp:250
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
double textord_initialasc_ile
Definition: textord.h:383
double textord_noise_syfract
Definition: textord.h:390
void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction)
Definition: tordmain.cpp:885
float c
Definition: quadratc.h:60
inT32 enclosed_area() const
Definition: blobbox.h:238
BlockGroup(BLOCK *block)
Definition: tordmain.cpp:713
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:205
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
void recalc_bounding_box()
Definition: ocrrow.cpp:101
int y_middle() const
Definition: rect.h:84
unsigned char BOOL8
Definition: host.h:46
void plot_box_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour)
Definition: drawtord.cpp:70
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:135
int textord_test_y
Definition: makerow.cpp:63
#define LOC_EDGE_PROG
Definition: errcode.h:44
int push_back(T object)
double textord_excess_blobsize
Definition: makerow.cpp:85
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
inT32 get_total() const
Definition: statistc.h:86
double a
Definition: quadratc.h:58
#define MAX_FLOAT32
Definition: host.h:57
int y_gap(const TBOX &box) const
Definition: rect.h:225
inT16 bottom() const
Definition: rect.h:61
bool textord_no_rejects
Definition: textord.h:373
FCOORD re_rotation() const
Definition: ocrblock.h:138
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:770
void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on)
Definition: tordmain.cpp:236
#define MAX_INT16
Definition: host.h:52
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it)
Definition: coutln.cpp:627
WERD
Definition: werd.h:60
static const double kXHeightCapRatio
Definition: ccstruct.h:37
#define FALSE
Definition: capi.h:46
void * alloc_mem(inT32 count)
Definition: memry.cpp:47
GridSearch< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordSearch
Definition: textord.h:66
SIGNED char inT8
Definition: host.h:31
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
void set_y(float yin)
rewrite function
Definition: points.h:220
C_OUTLINE_LIST * child()
Definition: coutln.h:106
inT16 left() const
Definition: rect.h:68
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
GenericVector< BLOCK * > blocks
Definition: tordmain.cpp:729
float y() const
Definition: points.h:212
bool textord_noise_rejwords
Definition: textord.h:388
double textord_noise_sxfract
Definition: textord.h:392
void Clear()
Definition: scrollview.cpp:595
const TBOX & bounding_box() const
Definition: coutln.h:111
int x_gap(const TBOX &box) const
Definition: rect.h:217
inT16 height() const
Definition: rect.h:104
int inT32
Definition: host.h:35
static const double kAscenderFraction
Definition: ccstruct.h:35
double textord_noise_normratio
Definition: textord.h:387
#define tprintf(...)
Definition: tprintf.h:31
ROW
Definition: ocrrow.h:32
FCOORD
Definition: points.h:189
bool IsText() const
Definition: polyblk.h:52
BLOCK
Definition: ocrblock.h:30
double textord_blshift_xfraction
Definition: textord.h:399
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113
double textord_noise_sizelimit
Definition: textord.h:385
inT16 top() const
Definition: rect.h:54
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:771
void plot_graded_blobs(ScrollView *to_win)
Definition: blobbox.cpp:1067
double textord_blshift_maxshift
Definition: textord.h:398
#define MAX_NEAREST_DIST
Definition: tordmain.cpp:44
static const double kXHeightFraction
Definition: ccstruct.h:34
unsigned int uinT32
Definition: host.h:36
bool textord_noise_debug
Definition: textord.h:397
double ile(double frac) const
Definition: statistc.cpp:174
inT32 x_height() const
return xheight
Definition: ocrblock.h:110
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238
void assign_blobs_to_blocks2(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: tordmain.cpp:154
TBOX
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
WERD_LIST * word_list()
Definition: ocrrow.h:52
int textord_noise_sncount
Definition: textord.h:395
float x_height() const
Definition: ocrrow.h:61
static const double kDescenderFraction
Definition: ccstruct.h:33
double textord_width_limit
Definition: makerow.cpp:77
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:132
EXTERN ScrollView * to_win
Definition: drawtord.cpp:38
void free_mem(void *oldchunk)
Definition: memry.cpp:55
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
double textord_noise_rowratio
Definition: textord.h:396
#define CLISTIZE(CLASSNAME)
Definition: clst.h:913
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
float angle() const
find angle
Definition: points.h:249
double textord_noise_area_ratio
Definition: textord.h:380
STATS
Definition: statistc.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:84
double textord_noise_hfract
Definition: textord.h:394
float max_blob_size
Definition: blobbox.h:782
double textord_initialx_ile
Definition: textord.h:382
TBOX bounding_box() const
Definition: werd.cpp:160
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59