tesseract  3.05.02
fixspace.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: fixspace.cpp (Formerly fixspace.c)
3  * Description: Implements a pass over the page res, exploring the alternative
4  * spacing possibilities, trying to use context to improve the
5  * word spacing
6 * Author: Phil Cheatle
7 * Created: Thu Oct 21 11:38:43 BST 1993
8 *
9 * (C) Copyright 1993, Hewlett-Packard Ltd.
10 ** Licensed under the Apache License, Version 2.0 (the "License");
11 ** you may not use this file except in compliance with the License.
12 ** You may obtain a copy of the License at
13 ** http://www.apache.org/licenses/LICENSE-2.0
14 ** Unless required by applicable law or agreed to in writing, software
15 ** distributed under the License is distributed on an "AS IS" BASIS,
16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 ** See the License for the specific language governing permissions and
18 ** limitations under the License.
19 *
20 **********************************************************************/
21 
22 #include <ctype.h>
23 #include "reject.h"
24 #include "statistc.h"
25 #include "control.h"
26 #include "fixspace.h"
27 #include "genblob.h"
28 #include "tessvars.h"
29 #include "tessbox.h"
30 #include "globals.h"
31 #include "tesseractclass.h"
32 
33 #define PERFECT_WERDS 999
34 #define MAXSPACING 128 /*max expected spacing in pix */
35 
36 namespace tesseract {
37 
49  inT32 word_count,
50  PAGE_RES *page_res) {
51  BLOCK_RES_IT block_res_it;
52  ROW_RES_IT row_res_it;
53  WERD_RES_IT word_res_it_from;
54  WERD_RES_IT word_res_it_to;
55  WERD_RES *word_res;
56  WERD_RES_LIST fuzzy_space_words;
57  inT16 new_length;
58  BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds
59  inT32 word_index; // current word
60 
61  block_res_it.set_to_list(&page_res->block_res_list);
62  word_index = 0;
63  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
64  block_res_it.forward()) {
65  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
66  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
67  row_res_it.forward()) {
68  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
69  while (!word_res_it_from.at_last()) {
70  word_res = word_res_it_from.data();
71  while (!word_res_it_from.at_last() &&
72  !(word_res->combination ||
73  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
74  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
75  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
76  block_res_it.data()->block);
77  word_res = word_res_it_from.forward();
78  word_index++;
79  if (monitor != NULL) {
80  monitor->ocr_alive = TRUE;
81  monitor->progress = 90 + 5 * word_index / word_count;
82  if (monitor->deadline_exceeded() ||
83  (monitor->cancel != NULL &&
84  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
85  return;
86  }
87  }
88 
89  if (!word_res_it_from.at_last()) {
90  word_res_it_to = word_res_it_from;
91  prevent_null_wd_fixsp =
92  word_res->word->cblob_list()->empty();
93  if (check_debug_pt(word_res, 60))
94  debug_fix_space_level.set_value(10);
95  word_res_it_to.forward();
96  word_index++;
97  if (monitor != NULL) {
98  monitor->ocr_alive = TRUE;
99  monitor->progress = 90 + 5 * word_index / word_count;
100  if (monitor->deadline_exceeded() ||
101  (monitor->cancel != NULL &&
102  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
103  return;
104  }
105  while (!word_res_it_to.at_last () &&
106  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
107  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
108  if (check_debug_pt(word_res, 60))
109  debug_fix_space_level.set_value(10);
110  if (word_res->word->cblob_list()->empty())
111  prevent_null_wd_fixsp = TRUE;
112  word_res = word_res_it_to.forward();
113  }
114  if (check_debug_pt(word_res, 60))
115  debug_fix_space_level.set_value(10);
116  if (word_res->word->cblob_list()->empty())
117  prevent_null_wd_fixsp = TRUE;
118  if (prevent_null_wd_fixsp) {
119  word_res_it_from = word_res_it_to;
120  } else {
121  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
122  &word_res_it_to);
123  fix_fuzzy_space_list(fuzzy_space_words,
124  row_res_it.data()->row,
125  block_res_it.data()->block);
126  new_length = fuzzy_space_words.length();
127  word_res_it_from.add_list_before(&fuzzy_space_words);
128  for (;
129  !word_res_it_from.at_last() && new_length > 0;
130  new_length--) {
131  word_res_it_from.forward();
132  }
133  }
134  if (test_pt)
135  debug_fix_space_level.set_value(0);
136  }
137  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
138  block_res_it.data()->block);
139  // Last word in row
140  }
141  }
142  }
143 }
144 
145 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
146  ROW *row,
147  BLOCK* block) {
148  inT16 best_score;
149  WERD_RES_LIST current_perm;
150  inT16 current_score;
151  BOOL8 improved = FALSE;
152 
153  best_score = eval_word_spacing(best_perm); // default score
154  dump_words(best_perm, best_score, 1, improved);
155 
156  if (best_score != PERFECT_WERDS)
157  initialise_search(best_perm, current_perm);
158 
159  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
160  match_current_words(current_perm, row, block);
161  current_score = eval_word_spacing(current_perm);
162  dump_words(current_perm, current_score, 2, improved);
163  if (current_score > best_score) {
164  best_perm.clear();
165  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
166  best_score = current_score;
167  improved = TRUE;
168  }
169  if (current_score < PERFECT_WERDS)
170  transform_to_next_perm(current_perm);
171  }
172  dump_words(best_perm, best_score, 3, improved);
173 }
174 
175 } // namespace tesseract
176 
177 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
178  WERD_RES_IT src_it(&src_list);
179  WERD_RES_IT new_it(&new_list);
180  WERD_RES *src_wd;
181  WERD_RES *new_wd;
182 
183  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
184  src_wd = src_it.data();
185  if (!src_wd->combination) {
186  new_wd = WERD_RES::deep_copy(src_wd);
187  new_wd->combination = FALSE;
188  new_wd->part_of_combo = FALSE;
189  new_it.add_after_then_move(new_wd);
190  }
191  }
192 }
193 
194 
195 namespace tesseract {
196 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
197  BLOCK* block) {
198  WERD_RES_IT word_it(&words);
199  WERD_RES *word;
200  // Since we are not using PAGE_RES to iterate over words, we need to update
201  // prev_word_best_choice_ before calling classify_word_pass2().
202  prev_word_best_choice_ = NULL;
203  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
204  word = word_it.data();
205  if ((!word->part_of_combo) && (word->box_word == NULL)) {
206  WordData word_data(block, row, word);
207  SetupWordPassN(2, &word_data);
208  classify_word_and_language(2, NULL, &word_data);
209  }
211  }
212 }
213 
// Scores the word spacing of one candidate permutation.
//
// word_res_list: the permutation to score. Entries flagged part_of_combo are
//                skipped when stepping to the next word.
// Returns PERFECT_WERDS when the number of words counted as done equals the
// number of words visited; otherwise returns the accumulated total_score.
//
// A word's own score (its reject_map length) is held in prev_word_score and
// only added to the total once the following word has been inspected, so that
// a suspicious 1/digit junction between the two words can withhold it.
//
// NOTE(review): the first data() call happens before any emptiness check, so
// this presumably expects a non-empty list - confirm against the callers.
239 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
240  WERD_RES_IT word_res_it(&word_res_list);
241  inT16 total_score = 0;
242  inT16 word_count = 0;
243  inT16 done_word_count = 0;
244  inT16 word_len;
245  inT16 i;
246  inT16 offset;
247  WERD_RES *word; // current word
248  inT16 prev_word_score = 0;
249  BOOL8 prev_word_done = FALSE;
250  BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"?
251  BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0
252  BOOL8 current_char_1 = FALSE;
253  BOOL8 current_word_ok_so_far;
254  STRING punct_chars = "!\"`',.:;";
255  BOOL8 prev_char_punct = FALSE;
256  BOOL8 current_char_punct = FALSE;
257  BOOL8 word_done = FALSE;
258 
// One iteration per visited word; the loop ends when the iterator is back at
// the first element of the list.
259  do {
260  word = word_res_it.data();
261  word_done = fixspace_thinks_word_done(word);
262  word_count++;
263  if (word->tess_failed) {
// A failed word contributes nothing itself: commit the pending score of the
// previous word and reset all carried context.
264  total_score += prev_word_score;
265  if (prev_word_done)
266  done_word_count++;
267  prev_word_score = 0;
268  prev_char_1 = FALSE;
269  prev_char_digit = FALSE;
270  prev_word_done = FALSE;
271  } else {
272  /*
273  Can we add the prev word score and potentially count this word?
274  Yes IF it didn't end in a 1 when the first char of this word is a digit
275  AND it didn't end in a digit when the first char of this word is a 1
276  */
277  word_len = word->reject_map.length();
278  current_word_ok_so_far = FALSE;
279  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
280  (prev_char_digit && (
281  (word_done &&
282  word->best_choice->unichar_lengths().string()[0] == 1 &&
283  word->best_choice->unichar_string()[0] == '1') ||
284  (!word_done && STRING(conflict_set_I_l_1).contains(
285  word->best_choice->unichar_string()[0])))))) {
286  total_score += prev_word_score;
287  if (prev_word_done)
288  done_word_count++;
289  current_word_ok_so_far = word_done;
290  }
291 
// Hold this word's score back until the next word has been examined.
292  if (current_word_ok_so_far) {
293  prev_word_done = TRUE;
294  prev_word_score = word_len;
295  } else {
296  prev_word_done = FALSE;
297  prev_word_score = 0;
298  }
299 
300  /* Add 1 to total score for every joined 1 regardless of context and
301  rejtn */
// NOTE(review): this loop indexes unichar_string() directly by i, while the
// punctuation loop below advances a separate offset by unichar_lengths()[i].
// The two only agree when every unichar is a single char - confirm intended.
302  for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
303  current_char_1 = word->best_choice->unichar_string()[i] == '1';
304  if (prev_char_1 || (current_char_1 && (i > 0)))
305  total_score++;
306  prev_char_1 = current_char_1;
307  }
308 
309  /* Add 1 to total score for every joined punctuation regardless of context
310  and rejtn */
// NOTE(review): the listing's own numbering jumps from 310 to 312 here and
// the braces of this function do not balance as shown, so a line that opens
// a scope appears to be missing before this loop - restore it from the
// original file before building.
312  for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
313  offset += word->best_choice->unichar_lengths()[i++]) {
314  current_char_punct =
315  punct_chars.contains(word->best_choice->unichar_string()[offset]);
316  if (prev_char_punct || (current_char_punct && i > 0))
317  total_score++;
318  prev_char_punct = current_char_punct;
319  }
320  }
// Record how this word ends, for the junction test against the next word.
321  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
// Empty-bodied loop: it only advances offset to the start of the last unichar.
322  for (i = 0, offset = 0; i < word_len - 1;
323  offset += word->best_choice->unichar_lengths()[i++]);
324  prev_char_1 =
325  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
326  || (!word_done && STRING(conflict_set_I_l_1).contains(
327  word->best_choice->unichar_string()[offset])));
328  }
329  /* Find next word */
330  do {
331  word_res_it.forward();
332  } while (word_res_it.data()->part_of_combo);
333  } while (!word_res_it.at_first());
// Commit the score that was still pending for the final word.
334  total_score += prev_word_score;
335  if (prev_word_done)
336  done_word_count++;
337  if (done_word_count == word_count)
338  return PERFECT_WERDS;
339  else
340  return total_score;
341 }
342 
344  int i;
345  int offset;
346 
347  for (i = 0, offset = 0; i < char_position;
348  offset += word->best_choice->unichar_lengths()[i++]);
349  return (
350  word->uch_set->get_isdigit(
351  word->best_choice->unichar_string().string() + offset,
352  word->best_choice->unichar_lengths()[i]) ||
353  (word->best_choice->permuter() == NUMBER_PERM &&
355  word->best_choice->unichar_string().string()[offset])));
356 }
357 
358 } // namespace tesseract
359 
360 
372 void transform_to_next_perm(WERD_RES_LIST &words) {
373  WERD_RES_IT word_it(&words);
374  WERD_RES_IT prev_word_it(&words);
375  WERD_RES *word;
376  WERD_RES *prev_word;
377  WERD_RES *combo;
378  WERD *copy_word;
379  inT16 prev_right = -MAX_INT16;
380  TBOX box;
381  inT16 gap;
382  inT16 min_gap = MAX_INT16;
383 
384  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
385  word = word_it.data();
386  if (!word->part_of_combo) {
387  box = word->word->bounding_box();
388  if (prev_right > -MAX_INT16) {
389  gap = box.left() - prev_right;
390  if (gap < min_gap)
391  min_gap = gap;
392  }
393  prev_right = box.right();
394  }
395  }
396  if (min_gap < MAX_INT16) {
397  prev_right = -MAX_INT16; // back to start
398  word_it.set_to_list(&words);
399  // Note: we can't use cycle_pt due to inserted combos at start of list.
400  for (; (prev_right == -MAX_INT16) || !word_it.at_first();
401  word_it.forward()) {
402  word = word_it.data();
403  if (!word->part_of_combo) {
404  box = word->word->bounding_box();
405  if (prev_right > -MAX_INT16) {
406  gap = box.left() - prev_right;
407  if (gap <= min_gap) {
408  prev_word = prev_word_it.data();
409  if (prev_word->combination) {
410  combo = prev_word;
411  } else {
412  /* Make a new combination and insert before
413  * the first word being joined. */
414  copy_word = new WERD;
415  *copy_word = *(prev_word->word);
416  // deep copy
417  combo = new WERD_RES(copy_word);
418  combo->combination = TRUE;
419  combo->x_height = prev_word->x_height;
420  prev_word->part_of_combo = TRUE;
421  prev_word_it.add_before_then_move(combo);
422  }
423  combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
424  if (word->combination) {
425  combo->word->join_on(word->word);
426  // Move blobs to combo
427  // old combo no longer needed
428  delete word_it.extract();
429  } else {
430  // Copy current wd to combo
431  combo->copy_on(word);
432  word->part_of_combo = TRUE;
433  }
434  combo->done = FALSE;
435  combo->ClearResults();
436  } else {
437  prev_word_it = word_it; // catch up
438  }
439  }
440  prev_right = box.right();
441  }
442  }
443  } else {
444  words.clear(); // signal termination
445  }
446 }
447 
448 namespace tesseract {
449 void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
450  inT16 mode, BOOL8 improved) {
451  WERD_RES_IT word_res_it(&perm);
452 
453  if (debug_fix_space_level > 0) {
454  if (mode == 1) {
455  stats_.dump_words_str = "";
456  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
457  word_res_it.forward()) {
458  if (!word_res_it.data()->part_of_combo) {
459  stats_.dump_words_str +=
460  word_res_it.data()->best_choice->unichar_string();
461  stats_.dump_words_str += ' ';
462  }
463  }
464  }
465 
466  if (debug_fix_space_level > 1) {
467  switch (mode) {
468  case 1:
469  tprintf("EXTRACTED (%d): \"", score);
470  break;
471  case 2:
472  tprintf("TESTED (%d): \"", score);
473  break;
474  case 3:
475  tprintf("RETURNED (%d): \"", score);
476  break;
477  }
478 
479  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
480  word_res_it.forward()) {
481  if (!word_res_it.data()->part_of_combo) {
482  tprintf("%s/%1d ",
483  word_res_it.data()->best_choice->unichar_string().string(),
484  (int)word_res_it.data()->best_choice->permuter());
485  }
486  }
487  tprintf("\"\n");
488  } else if (improved) {
489  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
490  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
491  word_res_it.forward()) {
492  if (!word_res_it.data()->part_of_combo) {
493  tprintf("%s/%1d ",
494  word_res_it.data()->best_choice->unichar_string().string(),
495  (int)word_res_it.data()->best_choice->permuter());
496  }
497  }
498  tprintf("\"\n");
499  }
500  }
501 }
502 
504  if (word->done)
505  return TRUE;
506 
507  /*
508  Use all the standard pass 2 conditions for mode 5 in set_done() in
509  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
510  CARE WHETHER WE HAVE of/at on/an etc.
511  */
512  if (fixsp_done_mode > 0 &&
513  (word->tess_accepted ||
514  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
515  fixsp_done_mode == 3) &&
516  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
517  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
518  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
519  (word->best_choice->permuter() == USER_DAWG_PERM) ||
520  (word->best_choice->permuter() == NUMBER_PERM))) {
521  return TRUE;
522  } else {
523  return FALSE;
524  }
525 }
526 
527 
535 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
536  BLOCK* block) {
537  WERD_RES *word_res;
538  WERD_RES_LIST sub_word_list;
539  WERD_RES_IT sub_word_list_it(&sub_word_list);
540  inT16 blob_index;
541  inT16 new_length;
542  float junk;
543 
544  word_res = word_res_it.data();
545  if (word_res->word->flag(W_REP_CHAR) ||
546  word_res->combination ||
547  word_res->part_of_combo ||
548  !word_res->word->flag(W_DONT_CHOP))
549  return;
550 
551  blob_index = worst_noise_blob(word_res, &junk);
552  if (blob_index < 0)
553  return;
554 
555  if (debug_fix_space_level > 1) {
556  tprintf("FP fixspace working on \"%s\"\n",
557  word_res->best_choice->unichar_string().string());
558  }
559  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
560  sub_word_list_it.add_after_stay_put(word_res_it.extract());
561  fix_noisy_space_list(sub_word_list, row, block);
562  new_length = sub_word_list.length();
563  word_res_it.add_list_before(&sub_word_list);
564  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
565  word_res_it.forward();
566  }
567 }
568 
569 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
570  BLOCK* block) {
571  inT16 best_score;
572  WERD_RES_IT best_perm_it(&best_perm);
573  WERD_RES_LIST current_perm;
574  WERD_RES_IT current_perm_it(&current_perm);
575  WERD_RES *old_word_res;
576  inT16 current_score;
577  BOOL8 improved = FALSE;
578 
579  best_score = fp_eval_word_spacing(best_perm); // default score
580 
581  dump_words(best_perm, best_score, 1, improved);
582 
583  old_word_res = best_perm_it.data();
584  // Even deep_copy doesn't copy the underlying WERD unless its combination
585  // flag is true!.
586  old_word_res->combination = TRUE; // Kludge to force deep copy
587  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
588  old_word_res->combination = FALSE; // Undo kludge
589 
590  break_noisiest_blob_word(current_perm);
591 
592  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
593  match_current_words(current_perm, row, block);
594  current_score = fp_eval_word_spacing(current_perm);
595  dump_words(current_perm, current_score, 2, improved);
596  if (current_score > best_score) {
597  best_perm.clear();
598  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
599  best_score = current_score;
600  improved = TRUE;
601  }
602  if (current_score < PERFECT_WERDS) {
603  break_noisiest_blob_word(current_perm);
604  }
605  }
606  dump_words(best_perm, best_score, 3, improved);
607 }
608 
609 
615 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
616  WERD_RES_IT word_it(&words);
617  WERD_RES_IT worst_word_it;
618  float worst_noise_score = 9999;
619  int worst_blob_index = -1; // Noisiest blob of noisiest wd
620  int blob_index; // of wds noisiest blob
621  float noise_score; // of wds noisiest blob
622  WERD_RES *word_res;
623  C_BLOB_IT blob_it;
624  C_BLOB_IT rej_cblob_it;
625  C_BLOB_LIST new_blob_list;
626  C_BLOB_IT new_blob_it;
627  C_BLOB_IT new_rej_cblob_it;
628  WERD *new_word;
629  inT16 start_of_noise_blob;
630  inT16 i;
631 
632  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
633  blob_index = worst_noise_blob(word_it.data(), &noise_score);
634  if (blob_index > -1 && worst_noise_score > noise_score) {
635  worst_noise_score = noise_score;
636  worst_blob_index = blob_index;
637  worst_word_it = word_it;
638  }
639  }
640  if (worst_blob_index < 0) {
641  words.clear(); // signal termination
642  return;
643  }
644 
645  /* Now split the worst_word_it */
646 
647  word_res = worst_word_it.data();
648 
649  /* Move blobs before noise blob to a new bloblist */
650 
651  new_blob_it.set_to_list(&new_blob_list);
652  blob_it.set_to_list(word_res->word->cblob_list());
653  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
654  new_blob_it.add_after_then_move(blob_it.extract());
655  }
656  start_of_noise_blob = blob_it.data()->bounding_box().left();
657  delete blob_it.extract(); // throw out noise blob
658 
659  new_word = new WERD(&new_blob_list, word_res->word);
660  new_word->set_flag(W_EOL, FALSE);
661  word_res->word->set_flag(W_BOL, FALSE);
662  word_res->word->set_blanks(1); // After break
663 
664  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
665  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
666  for (;
667  (!rej_cblob_it.empty() &&
668  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
669  rej_cblob_it.forward()) {
670  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
671  }
672 
673  WERD_RES* new_word_res = new WERD_RES(new_word);
674  new_word_res->combination = TRUE;
675  worst_word_it.add_before_then_move(new_word_res);
676 
677  word_res->ClearResults();
678 }
679 
681  float *worst_noise_score) {
682  float noise_score[512];
683  int i;
684  int min_noise_blob; // 1st contender
685  int max_noise_blob; // last contender
686  int non_noise_count;
687  int worst_noise_blob; // Worst blob
688  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
689  float non_noise_limit = kBlnXHeight * 0.8;
690 
691  if (word_res->rebuild_word == NULL)
692  return -1; // Can't handle cube words.
693 
694  // Normalised.
695  int blob_count = word_res->box_word->length();
696  ASSERT_HOST(blob_count <= 512);
697  if (blob_count < 5)
698  return -1; // too short to split
699 
700  /* Get the noise scores for all blobs */
701 
702  #ifndef SECURE_NAMES
703  if (debug_fix_space_level > 5)
704  tprintf("FP fixspace Noise metrics for \"%s\": ",
705  word_res->best_choice->unichar_string().string());
706  #endif
707 
708  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
709  TBLOB* blob = word_res->rebuild_word->blobs[i];
710  if (word_res->reject_map[i].accepted())
711  noise_score[i] = non_noise_limit;
712  else
713  noise_score[i] = blob_noise_score(blob);
714 
715  if (debug_fix_space_level > 5)
716  tprintf("%1.1f ", noise_score[i]);
717  }
718  if (debug_fix_space_level > 5)
719  tprintf("\n");
720 
721  /* Now find the worst one which is far enough away from the end of the word */
722 
723  non_noise_count = 0;
724  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
725  if (noise_score[i] >= non_noise_limit) {
726  non_noise_count++;
727  }
728  }
729  if (non_noise_count < fixsp_non_noise_limit)
730  return -1;
731 
732  min_noise_blob = i;
733 
734  non_noise_count = 0;
735  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
736  i--) {
737  if (noise_score[i] >= non_noise_limit) {
738  non_noise_count++;
739  }
740  }
741  if (non_noise_count < fixsp_non_noise_limit)
742  return -1;
743 
744  max_noise_blob = i;
745 
746  if (min_noise_blob > max_noise_blob)
747  return -1;
748 
749  *worst_noise_score = small_limit;
750  worst_noise_blob = -1;
751  for (i = min_noise_blob; i <= max_noise_blob; i++) {
752  if (noise_score[i] < *worst_noise_score) {
753  worst_noise_blob = i;
754  *worst_noise_score = noise_score[i];
755  }
756  }
757  return worst_noise_blob;
758 }
759 
761  TBOX box; // BB of outline
762  inT16 outline_count = 0;
763  inT16 max_dimension;
764  inT16 largest_outline_dimension = 0;
765 
766  for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
767  outline_count++;
768  box = ol->bounding_box();
769  if (box.height() > box.width()) {
770  max_dimension = box.height();
771  } else {
772  max_dimension = box.width();
773  }
774 
775  if (largest_outline_dimension < max_dimension)
776  largest_outline_dimension = max_dimension;
777  }
778 
779  if (outline_count > 5) {
780  // penalise LOTS of blobs
781  largest_outline_dimension *= 2;
782  }
783 
784  box = blob->bounding_box();
785  if (box.bottom() > kBlnBaselineOffset * 4 ||
786  box.top() < kBlnBaselineOffset / 2) {
787  // Lax blob is if high or low
788  largest_outline_dimension /= 2;
789  }
790 
791  return largest_outline_dimension;
792 }
793 } // namespace tesseract
794 
795 void fixspace_dbg(WERD_RES *word) {
796  TBOX box = word->word->bounding_box();
797  BOOL8 show_map_detail = FALSE;
798  inT16 i;
799 
800  box.print();
801  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
802  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
803  word->word->cblob_list()->length(),
804  word->rebuild_word->NumBlobs(),
805  word->box_word->length());
806  word->reject_map.print(debug_fp);
807  tprintf("\n");
808  if (show_map_detail) {
809  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
810  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
811  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
812  word->reject_map[i].full_print(debug_fp);
813  }
814  }
815 
816  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
817  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
818 }
819 
820 
829 namespace tesseract {
830 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
831  WERD_RES_IT word_it(&word_res_list);
832  WERD_RES *word;
833  inT16 score = 0;
834  inT16 i;
835  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
836 
837  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
838  word = word_it.data();
839  if (word->rebuild_word == NULL)
840  continue; // Can't handle cube words.
841  if (word->done ||
842  word->tess_accepted ||
843  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
844  word->best_choice->permuter() == FREQ_DAWG_PERM ||
845  word->best_choice->permuter() == USER_DAWG_PERM ||
846  safe_dict_word(word) > 0) {
847  int num_blobs = word->rebuild_word->NumBlobs();
848  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
849  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
850  TBLOB* blob = word->rebuild_word->blobs[i];
851  if (word->best_choice->unichar_id(i) == space ||
852  blob_noise_score(blob) < small_limit) {
853  score -= 1; // penalise possibly erroneous non-space
854  } else if (word->reject_map[i].accepted()) {
855  score++;
856  }
857  }
858  }
859  }
860  if (score < 0)
861  score = 0;
862  return score;
863 }
864 
865 } // namespace tesseract
BOOL8 combination
Definition: pageres.h:318
BOOL8 tess_accepted
Definition: pageres.h:280
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:125
void ClearResults()
Definition: pageres.cpp:1141
void print(FILE *fp)
Definition: rejctmap.cpp:394
const int kBlnBaselineOffset
Definition: normalis.h:29
#define TRUE
Definition: capi.h:45
short inT16
Definition: host.h:33
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: werd.h:36
TWERD * rebuild_word
Definition: pageres.h:244
FILE * debug_fp
Definition: tessvars.cpp:24
const STRING & unichar_string() const
Definition: ratngs.h:525
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:633
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
void full_print(FILE *fp)
Definition: rejctmap.cpp:406
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:145
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:615
inT32 length() const
Definition: rejctmap.h:236
inT16 width() const
Definition: rect.h:111
void join_on(WERD *other)
Definition: werd.cpp:211
float x_height
Definition: pageres.h:295
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:569
inT16 reject_count()
Definition: rejctmap.h:242
int length() const
Definition: boxword.h:85
uinT8 permuter() const
Definition: ratngs.h:344
WERD_CHOICE * best_choice
Definition: pageres.h:219
TBOX bounding_box() const
Definition: blobs.cpp:482
unsigned char BOOL8
Definition: host.h:46
BOOL8 tess_failed
Definition: pageres.h:272
int c_blob_comparator(const void *blob1p, const void *blob2p)
Definition: genblob.cpp:30
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:239
inT16 progress
chars in this buffer(0)
Definition: ocrclass.h:118
BOOL8 part_of_combo
Definition: pageres.h:319
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:830
inT16 bottom() const
Definition: rect.h:61
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
tesseract::BoxWord * box_word
Definition: pageres.h:250
const char * string() const
Definition: strngs.cpp:201
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:503
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:449
TESSLINE * next
Definition: blobs.h:258
volatile inT8 ocr_alive
true if not last
Definition: ocrclass.h:123
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:795
#define MAX_INT16
Definition: host.h:52
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48
Definition: werd.h:60
#define FALSE
Definition: capi.h:46
Definition: werd.h:35
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196
CMD_EVENTS mode
Definition: pgedit.cpp:116
#define PERFECT_WERDS
Definition: fixspace.cpp:33
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:343
bool deadline_exceeded() const
Definition: ocrclass.h:158
inT16 left() const
Definition: rect.h:68
const int kBlnXHeight
Definition: normalis.h:28
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:372
void print() const
Definition: rect.h:270
int length() const
Definition: ratngs.h:301
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:168
inT16 height() const
Definition: rect.h:104
int inT32
Definition: host.h:35
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int NumBlobs() const
Definition: blobs.h:425
#define tprintf(...)
Definition: tprintf.h:31
Definition: ocrrow.h:32
Definition: strngs.h:44
TESSLINE * outlines
Definition: blobs.h:377
Definition: ocrblock.h:30
const UNICHARSET * uch_set
Definition: pageres.h:192
BOOL8 contains(const char c) const
Definition: strngs.cpp:192
Definition: blobs.h:261
inT16 top() const
Definition: rect.h:54
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:177
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1274
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
const STRING & unichar_lengths() const
Definition: ratngs.h:532
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:760
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
WERD * word
Definition: pageres.h:175
void copy_on(WERD_RES *word_res)
Definition: pageres.h:644
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1772
BOOL8 done
Definition: pageres.h:282
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:680
TBOX bounding_box() const
Definition: werd.cpp:160
REJMAP reject_map
Definition: pageres.h:271
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:535
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
int UNICHAR_ID
Definition: unichar.h:33