tesseract  3.05.02
control.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: control.cpp (Formerly control.c)
3  * Description: Module-independent matcher controller.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 11:09:58 BST 1992
6  * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <string.h>
27 #include <math.h>
28 #ifdef __UNIX__
29 #include <assert.h>
30 #include <unistd.h>
31 #include <errno.h>
32 #endif
33 #include <ctype.h>
34 #include "ocrclass.h"
35 #include "werdit.h"
36 #include "drawfx.h"
37 #include "tessbox.h"
38 #include "tessvars.h"
39 #include "pgedit.h"
40 #include "reject.h"
41 #include "fixspace.h"
42 #include "docqual.h"
43 #include "control.h"
44 #include "output.h"
45 #include "callcpp.h"
46 #include "globals.h"
47 #include "sorthelper.h"
48 #include "tesseractclass.h"
49 
50 #define MIN_FONT_ROW_COUNT 8
51 #define MAX_XHEIGHT_DIFF 3
52 
53 const char* const kBackUpConfigFile = "tempconfigdata.config";
54 // Min believable x-height for any text when refitting as a fraction of
55 // original x-height
56 const double kMinRefitXHeightFraction = 0.5;
57 
58 
65 namespace tesseract {
67  TBOX &selection_box) {
68  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
69  if (it != NULL) {
71  it->DeleteCurrentWord();
72  delete it;
73  }
74 }
75 
82  inT16 char_qual;
83  inT16 good_char_qual;
84 
85  WordData word_data(*pr_it);
86  SetupWordPassN(2, &word_data);
87  classify_word_and_language(2, pr_it, &word_data);
89  WERD_RES* word_res = pr_it->word();
90  word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
91  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
92  "char_quality: %d; good_char_quality: %d\n",
93  word_res->reject_map.length(),
94  word_blob_quality(word_res, pr_it->row()->row),
95  word_outline_errs(word_res), char_qual, good_char_qual);
96  }
97  return TRUE;
98 }
99 
100 // Helper function to check for a target word and handle it appropriately.
101 // Inspired by Jetsoft's requirement to process only single words on pass2
102 // and beyond.
103 // If word_config is not null:
104 // If the word_box and target_word_box overlap, read the word_config file
105 // else reset to previous config data.
106 // return true.
107 // else
108 // If the word_box and target_word_box overlap or pass <= 1, return true.
109 // Note that this function uses a fixed temporary file for storing the previous
110 // configs, so it is neither thread-safe, nor process-safe, but the assumption
111 // is that it will only be used for one debug window at a time.
112 //
113 // Since this function is used for debugging (and not to change OCR results)
114 // set only debug params from the word config file.
// Decides whether the word at word_box should be processed with respect to
// target_word_box, swapping params in and out as a side effect when a
// word_config is supplied.
//   word_box:        bounding box of the word under consideration.
//   target_word_box: bounding box of the word being targeted.
//   word_config:     params file to apply while on the target, or NULL.
//   pass:            pass number; only consulted when word_config is NULL.
// Returns false only when word_config is NULL, pass > 1 and word_box does not
// majorly overlap target_word_box. Returns true in every other case.
115 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
116  const TBOX& target_word_box,
117  const char* word_config,
118  int pass) {
119  if (word_config != NULL) {
120  if (word_box.major_overlap(target_word_box)) {
// backup_config_file_ doubles as the "backup outstanding" flag: the current
// params are dumped and the word config read only while it is still NULL.
121  if (backup_config_file_ == NULL) {
122  backup_config_file_ = kBackUpConfigFile;
// NOTE(review): the fopen result is handed to PrintParams and fclose without
// a NULL check - confirm what happens if the backup file cannot be created.
123  FILE* config_fp = fopen(backup_config_file_, "wb");
124  ParamUtils::PrintParams(config_fp, params());
125  fclose(config_fp);
// NOTE(review): listing line 127 is absent from this extract, so whatever
// argument sat between the two lines below is not shown here.
126  ParamUtils::ReadParamsFile(word_config,
128  params());
129  }
130  } else {
// Not on the target: if a backup is outstanding, read it back and clear the
// flag so that the next overlap takes a fresh backup.
131  if (backup_config_file_ != NULL) {
// NOTE(review): listing line 133 is likewise absent from this extract.
132  ParamUtils::ReadParamsFile(backup_config_file_,
134  params());
135  backup_config_file_ = NULL;
136  }
137  }
// Without a word_config, non-overlapping words are rejected only after pass 1.
138  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
139  return false;
140  }
141  return true;
142 }
143 
146  const TBOX* target_word_box,
147  const char* word_config,
148  PAGE_RES* page_res,
149  GenericVector<WordData>* words) {
150  // Prepare all the words.
151  PAGE_RES_IT page_res_it(page_res);
152  for (page_res_it.restart_page(); page_res_it.word() != NULL;
153  page_res_it.forward()) {
154  if (target_word_box == NULL ||
155  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
156  *target_word_box, word_config, 1)) {
157  words->push_back(WordData(page_res_it));
158  }
159  }
160  // Setup all the words for recognition with polygonal approximation.
161  for (int w = 0; w < words->size(); ++w) {
162  SetupWordPassN(pass_n, &(*words)[w]);
163  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
164  }
165 }
166 
167 // Sets up the single word ready for whichever engine is to be run.
//   pass_n: pass number; 1 and 2 get pass-specific preparation below.
//   word:   the WordData whose word and lang_words are (re)initialised.
// Nothing is done when pass_n != 1 and the word is already marked done.
168 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
169  if (pass_n == 1 || !word->word->done) {
170  if (pass_n == 1) {
// NOTE(review): listing lines 172-175 are absent from this extract, so the
// middle arguments of this call are not shown here.
171  word->word->SetupForRecognition(unicharset, this, BestPix(),
176  word->row, word->block);
177  } else if (pass_n == 2) {
178  // TODO(rays) Should we do this on pass1 too?
// Pass 2 clears caps_height and, if the word has no x_height of its own,
// takes the row's x_height.
179  word->word->caps_height = 0.0;
180  if (word->word->x_height == 0.0f)
181  word->word->x_height = word->row->x_height();
182  }
// lang_words is truncated to empty and refilled with one new WERD_RES per
// entry of sub_langs_, plus a final one for this Tesseract instance.
183  word->lang_words.truncate(0);
184  for (int s = 0; s <= sub_langs_.size(); ++s) {
185  // The sub_langs_.size() entry is for the master language.
186  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
187  WERD_RES* word_res = new WERD_RES;
188  word_res->InitForRetryRecognition(*word->word);
189  word->lang_words.push_back(word_res);
190  // Cube doesn't get setup for pass2.
// i.e. the per-language setup is skipped only when pass_n != 1 and that
// language's engine mode is OEM_CUBE_ONLY.
191  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
// NOTE(review): listing line 195 is absent from this extract, so one argument
// of this call is not shown here.
192  word_res->SetupForRecognition(
193  lang_t->unicharset, lang_t, BestPix(),
194  lang_t->tessedit_ocr_engine_mode, NULL,
196  lang_t->textord_use_cjk_fp_model,
197  lang_t->poly_allow_detailed_fx, word->row, word->block);
198  }
199  }
200  }
201 }
202 
203 // Runs word recognition on all the words.
//   pass_n:  pass number, forwarded to the per-word calls below.
//   monitor: optional progress/cancel/deadline hooks; may be NULL.
//   pr_it:   page iterator, restarted here and kept in step with words.
//   words:   the prepared WordData entries to recognise, in page order.
// Returns false if the monitor reports a deadline or cancellation, after
// calling SetupFake on the current and all remaining words. Returns true
// otherwise.
204 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
205  PAGE_RES_IT* pr_it,
206  GenericVector<WordData>* words) {
207  // TODO(rays) Before this loop can be parallelized (it would yield a massive
208  // speed-up) all remaining member globals need to be converted to local/heap
209  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
210  // added. The results will be significantly different with adaption on, and
211  // deterioration will need investigation.
212  pr_it->restart_page();
213  for (int w = 0; w < words->size(); ++w) {
214  WordData* word = &(*words)[w];
215  if (w > 0) word->prev_word = &(*words)[w - 1];
216  if (monitor != NULL) {
217  monitor->ocr_alive = TRUE;
// Progress is integer arithmetic: pass 1 reports 70 * w / size, any other
// pass reports 70 + 30 * w / size.
218  if (pass_n == 1) {
219  monitor->progress = 70 * w / words->size();
220  if (monitor->progress_callback != NULL) {
// The box reported is that of pr_it's current word, read before pr_it is
// synced to words[w] further down.
221  TBOX box = pr_it->word()->word->bounding_box();
222  (*monitor->progress_callback)(monitor->progress, box.left(),
223  box.right(), box.top(), box.bottom());
224  }
225  } else {
226  monitor->progress = 70 + 30 * w / words->size();
227  if (monitor->progress_callback != NULL) {
228  (*monitor->progress_callback)(monitor->progress, 0, 0, 0, 0);
229  }
230  }
231  if (monitor->deadline_exceeded() ||
232  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
233  words->size()))) {
234  // Timeout. Fake out the rest of the words.
235  for (; w < words->size(); ++w) {
236  (*words)[w].word->SetupFake(unicharset);
237  }
238  return false;
239  }
240  }
241  if (word->word->tess_failed) {
242  int s;
// Advances s past every leading lang_words entry that has tess_failed set.
243  for (s = 0; s < word->lang_words.size() &&
244  word->lang_words[s]->tess_failed; ++s) {}
245  // If all are failed, skip it. Image words are skipped by this test.
// NOTE(review): the loop above exits with s <= lang_words.size(), so the
// '>' comparison below cannot be true as written - confirm whether '==' was
// intended before relying on the skip described above.
246  if (s > word->lang_words.size()) continue;
247  }
248  // Sync pr_it with the wth WordData.
249  while (pr_it->word() != NULL && pr_it->word() != word->word)
250  pr_it->forward();
251  ASSERT_HOST(pr_it->word() != NULL);
252  bool make_next_word_fuzzy = false;
253  if (ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
254  // Needs to be setup again to see the new outlines in the chopped_word.
255  SetupWordPassN(pass_n, word);
256  }
257 
258  classify_word_and_language(pass_n, pr_it, word);
// NOTE(review): listing lines 259 and 261 are absent from this extract; the
// statement opening the block closed two lines below, and one tprintf
// argument, are therefore not shown here.
260  tprintf("Pass%d: %s [%s]\n", pass_n,
262  word->word->best_choice->debug_string().string());
263  }
// Step past the word just handled; the following word is made fuzzy when
// ReassignDiacritics asked for it and such a word exists.
264  pr_it->forward();
265  if (make_next_word_fuzzy && pr_it->word() != NULL) {
266  pr_it->MakeCurrentWordFuzzy();
267  }
268  }
269  return true;
270 }
271 
294  ETEXT_DESC* monitor,
295  const TBOX* target_word_box,
296  const char* word_config,
297  int dopasses) {
298  PAGE_RES_IT page_res_it(page_res);
299 
301  tessedit_test_adaption.set_value (TRUE);
302  tessedit_minimal_rejection.set_value (TRUE);
303  }
304 
305  if (dopasses==0 || dopasses==1) {
306  page_res_it.restart_page();
307  // ****************** Pass 1 *******************
308 
309  // If the adaptive classifier is full switch to one we prepared earlier,
310  // ie on the previous page. If the current adaptive classifier is non-empty,
311  // prepare a backup starting at this page, in case it fills up. Do all this
312  // independently for each language.
313  if (AdaptiveClassifierIsFull()) {
315  } else if (!AdaptiveClassifierIsEmpty()) {
317  }
318  // Now check the sub-langs as well.
319  for (int i = 0; i < sub_langs_.size(); ++i) {
320  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
321  sub_langs_[i]->SwitchAdaptiveClassifier();
322  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
323  sub_langs_[i]->StartBackupAdaptiveClassifier();
324  }
325  }
326  // Set up all words ready for recognition, so that if parallelism is on
327  // all the input and output classes are ready to run the classifier.
329  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
330  if (tessedit_parallelize) {
331  PrerecAllWordsPar(words);
332  }
333 
334  stats_.word_count = words.size();
335 
336  stats_.dict_words = 0;
337  stats_.doc_blob_quality = 0;
338  stats_.doc_outline_errs = 0;
339  stats_.doc_char_quality = 0;
340  stats_.good_char_count = 0;
341  stats_.doc_good_char_quality = 0;
342 
343  most_recently_used_ = this;
344  // Run pass 1 word recognition.
345  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
346  // Pass 1 post-processing.
347  for (page_res_it.restart_page(); page_res_it.word() != NULL;
348  page_res_it.forward()) {
349  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
350  fix_rep_char(&page_res_it);
351  continue;
352  }
353 
354  // Count dict words.
355  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
356  ++(stats_.dict_words);
357 
358  // Update misadaption log (we only need to do it on pass 1, since
359  // adaption only happens on this pass).
360  if (page_res_it.word()->blamer_bundle != NULL &&
361  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
362  page_res->misadaption_log.push_back(
363  page_res_it.word()->blamer_bundle->misadaption_debug());
364  }
365  }
366  }
367 
368  if (dopasses == 1) return true;
369 
370  // ****************** Pass 2 *******************
372  AnyTessLang()) {
373  page_res_it.restart_page();
375  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
376  if (tessedit_parallelize) {
377  PrerecAllWordsPar(words);
378  }
379  most_recently_used_ = this;
380  // Run pass 2 word recognition.
381  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
382  }
383 
384  // The next passes can only be run if tesseract has been used, as cube
385  // doesn't set all the necessary outputs in WERD_RES.
386  if (AnyTessLang()) {
387  // ****************** Pass 3 *******************
388  // Fix fuzzy spaces.
390 
393  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
394 
395  // ****************** Pass 4 *******************
398 
399  // ****************** Pass 5,6 *******************
400  rejection_passes(page_res, monitor, target_word_box, word_config);
401 
402 #ifndef NO_CUBE_BUILD
403  // ****************** Pass 7 *******************
404  // Cube combiner.
405  // If cube is loaded and its combiner is present, run it.
407  run_cube_combiner(page_res);
408  }
409 #endif
410 
411  // ****************** Pass 8 *******************
412  font_recognition_pass(page_res);
413 
414  // ****************** Pass 9 *******************
415  // Check the correctness of the final results.
416  blamer_pass(page_res);
417  script_pos_pass(page_res);
418  }
419 
420  // Write results pass.
422  // This is now redundant, but retained commented to show how to obtain
423  // bounding boxes and style information.
424 
425  // changed by jetsoft
426  // needed for dll to output memory structure
427  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
428  output_pass(page_res_it, target_word_box);
429  // end jetsoft
430  PageSegMode pageseg_mode = static_cast<PageSegMode>(
431  static_cast<int>(tessedit_pageseg_mode));
432  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
433 
434  // Remove empty words, as these mess up the result iterators.
435  for (page_res_it.restart_page(); page_res_it.word() != NULL;
436  page_res_it.forward()) {
437  WERD_RES* word = page_res_it.word();
438  if (word->best_choice == NULL || word->best_choice->length() == 0)
439  page_res_it.DeleteCurrentWord();
440  }
441 
442  if (monitor != NULL) {
443  monitor->progress = 100;
444  }
445  return true;
446 }
447 
449  PAGE_RES_IT word_it(page_res);
450 
451  WERD_RES *w_prev = NULL;
452  WERD_RES *w = word_it.word();
453  while (1) {
454  w_prev = w;
455  while (word_it.forward() != NULL &&
456  (!word_it.word() || word_it.word()->part_of_combo)) {
457  // advance word_it, skipping over parts of combos
458  }
459  if (!word_it.word()) break;
460  w = word_it.word();
461  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
462  continue;
463  }
464  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
465  if (tessedit_bigram_debug) {
466  tprintf("Skipping because one of the words is W_REP_CHAR\n");
467  }
468  continue;
469  }
470  // Two words sharing the same language model, excellent!
471  GenericVector<WERD_CHOICE *> overrides_word1;
472  GenericVector<WERD_CHOICE *> overrides_word2;
473 
474  STRING orig_w1_str = w_prev->best_choice->unichar_string();
475  STRING orig_w2_str = w->best_choice->unichar_string();
476  WERD_CHOICE prev_best(w->uch_set);
477  {
478  int w1start, w1end;
479  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
480  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
481  }
482  WERD_CHOICE this_best(w->uch_set);
483  {
484  int w2start, w2end;
485  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
486  this_best = w->best_choice->shallow_copy(w2start, w2end);
487  }
488 
489  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
490  if (tessedit_bigram_debug) {
491  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
492  orig_w1_str.string(), orig_w2_str.string());
493  }
494  continue;
495  }
496  if (tessedit_bigram_debug > 2) {
497  tprintf("Examining alt choices for \"%s %s\".\n",
498  orig_w1_str.string(), orig_w2_str.string());
499  }
500  if (tessedit_bigram_debug > 1) {
501  if (!w_prev->best_choices.singleton()) {
502  w_prev->PrintBestChoices();
503  }
504  if (!w->best_choices.singleton()) {
505  w->PrintBestChoices();
506  }
507  }
508  float best_rating = 0.0;
509  int best_idx = 0;
510  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
511  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
512  WERD_CHOICE *p1 = prev_it.data();
513  WERD_CHOICE strip1(w->uch_set);
514  {
515  int p1start, p1end;
516  p1->GetNonSuperscriptSpan(&p1start, &p1end);
517  strip1 = p1->shallow_copy(p1start, p1end);
518  }
519  WERD_CHOICE_IT w_it(&w->best_choices);
520  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
521  WERD_CHOICE *p2 = w_it.data();
522  WERD_CHOICE strip2(w->uch_set);
523  {
524  int p2start, p2end;
525  p2->GetNonSuperscriptSpan(&p2start, &p2end);
526  strip2 = p2->shallow_copy(p2start, p2end);
527  }
528  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
529  overrides_word1.push_back(p1);
530  overrides_word2.push_back(p2);
531  if (overrides_word1.size() == 1 ||
532  p1->rating() + p2->rating() < best_rating) {
533  best_rating = p1->rating() + p2->rating();
534  best_idx = overrides_word1.size() - 1;
535  }
536  }
537  }
538  }
539  if (!overrides_word1.empty()) {
540  // Excellent, we have some bigram matches.
542  *overrides_word1[best_idx]) &&
544  *overrides_word2[best_idx])) {
545  if (tessedit_bigram_debug > 1) {
546  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
547  "model.\n", orig_w1_str.string(), orig_w2_str.string());
548  }
549  continue;
550  }
551  STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
552  STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
553  if (new_w1_str != orig_w1_str) {
554  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
555  }
556  if (new_w2_str != orig_w2_str) {
557  w->ReplaceBestChoice(overrides_word2[best_idx]);
558  }
559  if (tessedit_bigram_debug > 0) {
560  STRING choices_description;
561  int num_bigram_choices
562  = overrides_word1.size() * overrides_word2.size();
563  if (num_bigram_choices == 1) {
564  choices_description = "This was the unique bigram choice.";
565  } else {
566  if (tessedit_bigram_debug > 1) {
567  STRING bigrams_list;
568  const int kMaxChoicesToPrint = 20;
569  for (int i = 0; i < overrides_word1.size() &&
570  i < kMaxChoicesToPrint; i++) {
571  if (i > 0) { bigrams_list += ", "; }
572  WERD_CHOICE *p1 = overrides_word1[i];
573  WERD_CHOICE *p2 = overrides_word2[i];
574  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
575  if (i == kMaxChoicesToPrint) {
576  bigrams_list += " ...";
577  }
578  }
579  choices_description = "There were many choices: {";
580  choices_description += bigrams_list;
581  choices_description += "}";
582  } else {
583  choices_description.add_str_int("There were ", num_bigram_choices);
584  choices_description += " compatible bigrams.";
585  }
586  }
587  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
588  orig_w1_str.string(), orig_w2_str.string(),
589  new_w1_str.string(), new_w2_str.string(),
590  choices_description.string());
591  }
592  }
593  }
594 }
595 
597  ETEXT_DESC* monitor,
598  const TBOX* target_word_box,
599  const char* word_config) {
600  PAGE_RES_IT page_res_it(page_res);
601  // ****************** Pass 5 *******************
602  // Gather statistics on rejects.
603  int word_index = 0;
604  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
606  WERD_RES* word = page_res_it.word();
607  word_index++;
608  if (monitor != NULL) {
609  monitor->ocr_alive = TRUE;
610  monitor->progress = 95 + 5 * word_index / stats_.word_count;
611  }
612  if (word->rebuild_word == NULL) {
613  // Word was not processed by tesseract.
614  page_res_it.forward();
615  continue;
616  }
617  check_debug_pt(word, 70);
618 
619  // changed by jetsoft
620  // specific to its needs to extract one word when need
621  if (target_word_box &&
623  *target_word_box, word_config, 4)) {
624  page_res_it.forward();
625  continue;
626  }
627  // end jetsoft
628 
629  page_res_it.rej_stat_word();
630  int chars_in_word = word->reject_map.length();
631  int rejects_in_word = word->reject_map.reject_count();
632 
633  int blob_quality = word_blob_quality(word, page_res_it.row()->row);
634  stats_.doc_blob_quality += blob_quality;
635  int outline_errs = word_outline_errs(word);
636  stats_.doc_outline_errs += outline_errs;
637  inT16 all_char_quality;
638  inT16 accepted_all_char_quality;
639  word_char_quality(word, page_res_it.row()->row,
640  &all_char_quality, &accepted_all_char_quality);
641  stats_.doc_char_quality += all_char_quality;
642  uinT8 permuter_type = word->best_choice->permuter();
643  if ((permuter_type == SYSTEM_DAWG_PERM) ||
644  (permuter_type == FREQ_DAWG_PERM) ||
645  (permuter_type == USER_DAWG_PERM)) {
646  stats_.good_char_count += chars_in_word - rejects_in_word;
647  stats_.doc_good_char_quality += accepted_all_char_quality;
648  }
649  check_debug_pt(word, 80);
651  (blob_quality == 0) && (outline_errs >= chars_in_word))
653  check_debug_pt(word, 90);
654  page_res_it.forward();
655  }
656 
658  tprintf
659  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
660  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
661  page_res->char_count, page_res->rej_count,
662  page_res->rej_count / static_cast<float>(page_res->char_count),
663  stats_.doc_blob_quality,
664  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
665  stats_.doc_outline_errs,
666  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
667  stats_.doc_char_quality,
668  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
669  stats_.doc_good_char_quality,
670  (stats_.good_char_count > 0) ?
671  (stats_.doc_good_char_quality /
672  static_cast<float>(stats_.good_char_count)) : 0.0);
673  }
674  BOOL8 good_quality_doc =
675  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
676  quality_rej_pc) &&
677  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
678  quality_blob_pc) &&
679  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
681  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
683 
684  // ****************** Pass 6 *******************
685  // Do whole document or whole block rejection pass
686  if (!tessedit_test_adaption) {
688  quality_based_rejection(page_res_it, good_quality_doc);
689  }
690 }
691 
693  if (!wordrec_run_blamer) return;
694  PAGE_RES_IT page_res_it(page_res);
695  for (page_res_it.restart_page(); page_res_it.word() != NULL;
696  page_res_it.forward()) {
697  WERD_RES *word = page_res_it.word();
700  }
701  tprintf("Blame reasons:\n");
702  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
704  static_cast<IncorrectResultReason>(bl)),
705  page_res->blame_reasons[bl]);
706  }
707  if (page_res->misadaption_log.length() > 0) {
708  tprintf("Misadaption log:\n");
709  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
710  tprintf("%s\n", page_res->misadaption_log[i].string());
711  }
712  }
713 }
714 
715 // Sets script positions and detects smallcaps on all output words.
717  PAGE_RES_IT page_res_it(page_res);
718  for (page_res_it.restart_page(); page_res_it.word() != NULL;
719  page_res_it.forward()) {
720  WERD_RES* word = page_res_it.word();
721  if (word->word->flag(W_REP_CHAR)) {
722  page_res_it.forward();
723  continue;
724  }
725  float x_height = page_res_it.block()->block->x_height();
726  float word_x_height = word->x_height;
727  if (word_x_height < word->best_choice->min_x_height() ||
728  word_x_height > word->best_choice->max_x_height()) {
729  word_x_height = (word->best_choice->min_x_height() +
730  word->best_choice->max_x_height()) / 2.0f;
731  }
732  // Test for small caps. Word capheight must be close to block xheight,
733  // and word must contain no lower case letters, and at least one upper case.
734  double small_cap_xheight = x_height * kXHeightCapRatio;
735  double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
736  if (word->uch_set->script_has_xheight() &&
737  small_cap_xheight - small_cap_delta <= word_x_height &&
738  word_x_height <= small_cap_xheight + small_cap_delta) {
739  // Scan for upper/lower.
740  int num_upper = 0;
741  int num_lower = 0;
742  for (int i = 0; i < word->best_choice->length(); ++i) {
743  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
744  ++num_upper;
745  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
746  ++num_lower;
747  }
748  if (num_upper > 0 && num_lower == 0)
749  word->small_caps = true;
750  }
751  word->SetScriptPositions();
752  }
753 }
754 
755 // Factored helper considers the indexed word and updates all the pointed
756 // values.
757 static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
758  float* rating, float* certainty, bool* bad,
759  bool* valid_permuter, int* right, int* next_left) {
760  *right = -MAX_INT32;
761  *next_left = MAX_INT32;
762  if (index < words.size()) {
763  WERD_CHOICE* choice = words[index]->best_choice;
764  if (choice == NULL) {
765  *bad = true;
766  } else {
767  *rating += choice->rating();
768  *certainty = MIN(*certainty, choice->certainty());
769  if (!Dict::valid_word_permuter(choice->permuter(), false))
770  *valid_permuter = false;
771  }
772  *right = words[index]->word->bounding_box().right();
773  if (index + 1 < words.size())
774  *next_left = words[index + 1]->word->bounding_box().left();
775  } else {
776  *valid_permuter = false;
777  *bad = true;
778  }
779 }
780 
781 // Helper chooses the best combination of words, transferring good ones from
782 // new_words to best_words. To win, a new word must have (better rating and
783 // certainty) or (better permuter status and rating within rating ratio and
784 // certainty within certainty margin) than current best.
785 // All the new_words are consumed (moved to best_words or deleted.)
786 // The return value is the number of new_words used minus the number of
787 // best_words that remain in the output.
788 static int SelectBestWords(double rating_ratio,
789  double certainty_margin,
790  bool debug,
791  PointerVector<WERD_RES>* new_words,
792  PointerVector<WERD_RES>* best_words) {
793  // Process the smallest groups of words that have an overlapping word
794  // boundary at the end.
795  GenericVector<WERD_RES*> out_words;
796  // Index into each word vector (best, new).
797  int b = 0, n = 0;
798  int num_best = 0, num_new = 0;
799  while (b < best_words->size() || n < new_words->size()) {
800  // Start of the current run in each.
801  int start_b = b, start_n = n;
802  // Rating of the current run in each.
803  float b_rating = 0.0f, n_rating = 0.0f;
804  // Certainty of the current run in each.
805  float b_certainty = 0.0f, n_certainty = 0.0f;
806  // True if any word is missing its best choice.
807  bool b_bad = false, n_bad = false;
808  // True if all words have a valid permuter.
809  bool b_valid_permuter = true, n_valid_permuter = true;
810 
811  while (b < best_words->size() || n < new_words->size()) {
812  int b_right = -MAX_INT32;
813  int next_b_left = MAX_INT32;
814  EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
815  &b_valid_permuter, &b_right, &next_b_left);
816  int n_right = -MAX_INT32;
817  int next_n_left = MAX_INT32;
818  EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
819  &n_valid_permuter, &n_right, &next_n_left);
820  if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
821  // The word breaks overlap. [start_b,b] and [start_n, n] match.
822  break;
823  }
824  // Keep searching for the matching word break.
825  if ((b_right < n_right && b < best_words->size()) ||
826  n == new_words->size())
827  ++b;
828  else
829  ++n;
830  }
831  bool new_better = false;
832  if (!n_bad && (b_bad || (n_certainty > b_certainty &&
833  n_rating < b_rating) ||
834  (!b_valid_permuter && n_valid_permuter &&
835  n_rating < b_rating * rating_ratio &&
836  n_certainty > b_certainty - certainty_margin))) {
837  // New is better.
838  for (int i = start_n; i <= n; ++i) {
839  out_words.push_back((*new_words)[i]);
840  (*new_words)[i] = NULL;
841  ++num_new;
842  }
843  new_better = true;
844  } else if (!b_bad) {
845  // Current best is better.
846  for (int i = start_b; i <= b; ++i) {
847  out_words.push_back((*best_words)[i]);
848  (*best_words)[i] = NULL;
849  ++num_best;
850  }
851  }
852  int end_b = b < best_words->size() ? b + 1 : b;
853  int end_n = n < new_words->size() ? n + 1 : n;
854  if (debug) {
855  tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
856  " valid dict: %d v %d\n",
857  end_n - start_n, new_better ? "better" : "worse",
858  end_b - start_b, n_rating, b_rating,
859  n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
860  }
861  // Move on to the next group.
862  b = end_b;
863  n = end_n;
864  }
865  // Transfer from out_words to best_words.
866  best_words->clear();
867  for (int i = 0; i < out_words.size(); ++i)
868  best_words->push_back(out_words[i]);
869  return num_new - num_best;
870 }
871 
872 // Helper to recognize the word using the given (language-specific) tesseract.
873 // Returns positive if this recognizer found more new best words than the
874 // number kept from best_words.
876  WordRecognizer recognizer,
877  WERD_RES** in_word,
878  PointerVector<WERD_RES>* best_words) {
879  bool debug = classify_debug_level || cube_debug_level;
880  if (debug) {
881  tprintf("Trying word using lang %s, oem %d\n",
882  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
883  }
884  // Run the recognizer on the word.
885  PointerVector<WERD_RES> new_words;
886  (this->*recognizer)(word_data, in_word, &new_words);
887  if (new_words.empty()) {
888  // Transfer input word to new_words, as the classifier must have put
889  // the result back in the input.
890  new_words.push_back(*in_word);
891  *in_word = NULL;
892  }
893  if (debug) {
894  for (int i = 0; i < new_words.size(); ++i)
895  new_words[i]->DebugTopChoice("Lang result");
896  }
897  // Initial version is a bit of a hack based on better certainty and rating
898  // (to reduce false positives from cube) or a dictionary vs non-dictionary
899  // word.
900  return SelectBestWords(classify_max_rating_ratio,
902  debug, &new_words, best_words);
903 }
904 
905 // Helper returns true if all the words are acceptable.
906 static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
907  for (int w = 0; w < words.size(); ++w) {
908  if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
909  }
910  return true;
911 }
912 
913 // Moves good-looking "noise"/diacritics from the reject list to the main
914 // blob list on the current word. Returns true if anything was done, and
915 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
917  bool* make_next_word_fuzzy) {
918  *make_next_word_fuzzy = false;
919  WERD* real_word = pr_it->word()->word;
920  if (real_word->rej_cblob_list()->empty() ||
921  real_word->cblob_list()->empty() ||
922  real_word->rej_cblob_list()->length() > noise_maxperword)
923  return false;
924  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
925  // Get the noise outlines into a vector with matching bool map.
926  GenericVector<C_OUTLINE*> outlines;
927  real_word->GetNoiseOutlines(&outlines);
928  GenericVector<bool> word_wanted;
929  GenericVector<bool> overlapped_any_blob;
930  GenericVector<C_BLOB*> target_blobs;
931  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
932  &word_wanted, &overlapped_any_blob,
933  &target_blobs);
934  // Filter the outlines that overlapped any blob and put them into the word
935  // now. This simplifies the remaining task and also makes it more accurate
936  // as it has more completed blobs to work on.
937  GenericVector<bool> wanted;
938  GenericVector<C_BLOB*> wanted_blobs;
939  GenericVector<C_OUTLINE*> wanted_outlines;
940  int num_overlapped = 0;
941  int num_overlapped_used = 0;
942  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
943  if (overlapped_any_blob[i]) {
944  ++num_overlapped;
945  if (word_wanted[i]) ++num_overlapped_used;
946  wanted.push_back(word_wanted[i]);
947  wanted_blobs.push_back(target_blobs[i]);
948  wanted_outlines.push_back(outlines[i]);
949  outlines[i] = NULL;
950  }
951  }
952  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
953  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
954  &target_blobs);
955  int non_overlapped = 0;
956  int non_overlapped_used = 0;
957  for (int i = 0; i < word_wanted.size(); ++i) {
958  if (word_wanted[i]) ++non_overlapped_used;
959  if (outlines[i] != NULL) ++non_overlapped_used;
960  }
961  if (debug_noise_removal) {
962  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
963  num_overlapped_used, num_overlapped, non_overlapped_used,
964  non_overlapped);
965  real_word->bounding_box().print();
966  }
967  // Now we have decided which outlines we want, put them into the real_word.
968  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
969  make_next_word_fuzzy)) {
970  pr_it->MakeCurrentWordFuzzy();
971  }
972  // TODO(rays) Parts of combos have a deep copy of the real word, and need
973  // to have their noise outlines moved/assigned in the same way!!
974  return num_overlapped_used != 0 || non_overlapped_used != 0;
975 }
976 
977 // Attempts to put noise/diacritic outlines into the blobs that they overlap.
978 // Input: a set of noisy outlines that probably belong to the real_word.
979 // Output: word_wanted indicates which outlines are to be assigned to a blob,
980 // target_blobs indicates which to assign to, and overlapped_any_blob is
981 // true for all outlines that overlapped a blob.
983  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
984  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
985  GenericVector<bool>* overlapped_any_blob,
986  GenericVector<C_BLOB*>* target_blobs) {
987  GenericVector<bool> blob_wanted;
988  word_wanted->init_to_size(outlines.size(), false);
989  overlapped_any_blob->init_to_size(outlines.size(), false);
990  target_blobs->init_to_size(outlines.size(), NULL);
991  // For each real blob, find the outlines that seriously overlap it.
992  // A single blob could be several merged characters, so there can be quite
993  // a few outlines overlapping, and the full engine needs to be used to chop
994  // and join to get a sensible result.
995  C_BLOB_IT blob_it(real_word->cblob_list());
996  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
997  C_BLOB* blob = blob_it.data();
998  TBOX blob_box = blob->bounding_box();
999  blob_wanted.init_to_size(outlines.size(), false);
1000  int num_blob_outlines = 0;
1001  for (int i = 0; i < outlines.size(); ++i) {
1002  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1003  !(*word_wanted)[i]) {
1004  blob_wanted[i] = true;
1005  (*overlapped_any_blob)[i] = true;
1006  ++num_blob_outlines;
1007  }
1008  }
1009  if (debug_noise_removal) {
1010  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1011  blob_box.print();
1012  }
1013  // If any outlines overlap the blob, and not too many, classify the blob
1014  // (using the full engine, languages and all), and choose the maximal
1015  // combination of outlines that doesn't hurt the end-result classification
1016  // by too much. Mark them as wanted.
1017  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1018  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1019  outlines, num_blob_outlines,
1020  &blob_wanted)) {
1021  for (int i = 0; i < blob_wanted.size(); ++i) {
1022  if (blob_wanted[i]) {
1023  // Claim the outline and record where it is going.
1024  (*word_wanted)[i] = true;
1025  (*target_blobs)[i] = blob;
1026  }
1027  }
1028  }
1029  }
1030  }
1031 }
1032 
1033 // Attempts to assign non-overlapping outlines to their nearest blobs or
1034 // make new blobs out of them.
1036  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1037  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1038  GenericVector<C_BLOB*>* target_blobs) {
1039  GenericVector<bool> blob_wanted;
1040  word_wanted->init_to_size(outlines.size(), false);
1041  target_blobs->init_to_size(outlines.size(), NULL);
1042  // Check for outlines that need to be turned into stand-alone blobs.
1043  for (int i = 0; i < outlines.size(); ++i) {
1044  if (outlines[i] == NULL) continue;
1045  // Get a set of adjacent outlines that don't overlap any existing blob.
1046  blob_wanted.init_to_size(outlines.size(), false);
1047  int num_blob_outlines = 0;
1048  TBOX total_ol_box(outlines[i]->bounding_box());
1049  while (i < outlines.size() && outlines[i] != NULL) {
1050  blob_wanted[i] = true;
1051  total_ol_box += outlines[i]->bounding_box();
1052  ++i;
1053  ++num_blob_outlines;
1054  }
1055  // Find the insertion point.
1056  C_BLOB_IT blob_it(real_word->cblob_list());
1057  while (!blob_it.at_last() &&
1058  blob_it.data_relative(1)->bounding_box().left() <=
1059  total_ol_box.left()) {
1060  blob_it.forward();
1061  }
1062  // Choose which combination of them we actually want and where to put
1063  // them.
1064  if (debug_noise_removal)
1065  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1066  C_BLOB* left_blob = blob_it.data();
1067  TBOX left_box = left_blob->bounding_box();
1068  C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
1069  if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
1070  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1071  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1072  outlines, num_blob_outlines,
1073  &blob_wanted)) {
1074  if (debug_noise_removal) tprintf("Added to left blob\n");
1075  for (int j = 0; j < blob_wanted.size(); ++j) {
1076  if (blob_wanted[j]) {
1077  (*word_wanted)[j] = true;
1078  (*target_blobs)[j] = left_blob;
1079  }
1080  }
1081  } else if (right_blob != NULL &&
1082  (!left_box.x_overlap(total_ol_box) ||
1083  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1085  right_blob, outlines,
1086  num_blob_outlines, &blob_wanted)) {
1087  if (debug_noise_removal) tprintf("Added to right blob\n");
1088  for (int j = 0; j < blob_wanted.size(); ++j) {
1089  if (blob_wanted[j]) {
1090  (*word_wanted)[j] = true;
1091  (*target_blobs)[j] = right_blob;
1092  }
1093  }
1094  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
1095  outlines, num_blob_outlines,
1096  &blob_wanted)) {
1097  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1098  for (int j = 0; j < blob_wanted.size(); ++j) {
1099  if (blob_wanted[j]) {
1100  (*word_wanted)[j] = true;
1101  (*target_blobs)[j] = NULL;
1102  }
1103  }
1104  }
1105  }
1106 }
1107 
1108 // Starting with ok_outlines set to indicate which outlines overlap the blob,
1109 // chooses the optimal set (approximately) and returns true if any outlines
1110 // are desired, in which case ok_outlines indicates which ones.
1112  int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
1113  const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
1114  GenericVector<bool>* ok_outlines) {
1115  STRING best_str;
1116  float target_cert = certainty_threshold;
1117  if (blob != NULL) {
1118  float target_c2;
1119  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1120  if (debug_noise_removal) {
1121  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1122  target_cert, target_c2);
1123  blob->bounding_box().print();
1124  }
1125  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1126  }
1127  GenericVector<bool> test_outlines = *ok_outlines;
1128  // Start with all the outlines in.
1129  STRING all_str;
1130  GenericVector<bool> best_outlines = *ok_outlines;
1131  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1132  pr_it, blob, &all_str);
1133  if (debug_noise_removal) {
1134  TBOX ol_box;
1135  for (int i = 0; i < test_outlines.size(); ++i) {
1136  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1137  }
1138  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1139  all_str.string(), best_cert, best_cert - target_cert);
1140  ol_box.print();
1141  }
1142  // Iteratively zero out the bit that improves the certainty the most, until
1143  // we get past the threshold, have zero bits, or fail to improve.
1144  int best_index = 0; // To zero out.
1145  while (num_outlines > 1 && best_index >= 0 &&
1146  (blob == NULL || best_cert < target_cert || blob != NULL)) {
1147  // Find the best bit to zero out.
1148  best_index = -1;
1149  for (int i = 0; i < outlines.size(); ++i) {
1150  if (test_outlines[i]) {
1151  test_outlines[i] = false;
1152  STRING str;
1153  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1154  pr_it, blob, &str);
1155  if (debug_noise_removal) {
1156  TBOX ol_box;
1157  for (int j = 0; j < outlines.size(); ++j) {
1158  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1159  tprintf("%d", test_outlines[j]);
1160  }
1161  tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1162  cert, cert - target_cert);
1163  ol_box.print();
1164  }
1165  if (cert > best_cert) {
1166  best_cert = cert;
1167  best_index = i;
1168  best_outlines = test_outlines;
1169  }
1170  test_outlines[i] = true;
1171  }
1172  }
1173  if (best_index >= 0) {
1174  test_outlines[best_index] = false;
1175  --num_outlines;
1176  }
1177  }
1178  if (best_cert >= target_cert) {
1179  // Save the best combination.
1180  *ok_outlines = best_outlines;
1181  if (debug_noise_removal) {
1182  tprintf("%s noise combination ", blob ? "Adding" : "New");
1183  for (int i = 0; i < best_outlines.size(); ++i) {
1184  tprintf("%d", best_outlines[i]);
1185  }
1186  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1187  target_cert);
1188  }
1189  return true;
1190  }
1191  return false;
1192 }
1193 
1194 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1195 // the inclusion of the outlines, and returns the certainty of the raw choice.
1197  const GenericVector<bool>& ok_outlines,
1198  const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
1199  C_BLOB* blob, STRING* best_str) {
1200  C_OUTLINE_IT ol_it;
1201  C_OUTLINE* first_to_keep = NULL;
1202  if (blob != NULL) {
1203  // Add the required outlines to the blob.
1204  ol_it.set_to_list(blob->out_list());
1205  first_to_keep = ol_it.data();
1206  }
1207  for (int i = 0; i < ok_outlines.size(); ++i) {
1208  if (ok_outlines[i]) {
1209  // This outline is to be added.
1210  if (blob == NULL) {
1211  blob = new C_BLOB(outlines[i]);
1212  ol_it.set_to_list(blob->out_list());
1213  } else {
1214  ol_it.add_before_stay_put(outlines[i]);
1215  }
1216  }
1217  }
1218  float c2;
1219  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1220  ol_it.move_to_first();
1221  if (first_to_keep == NULL) {
1222  // We created blob. Empty its outlines and delete it.
1223  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1224  delete blob;
1225  cert = -c2;
1226  } else {
1227  // Remove the outlines that we put in.
1228  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1229  ol_it.extract();
1230  }
1231  }
1232  return cert;
1233 }
1234 
1235 // Classifies the given blob (part of word_data->word->word) as an individual
1236 // word, using languages, chopper etc, returning only the certainty of the
1237 // best raw choice, and undoing all the work done to fake out the word.
1239  C_BLOB* blob, STRING* best_str, float* c2) {
1240  WERD* real_word = pr_it->word()->word;
1241  WERD* word = real_word->ConstructFromSingleBlob(
1242  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1243  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1244  // Get a new iterator that points to the new word.
1245  PAGE_RES_IT it(pr_it->page_res);
1246  while (it.word() != word_res && it.word() != NULL) it.forward();
1247  ASSERT_HOST(it.word() == word_res);
1248  WordData wd(it);
1249  // Force full initialization.
1250  SetupWordPassN(1, &wd);
1251  classify_word_and_language(pass_n, &it, &wd);
1252  if (debug_noise_removal) {
1253  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1254  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1255  wd.word->raw_choice->max_x_height());
1256  }
1257  float cert = wd.word->raw_choice->certainty();
1258  float rat = wd.word->raw_choice->rating();
1259  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1260  *best_str = wd.word->raw_choice->unichar_string();
1261  it.DeleteCurrentWord();
1262  pr_it->ResetWordIterator();
1263  return cert;
1264 }
1265 
1266 // Generic function for classifying a word. Can be used either for pass1 or
1267 // pass2 according to the function passed to recognizer.
1268 // word_data holds the word to be recognized, and its block and row, and
1269 // pr_it points to the word as well, in case we are running LSTM and it wants
1270 // to output multiple words.
1271 // Recognizes in the current language, and if successful that is all.
1272 // If recognition was not successful, tries all available languages until
1273 // it gets a successful result or runs out of languages. Keeps the best result.
1275  WordData* word_data) {
1276  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1278  // Best result so far.
1279  PointerVector<WERD_RES> best_words;
1280  // Points to the best result. May be word or in lang_words.
1281  WERD_RES* word = word_data->word;
1282  clock_t start_t = clock();
1284  tprintf("%s word with lang %s at:",
1285  word->done ? "Already done" : "Processing",
1286  most_recently_used_->lang.string());
1287  word->word->bounding_box().print();
1288  }
1289  if (word->done) {
1290  // If done on pass1, leave it as-is.
1291  if (!word->tess_failed)
1292  most_recently_used_ = word->tesseract;
1293  return;
1294  }
1295  int sub = sub_langs_.size();
1296  if (most_recently_used_ != this) {
1297  // Get the index of the most_recently_used_.
1298  for (sub = 0; sub < sub_langs_.size() &&
1299  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1300  }
1301  most_recently_used_->RetryWithLanguage(
1302  *word_data, recognizer, &word_data->lang_words[sub], &best_words);
1303  Tesseract* best_lang_tess = most_recently_used_;
1304  if (!WordsAcceptable(best_words)) {
1305  // Try all the other languages to see if they are any better.
1306  if (most_recently_used_ != this &&
1307  this->RetryWithLanguage(*word_data, recognizer,
1308  &word_data->lang_words[sub_langs_.size()],
1309  &best_words) > 0) {
1310  best_lang_tess = this;
1311  }
1312  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1313  ++i) {
1314  if (most_recently_used_ != sub_langs_[i] &&
1315  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
1316  &word_data->lang_words[i],
1317  &best_words) > 0) {
1318  best_lang_tess = sub_langs_[i];
1319  }
1320  }
1321  }
1322  most_recently_used_ = best_lang_tess;
1323  if (!best_words.empty()) {
1324  if (best_words.size() == 1 && !best_words[0]->combination) {
1325  // Move the best single result to the main word.
1326  word_data->word->ConsumeWordResults(best_words[0]);
1327  } else {
1328  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1329  word_data->word = best_words.back();
1330  pr_it->ReplaceCurrentWord(&best_words);
1331  }
1332  ASSERT_HOST(word_data->word->box_word != NULL);
1333  } else {
1334  tprintf("no best words!!\n");
1335  }
1336  clock_t ocr_t = clock();
1337  if (tessedit_timing_debug) {
1338  tprintf("%s (ocr took %.2f sec)\n",
1339  word->best_choice->unichar_string().string(),
1340  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1341  }
1342 }
1343 
1351  WERD_RES** in_word,
1352  PointerVector<WERD_RES>* out_words) {
1353  ROW* row = word_data.row;
1354  BLOCK* block = word_data.block;
1355  prev_word_best_choice_ = word_data.prev_word != NULL
1356  ? word_data.prev_word->word->best_choice : NULL;
1357 #ifndef NO_CUBE_BUILD
1358  // If we only intend to run cube - run it and return.
1360  cube_word_pass1(block, row, *in_word);
1361  return;
1362  }
1363 #endif
1364  WERD_RES* word = *in_word;
1365  match_word_pass_n(1, word, row, block);
1366  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1367  word->tess_would_adapt = AdaptableWord(word);
1368  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1369 
1370  if (adapt_ok) {
1371  // Send word to adaptive classifier for training.
1372  word->BestChoiceToCorrectText();
1373  LearnWord(NULL, word);
1374  // Mark misadaptions if running blamer.
1375  if (word->blamer_bundle != NULL) {
1378  }
1379  }
1380 
1381  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1383  }
1384 }
1385 
1386 // Helper to report the result of the xheight fix.
1387 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
1388  WERD_RES* word, WERD_RES* new_word) {
1389  tprintf("New XHT Match:%s = %s ",
1390  word->best_choice->unichar_string().string(),
1391  word->best_choice->debug_string().string());
1392  word->reject_map.print(debug_fp);
1393  tprintf(" -> %s = %s ",
1394  new_word->best_choice->unichar_string().string(),
1395  new_word->best_choice->debug_string().string());
1396  new_word->reject_map.print(debug_fp);
1397  tprintf(" %s->%s %s %s\n",
1398  word->guessed_x_ht ? "GUESS" : "CERT",
1399  new_word->guessed_x_ht ? "GUESS" : "CERT",
1400  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1401  accept_new_word ? "ACCEPTED" : "");
1402 }
1403 
1404 // Run the x-height fix-up, based on min/max top/bottom information in
1405 // unicharset.
1406 // Returns true if the word was changed.
1407 // See the comment in fixxht.cpp for a description of the overall process.
1408 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
1409  int original_misfits = CountMisfitTops(word);
1410  if (original_misfits == 0)
1411  return false;
1412  float baseline_shift = 0.0f;
1413  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1414  if (baseline_shift != 0.0f) {
1415  // Try the shift on its own first.
1416  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1417  word, block, row))
1418  return false;
1419  original_misfits = CountMisfitTops(word);
1420  if (original_misfits > 0) {
1421  float new_baseline_shift;
1422  // Now recompute the new x_height.
1423  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1424  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1425  // No test of return value here, as we are definitely making a change
1426  // to the word by shifting the baseline.
1427  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1428  word, block, row);
1429  }
1430  }
1431  return true;
1432  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1433  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1434  word, block, row);
1435  } else {
1436  return false;
1437  }
1438 }
1439 
1440 // Runs recognition with the test baseline shift and x-height and returns true
1441 // if there was an improvement in recognition result.
1442 bool Tesseract::TestNewNormalization(int original_misfits,
1443  float baseline_shift, float new_x_ht,
1444  WERD_RES *word, BLOCK* block, ROW *row) {
1445  bool accept_new_x_ht = false;
1446  WERD_RES new_x_ht_word(word->word);
1447  if (word->blamer_bundle != NULL) {
1448  new_x_ht_word.blamer_bundle = new BlamerBundle();
1449  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1450  }
1451  new_x_ht_word.x_height = new_x_ht;
1452  new_x_ht_word.baseline_shift = baseline_shift;
1453  new_x_ht_word.caps_height = 0.0;
1454  new_x_ht_word.SetupForRecognition(
1457  poly_allow_detailed_fx, row, block);
1458  match_word_pass_n(2, &new_x_ht_word, row, block);
1459  if (!new_x_ht_word.tess_failed) {
1460  int new_misfits = CountMisfitTops(&new_x_ht_word);
1461  if (debug_x_ht_level >= 1) {
1462  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1463  original_misfits, word->x_height,
1464  new_misfits, new_x_ht);
1465  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1466  word->best_choice->rating(), word->best_choice->certainty(),
1467  new_x_ht_word.best_choice->rating(),
1468  new_x_ht_word.best_choice->certainty());
1469  }
1470  // The misfits must improve and either the rating or certainty.
1471  accept_new_x_ht = new_misfits < original_misfits &&
1472  (new_x_ht_word.best_choice->certainty() >
1473  word->best_choice->certainty() ||
1474  new_x_ht_word.best_choice->rating() <
1475  word->best_choice->rating());
1476  if (debug_x_ht_level >= 1) {
1477  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1478  }
1479  }
1480  if (accept_new_x_ht) {
1481  word->ConsumeWordResults(&new_x_ht_word);
1482  return true;
1483  }
1484  return false;
1485 }
1486 
1494  WERD_RES** in_word,
1495  PointerVector<WERD_RES>* out_words) {
1496  // Return if we do not want to run Tesseract.
1499  word_data.word->best_choice != NULL)
1500  return;
1502  return;
1503  }
1504  ROW* row = word_data.row;
1505  BLOCK* block = word_data.block;
1506  WERD_RES* word = *in_word;
1507  prev_word_best_choice_ = word_data.prev_word != NULL
1508  ? word_data.prev_word->word->best_choice : NULL;
1509 
1511  check_debug_pt(word, 30);
1512  if (!word->done) {
1513  word->caps_height = 0.0;
1514  if (word->x_height == 0.0f)
1515  word->x_height = row->x_height();
1516  match_word_pass_n(2, word, row, block);
1517  check_debug_pt(word, 40);
1518  }
1519 
1520  SubAndSuperscriptFix(word);
1521 
1522  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1524  block->classify_rotation().y() == 0.0f) {
1525  // Use the tops and bottoms since they are available.
1526  TrainedXheightFix(word, block, row);
1527  }
1528 
1530  }
1531 #ifndef GRAPHICS_DISABLED
1533  if (fx_win == NULL)
1534  create_fx_win();
1535  clear_fx_win();
1536  word->rebuild_word->plot(fx_win);
1537  TBOX wbox = word->rebuild_word->bounding_box();
1538  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1539  wbox.right(), wbox.bottom());
1541  }
1542 #endif
1544  check_debug_pt(word, 50);
1545 }
1546 
1547 
1555  ROW *row, BLOCK* block) {
1556  if (word->tess_failed) return;
1557  tess_segment_pass_n(pass_n, word);
1558 
1559  if (!word->tess_failed) {
1560  if (!word->word->flag (W_REP_CHAR)) {
1561  word->fix_quotes();
1563  word->fix_hyphens();
1564  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1565  if (word->best_choice->length() != word->box_word->length()) {
1566  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1567  " #Blobs=%d\n",
1568  word->best_choice->debug_string().string(),
1569  word->best_choice->length(),
1570  word->box_word->length());
1571 
1572  }
1573  word->tess_accepted = tess_acceptable_word(word);
1574 
1575  // Also sets word->done flag
1576  make_reject_map(word, row, pass_n);
1577  }
1578  }
1579  set_word_fonts(word);
1580 
1581  ASSERT_HOST(word->raw_choice != NULL);
1582 }
1583 
1584 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
1585 // the given char_id, or NULL if none can be found.
1586 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
1587  WERD_RES* word_res) {
1588  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1589  BLOB_CHOICE* best_choice = NULL;
1590  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1591  BLOB_CHOICE* choice = FindMatchingChoice(char_id,
1592  word_res->GetBlobChoices(i));
1593  if (choice != NULL) {
1594  if (best_choice == NULL || choice->rating() < best_choice->rating())
1595  best_choice = choice;
1596  }
1597  }
1598  return best_choice;
1599 }
1600 
1601 // Helper to insert blob_choice in each location in the leader word if there is
1602 // no matching BLOB_CHOICE there already, and correct any incorrect results
1603 // in the best_choice.
1604 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
1605  WERD_RES* word_res) {
1606  WERD_CHOICE* word = word_res->best_choice;
1607  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1608  BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
1609  word_res->GetBlobChoices(i));
1610  if (choice == NULL) {
1611  BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1612  choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1613  }
1614  }
1615  // Correct any incorrect results in word.
1616  for (int i = 0; i < word->length(); ++i) {
1617  if (word->unichar_id(i) != blob_choice->unichar_id())
1618  word->set_unichar_id(blob_choice->unichar_id(), i);
1619  }
1620 }
1621 
1630  WERD_RES *word_res = page_res_it->word();
1631  const WERD_CHOICE &word = *(word_res->best_choice);
1632 
1633  // Find the frequency of each unique character in the word.
1634  SortHelper<UNICHAR_ID> rep_ch(word.length());
1635  for (int i = 0; i < word.length(); ++i) {
1636  rep_ch.Add(word.unichar_id(i), 1);
1637  }
1638 
1639  // Find the most frequent result.
1640  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1641  int max_count = rep_ch.MaxCount(&maxch_id);
1642  // Find the best exemplar of a classifier result for maxch_id.
1643  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1644  if (best_choice == NULL) {
1645  tprintf("Failed to find a choice for %s, occurring %d times\n",
1646  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1647  return;
1648  }
1649  word_res->done = TRUE;
1650 
1651  // Measure the mean space.
1652  int gap_count = 0;
1653  WERD* werd = word_res->word;
1654  C_BLOB_IT blob_it(werd->cblob_list());
1655  C_BLOB* prev_blob = blob_it.data();
1656  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1657  C_BLOB* blob = blob_it.data();
1658  int gap = blob->bounding_box().left();
1659  gap -= prev_blob->bounding_box().right();
1660  ++gap_count;
1661  prev_blob = blob;
1662  }
1663  // Just correct existing classification.
1664  CorrectRepcharChoices(best_choice, word_res);
1665  word_res->reject_map.initialise(word.length());
1666 }
1667 
1669  const UNICHARSET& char_set, const char *s, const char *lengths) {
1670  int i = 0;
1671  int offset = 0;
1672  int leading_punct_count;
1673  int upper_count = 0;
1674  int hyphen_pos = -1;
1676 
1677  if (strlen (lengths) > 20)
1678  return word_type;
1679 
1680  /* Single Leading punctuation char*/
1681 
1682  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1683  offset += lengths[i++];
1684  leading_punct_count = i;
1685 
1686  /* Initial cap */
1687  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1688  offset += lengths[i++];
1689  upper_count++;
1690  }
1691  if (upper_count > 1) {
1692  word_type = AC_UPPER_CASE;
1693  } else {
1694  /* Lower case word, possibly with an initial cap */
1695  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1696  offset += lengths[i++];
1697  }
1698  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1699  goto not_a_word;
1700  /*
1701  Allow a single hyphen in a lower case word
1702  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1703  */
1704  if (lengths[i] == 1 && s[offset] == '-') {
1705  hyphen_pos = i;
1706  offset += lengths[i++];
1707  if (s[offset] != '\0') {
1708  while ((s[offset] != '\0') &&
1709  char_set.get_islower(s + offset, lengths[i])) {
1710  offset += lengths[i++];
1711  }
1712  if (i < hyphen_pos + 3)
1713  goto not_a_word;
1714  }
1715  } else {
1716  /* Allow "'s" in NON hyphenated lower case words */
1717  if (lengths[i] == 1 && (s[offset] == '\'') &&
1718  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1719  offset += lengths[i++];
1720  offset += lengths[i++];
1721  }
1722  }
1723  if (upper_count > 0)
1724  word_type = AC_INITIAL_CAP;
1725  else
1726  word_type = AC_LOWER_CASE;
1727  }
1728 
1729  /* Up to two different, constrained trailing punctuation chars */
1730  if (lengths[i] == 1 && s[offset] != '\0' &&
1731  STRING(chs_trailing_punct1).contains(s[offset]))
1732  offset += lengths[i++];
1733  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1734  s[offset - lengths[i - 1]] != s[offset] &&
1735  STRING(chs_trailing_punct2).contains (s[offset]))
1736  offset += lengths[i++];
1737 
1738  if (s[offset] != '\0')
1739  word_type = AC_UNACCEPTABLE;
1740 
1741  not_a_word:
1742 
1743  if (word_type == AC_UNACCEPTABLE) {
1744  /* Look for abbreviation string */
1745  i = 0;
1746  offset = 0;
1747  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1748  word_type = AC_UC_ABBREV;
1749  while (s[offset] != '\0' &&
1750  char_set.get_isupper(s + offset, lengths[i]) &&
1751  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1752  offset += lengths[i++];
1753  offset += lengths[i++];
1754  }
1755  }
1756  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1757  word_type = AC_LC_ABBREV;
1758  while (s[offset] != '\0' &&
1759  char_set.get_islower(s + offset, lengths[i]) &&
1760  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1761  offset += lengths[i++];
1762  offset += lengths[i++];
1763  }
1764  }
1765  if (s[offset] != '\0')
1766  word_type = AC_UNACCEPTABLE;
1767  }
1768 
1769  return word_type;
1770 }
1771 
1773  BOOL8 show_map_detail = FALSE;
1774  inT16 i;
1775 
1776  if (!test_pt)
1777  return FALSE;
1778 
1779  tessedit_rejection_debug.set_value (FALSE);
1780  debug_x_ht_level.set_value(0);
1781 
1782  if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1783  if (location < 0)
1784  return TRUE; // For breakpoint use
1785  tessedit_rejection_debug.set_value (TRUE);
1786  debug_x_ht_level.set_value(2);
1787  tprintf ("\n\nTESTWD::");
1788  switch (location) {
1789  case 0:
1790  tprintf ("classify_word_pass1 start\n");
1791  word->word->print();
1792  break;
1793  case 10:
1794  tprintf ("make_reject_map: initial map");
1795  break;
1796  case 20:
1797  tprintf ("make_reject_map: after NN");
1798  break;
1799  case 30:
1800  tprintf ("classify_word_pass2 - START");
1801  break;
1802  case 40:
1803  tprintf ("classify_word_pass2 - Pre Xht");
1804  break;
1805  case 50:
1806  tprintf ("classify_word_pass2 - END");
1807  show_map_detail = TRUE;
1808  break;
1809  case 60:
1810  tprintf ("fixspace");
1811  break;
1812  case 70:
1813  tprintf ("MM pass START");
1814  break;
1815  case 80:
1816  tprintf ("MM pass END");
1817  break;
1818  case 90:
1819  tprintf ("After Poor quality rejection");
1820  break;
1821  case 100:
1822  tprintf ("unrej_good_quality_words - START");
1823  break;
1824  case 110:
1825  tprintf ("unrej_good_quality_words - END");
1826  break;
1827  case 120:
1828  tprintf ("Write results pass");
1829  show_map_detail = TRUE;
1830  break;
1831  }
1832  if (word->best_choice != NULL) {
1833  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1834  word->reject_map.print(debug_fp);
1835  tprintf("\n");
1836  if (show_map_detail) {
1837  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
1838  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1839  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1840  word->reject_map[i].full_print(debug_fp);
1841  }
1842  }
1843  } else {
1844  tprintf("null best choice\n");
1845  }
1846  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1847  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1848  return TRUE;
1849  } else {
1850  return FALSE;
1851  }
1852 }
1853 
1859 static void find_modal_font( //good chars in word
1860  STATS *fonts, //font stats
1861  inT16 *font_out, //output font
1862  inT8 *font_count //output count
1863  ) {
1864  inT16 font; //font index
1865  inT32 count; //pile couat
1866 
1867  if (fonts->get_total () > 0) {
1868  font = (inT16) fonts->mode ();
1869  *font_out = font;
1870  count = fonts->pile_count (font);
1871  *font_count = count < MAX_INT8 ? count : MAX_INT8;
1872  fonts->add (font, -*font_count);
1873  }
1874  else {
1875  *font_out = -1;
1876  *font_count = 0;
1877  }
1878 }
1879 
1886  // Don't try to set the word fonts for a cube word, as the configs
1887  // will be meaningless.
1888  if (word->chopped_word == NULL) return;
1889  ASSERT_HOST(word->best_choice != NULL);
1890 
1891  int fontinfo_size = get_fontinfo_table().size();
1892  if (fontinfo_size == 0) return;
1893  GenericVector<int> font_total_score;
1894  font_total_score.init_to_size(fontinfo_size, 0);
1895 
1896  word->italic = 0;
1897  word->bold = 0;
1898  // Compute the font scores for the word
1899  if (tessedit_debug_fonts) {
1900  tprintf("Examining fonts in %s\n",
1901  word->best_choice->debug_string().string());
1902  }
1903  for (int b = 0; b < word->best_choice->length(); ++b) {
1904  BLOB_CHOICE* choice = word->GetBlobChoice(b);
1905  if (choice == NULL) continue;
1906  const GenericVector<ScoredFont>& fonts = choice->fonts();
1907  for (int f = 0; f < fonts.size(); ++f) {
1908  int fontinfo_id = fonts[f].fontinfo_id;
1909  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1910  font_total_score[fontinfo_id] += fonts[f].score;
1911  }
1912  }
1913  }
1914  // Find the top and 2nd choice for the word.
1915  int score1 = 0, score2 = 0;
1916  inT16 font_id1 = -1, font_id2 = -1;
1917  for (int f = 0; f < fontinfo_size; ++f) {
1918  if (tessedit_debug_fonts && font_total_score[f] > 0) {
1919  tprintf("Font %s, total score = %d\n",
1920  fontinfo_table_.get(f).name, font_total_score[f]);
1921  }
1922  if (font_total_score[f] > score1) {
1923  score2 = score1;
1924  font_id2 = font_id1;
1925  score1 = font_total_score[f];
1926  font_id1 = f;
1927  } else if (font_total_score[f] > score2) {
1928  score2 = font_total_score[f];
1929  font_id2 = f;
1930  }
1931  }
1932  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
1933  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
1934  // Each score has a limit of MAX_UINT16, so divide by that to get the number
1935  // of "votes" for that font, ie number of perfect scores.
1936  word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8);
1937  word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
1938  if (score1 > 0) {
1939  FontInfo fi = fontinfo_table_.get(font_id1);
1940  if (tessedit_debug_fonts) {
1941  if (word->fontinfo_id2_count > 0) {
1942  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
1943  fi.name, word->fontinfo_id_count,
1944  fontinfo_table_.get(font_id2).name,
1945  word->fontinfo_id2_count);
1946  } else {
1947  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
1948  fi.name, word->fontinfo_id_count);
1949  }
1950  }
1951  word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
1952  word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
1953  }
1954 }
1955 
1956 
1964  PAGE_RES_IT page_res_it(page_res);
1965  WERD_RES *word; // current word
1966  STATS doc_fonts(0, font_table_size_); // font counters
1967 
1968  // Gather font id statistics.
1969  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1970  page_res_it.forward()) {
1971  word = page_res_it.word();
1972  if (word->fontinfo != NULL) {
1973  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
1974  }
1975  if (word->fontinfo2 != NULL) {
1976  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
1977  }
1978  }
1979  inT16 doc_font; // modal font
1980  inT8 doc_font_count; // modal font
1981  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1982  if (doc_font_count == 0)
1983  return;
1984  // Get the modal font pointer.
1985  const FontInfo* modal_font = NULL;
1986  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1987  page_res_it.forward()) {
1988  word = page_res_it.word();
1989  if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
1990  modal_font = word->fontinfo;
1991  break;
1992  }
1993  if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
1994  modal_font = word->fontinfo2;
1995  break;
1996  }
1997  }
1998  ASSERT_HOST(modal_font != NULL);
1999 
2000  // Assign modal font to weak words.
2001  for (page_res_it.restart_page(); page_res_it.word() != NULL;
2002  page_res_it.forward()) {
2003  word = page_res_it.word();
2004  int length = word->best_choice->length();
2005 
2006  int count = word->fontinfo_id_count;
2007  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2008  word->fontinfo = modal_font;
2009  // Counts only get 1 as it came from the doc.
2010  word->fontinfo_id_count = 1;
2011  word->italic = modal_font->is_italic() ? 1 : -1;
2012  word->bold = modal_font->is_bold() ? 1 : -1;
2013  }
2014  }
2015 }
2016 
2017 // If a word has multiple alternates check if the best choice is in the
2018 // dictionary. If not, replace it with an alternate that exists in the
2019 // dictionary.
2021  PAGE_RES_IT word_it(page_res);
2022  for (WERD_RES* word = word_it.word(); word != NULL;
2023  word = word_it.forward()) {
2024  if (word->best_choices.singleton())
2025  continue; // There are no alternates.
2026 
2027  WERD_CHOICE* best = word->best_choice;
2028  if (word->tesseract->getDict().valid_word(*best) != 0)
2029  continue; // The best choice is in the dictionary.
2030 
2031  WERD_CHOICE_IT choice_it(&word->best_choices);
2032  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2033  choice_it.forward()) {
2034  WERD_CHOICE* alternate = choice_it.data();
2035  if (word->tesseract->getDict().valid_word(*alternate)) {
2036  // The alternate choice is in the dictionary.
2037  if (tessedit_bigram_debug) {
2038  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2039  best->unichar_string().string(),
2040  alternate->unichar_string().string());
2041  }
2042  // Replace the 'best' choice with a better choice.
2043  word->ReplaceBestChoice(alternate);
2044  break;
2045  }
2046  }
2047  }
2048 }
2049 
2050 } // namespace tesseract
ROW * row
Definition: pageres.h:127
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:982
BLOCK * block
Definition: pageres.h:99
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
PROGRESS_FUNC progress_callback
returns true to cancel
Definition: ocrclass.h:126
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1885
void create_fx_win()
Definition: drawfx.cpp:60
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:79
int count(LIST var_list)
Definition: oldlist.cpp:103
BOOL8 tess_accepted
Definition: pageres.h:280
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:716
bool is_italic() const
Definition: fontinfo.h:113
const STRING & misadaption_debug() const
Definition: blamer.h:119
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:145
void rej_word_bad_quality()
Definition: rejctmap.cpp:488
void PrintBestChoices() const
Definition: pageres.cpp:709
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:125
inT32 mode() const
Definition: statistc.cpp:115
ALL but initial lc.
Definition: control.h:39
bool right_to_left() const
void print(FILE *fp)
Definition: rejctmap.cpp:394
#define TRUE
Definition: capi.h:45
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:644
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1493
short inT16
Definition: host.h:33
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1554
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1269
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:692
const double kMinRefitXHeightFraction
Definition: control.cpp:56
bool top_bottom_useful() const
Definition: unicharset.h:495
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:916
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:160
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1408
bool wordrec_run_blamer
Definition: wordrec.h:168
WERD_RES * word() const
Definition: pageres.h:736
Definition: werd.h:36
TWERD * rebuild_word
Definition: pageres.h:244
FILE * debug_fp
Definition: tessvars.cpp:24
STRING lang
Definition: ccutil.h:67
BOOL8 tess_would_adapt
Definition: pageres.h:281
const STRING & unichar_string() const
Definition: ratngs.h:525
static void Update()
Definition: scrollview.cpp:715
inT32 length() const
Definition: strngs.cpp:196
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void fix_hyphens()
Definition: pageres.cpp:1041
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:69
void clear_fx_win()
Definition: drawfx.cpp:73
void full_print(FILE *fp)
Definition: rejctmap.cpp:406
#define LOC_FUZZY_SPACE
Definition: errcode.h:50
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:119
void initialise(inT16 length)
Definition: rejctmap.cpp:318
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:791
#define MAX_INT8
Definition: host.h:51
const FontInfo * fontinfo2
Definition: pageres.h:289
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
inT32 length() const
Definition: rejctmap.h:236
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
bool IsAmbiguous()
Definition: pageres.cpp:443
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
#define MIN(x, y)
Definition: ndminx.h:28
BOOL8 guessed_x_ht
Definition: pageres.h:292
TBOX bounding_box() const
Definition: stepblob.cpp:250
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:91
float certainty() const
Definition: ratngs.h:328
bool SubAndSuperscriptFix(WERD_RES *word_res)
inT8 fontinfo_id2_count
Definition: pageres.h:291
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1629
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:115
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
float x_height
Definition: pageres.h:295
unsigned char uinT8
Definition: host.h:32
bool tessedit_enable_bigram_correction
ROW_RES * row() const
Definition: pageres.h:739
inT16 reject_count()
Definition: rejctmap.h:242
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
int length() const
Definition: boxword.h:85
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:81
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1668
T & back() const
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1196
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:392
uinT8 permuter() const
Definition: ratngs.h:344
WERD_RES * restart_page()
Definition: pageres.h:683
WERD_CHOICE * best_choice
Definition: pageres.h:219
unsigned char BOOL8
Definition: host.h:46
BOOL8 tess_failed
Definition: pageres.h:272
void run_cube_combiner(PAGE_RES *page_res)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:39
void SetScriptPositions()
Definition: pageres.cpp:853
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
int push_back(T object)
void rej_stat_word()
Definition: pageres.cpp:1674
void add_str_int(const char *str, int number)
Definition: strngs.cpp:384
inT16 progress
chars in this buffer(0)
Definition: ocrclass.h:118
BOOL8 part_of_combo
Definition: pageres.h:319
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:488
inT32 get_total() const
Definition: statistc.h:86
GenericVector< int > blame_reasons
Definition: pageres.h:68
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1350
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
void print()
Definition: werd.cpp:266
inT16 bottom() const
Definition: rect.h:61
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:325
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:66
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
tesseract::BoxWord * box_word
Definition: pageres.h:250
const char * string() const
Definition: strngs.cpp:201
bool x_overlap(const TBOX &box) const
Definition: rect.h:391
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
const FontInfo * fontinfo
Definition: pageres.h:288
inT32 rej_count
Definition: pageres.h:61
volatile inT8 ocr_alive
true if not last
Definition: ocrclass.h:123
void plot(ScrollView *window)
Definition: blobs.cpp:918
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
bool contains(const FCOORD pt) const
Definition: rect.h:323
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1322
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48
Definition: werd.h:60
static const double kXHeightCapRatio
Definition: ccstruct.h:37
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1111
float rating() const
Definition: ratngs.h:79
#define FALSE
Definition: capi.h:46
#define LOC_WRITE_RESULTS
Definition: errcode.h:54
Definition: werd.h:35
WERD_RES * forward()
Definition: pageres.h:716
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1035
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31
SIGNED char inT8
Definition: host.h:31
BlamerBundle * blamer_bundle
Definition: pageres.h:230
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:1963
bool deadline_exceeded() const
Definition: ocrclass.h:158
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:285
ALL lower case.
Definition: control.h:37
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:875
inT16 left() const
Definition: rect.h:68
TWERD * chopped_word
Definition: pageres.h:201
float y() const
Definition: points.h:212
TBOX bounding_box() const
Definition: blobs.cpp:881
float min_x_height() const
Definition: ratngs.h:334
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:45
ALL upper case.
Definition: control.h:38
PAGE_RES * page_res
Definition: pageres.h:661
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1483
Dict & getDict()
Definition: classify.h:65
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:448
void print() const
Definition: rect.h:270
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
int length() const
Definition: ratngs.h:301
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2020
void Add(T value, int count)
Definition: sorthelper.h:65
float rating() const
Definition: ratngs.h:325
bool AnyTessLang() const
a.b.c.
Definition: control.h:40
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:751
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:143
BLOCK_RES * block() const
Definition: pageres.h:742
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:453
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:168
float caps_height
Definition: pageres.h:296
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
#define MAX(x, y)
Definition: ndminx.h:24
PointerVector< WERD_RES > lang_words
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:574
int inT32
Definition: host.h:35
const char *const kBackUpConfigFile
Definition: control.cpp:53
void ResetWordIterator()
Definition: pageres.cpp:1533
ParamsVectors * params()
Definition: ccutil.h:63
void fix_quotes()
Definition: pageres.cpp:1012
#define tprintf(...)
Definition: tprintf.h:31
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:51
bool wordrec_debug_blamer
Definition: wordrec.h:167
inT8 italic
Definition: pageres.h:285
Definition: ocrrow.h:32
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68
#define MAX_INT32
Definition: host.h:53
Definition: strngs.h:44
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:204
double classify_max_certainty_margin
Definition: classify.h:404
bool small_caps
Definition: pageres.h:283
int size() const
Definition: genericvector.h:72
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
Definition: werd.cpp:530
Definition: points.h:189
Definition: ocrblock.h:30
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
const UNICHARSET * uch_set
Definition: pageres.h:192
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1387
BOOL8 contains(const char c) const
Definition: strngs.cpp:192
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113
Pix * BestPix() const
Unacceptable word.
Definition: control.h:36
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:293
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:548
inT16 top() const
Definition: rect.h:54
static bool TESS_API ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
inT8 bold
Definition: pageres.h:286
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
int length() const
Definition: genericvector.h:79
float max_x_height() const
Definition: ratngs.h:337
#define SUBLOC_NORM
Definition: errcode.h:59
tesseract::Tesseract * tesseract
Definition: pageres.h:266
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1238
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:187
FCOORD classify_rotation() const
Definition: ocrblock.h:144
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:36
#define LOC_MM_ADAPT
Definition: errcode.h:52
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)
#define MAX_UINT16
Definition: host.h:55
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1274
bool AdaptiveClassifierIsFull() const
Definition: classify.h:284
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:547
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:375
inT32 char_count
Definition: pageres.h:60
float baseline_shift
Definition: pageres.h:297
inT32 x_height() const
return xheight
Definition: ocrblock.h:110
bool classify_bln_numeric_mode
Definition: classify.h:500
bool script_has_xheight() const
Definition: unicharset.h:849
double classify_max_rating_ratio
Definition: classify.h:402
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
bool is_bold() const
Definition: fontinfo.h:114
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:101
float x_height() const
Definition: ocrrow.h:61
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:179
inT8 fontinfo_id_count
Definition: pageres.h:290
UNICHARSET unicharset
Definition: ccutil.h:70
WERD * word
Definition: pageres.h:175
void DeleteCurrentWord()
Definition: pageres.cpp:1450
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1772
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1442
BOOL8 done
Definition: pageres.h:282
const STRING debug_string() const
Definition: ratngs.h:503
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
bool empty() const
Definition: genericvector.h:84
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:596
void init_to_size(int size, T t)
Definition: statistc.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:84
A.B.C.
Definition: control.h:41
void BestChoiceToCorrectText()
Definition: pageres.cpp:917
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:628
TBOX bounding_box() const
Definition: werd.cpp:160
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:137
REJMAP reject_map
Definition: pageres.h:271
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:850
int UNICHAR_ID
Definition: unichar.h:33
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:763