tesseract  3.05.02
stopper.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: stopper.c
3  ** Purpose: Stopping criteria for word classifier.
4  ** Author: Dan Johnson
5  ** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 #include <stdio.h>
20 #include <string.h>
21 #include <ctype.h>
22 #include <math.h>
23 
24 #include "stopper.h"
25 #include "ambigs.h"
26 #include "ccutil.h"
27 #include "const.h"
28 #include "danerror.h"
29 #include "dict.h"
30 #include "efio.h"
31 #include "helpers.h"
32 #include "matchdefs.h"
33 #include "pageres.h"
34 #include "params.h"
35 #include "ratngs.h"
36 #include "scanutils.h"
37 #include "unichar.h"
38 
39 #ifdef _MSC_VER
40 #pragma warning(disable:4244) // Conversion warnings
41 #pragma warning(disable:4800) // int/bool warnings
42 #endif
43 
44 /*----------------------------------------------------------------------------
45  Private Code
46 ----------------------------------------------------------------------------*/
47 
48 namespace tesseract {
49 
50 bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
51  XHeightConsistencyEnum xheight_consistency) {
52  float CertaintyThreshold = stopper_nondict_certainty_base;
53  int WordSize;
54 
55  if (stopper_no_acceptable_choices) return false;
56 
57  if (best_choice.length() == 0) return false;
58 
59  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
60  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
61  bool is_case_ok = case_ok(best_choice, getUnicharset());
62 
63  if (stopper_debug_level >= 1) {
64  const char *xht = "UNKNOWN";
65  switch (xheight_consistency) {
66  case XH_GOOD: xht = "NORMAL"; break;
67  case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
68  case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
69  default: xht = "UNKNOWN";
70  }
71  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
72  best_choice.unichar_string().string(),
73  (is_valid_word ? 'y' : 'n'),
74  (is_case_ok ? 'y' : 'n'),
75  xht,
76  best_choice.min_x_height(),
77  best_choice.max_x_height());
78  }
79  // Do not accept invalid words in PASS1.
80  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
81  if (is_valid_word && is_case_ok) {
82  WordSize = LengthOfShortestAlphaRun(best_choice);
83  WordSize -= stopper_smallword_size;
84  if (WordSize < 0)
85  WordSize = 0;
86  CertaintyThreshold += WordSize * stopper_certainty_per_char;
87  }
88 
89  if (stopper_debug_level >= 1)
90  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
91  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
92 
93  if (no_dang_ambigs &&
94  best_choice.certainty() > CertaintyThreshold &&
95  xheight_consistency < XH_INCONSISTENT &&
96  UniformCertainties(best_choice)) {
97  return true;
98  } else {
99  if (stopper_debug_level >= 1) {
100  tprintf("AcceptableChoice() returned false"
101  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
102  no_dang_ambigs, best_choice.certainty(),
103  CertaintyThreshold,
104  UniformCertainties(best_choice));
105  }
106  return false;
107  }
108 }
109 
111  if (word->best_choice == NULL) return false;
112  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
113  int WordSize;
114 
115  if (stopper_debug_level >= 1) {
116  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
117  word->best_choice->debug_string().string(),
118  (valid_word(*word->best_choice) ? 'y' : 'n'),
119  (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
120  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
121  word->best_choices.singleton() ? 'n' : 'y');
122  }
123 
124  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
125  return false;
126  if (valid_word(*word->best_choice) &&
127  case_ok(*word->best_choice, getUnicharset())) {
128  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
129  WordSize -= stopper_smallword_size;
130  if (WordSize < 0)
131  WordSize = 0;
132  CertaintyThreshold += WordSize * stopper_certainty_per_char;
133  }
134 
135  if (stopper_debug_level >= 1)
136  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
137  word->best_choice->certainty(), CertaintyThreshold);
138 
139  if (word->best_choice->certainty() > CertaintyThreshold &&
141  if (stopper_debug_level >= 1)
142  tprintf("ACCEPTED\n");
143  return true;
144  } else {
145  if (stopper_debug_level >= 1)
146  tprintf("REJECTED\n");
147  return false;
148  }
149 }
150 
152  DANGERR *fixpt,
153  bool fix_replaceable,
154  MATRIX *ratings) {
155  if (stopper_debug_level > 2) {
156  tprintf("\nRunning NoDangerousAmbig() for %s\n",
157  best_choice->debug_string().string());
158  }
159 
160  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
161  // for each unichar id in BestChoice.
162  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
163  int i;
164  bool ambigs_found = false;
165  // For each position in best_choice:
166  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
167  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
168  // -- look for ambiguities corresponding to wrong_ngram in the list while
169  // adding the following unichar_ids from best_choice to wrong_ngram
170  //
171  // Repeat the above procedure twice: first time look through
172  // ambigs to be replaced and replace all the ambiguities found;
173  // second time look through dangerous ambiguities and construct
174  // ambig_blob_choices with a fake blob choice for each ambiguity
175  // and pass them to dawg_permute_and_select() to search for
176  // ambiguous words in the dictionaries.
177  //
178  // Note that during the execution of the for loop (on the first pass)
179  // if replacements are made the length of best_choice might change.
180  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
181  bool replace = (fix_replaceable && pass == 0);
182  const UnicharAmbigsVector &table = replace ?
184  if (!replace) {
185  // Initialize ambig_blob_choices with lists containing a single
186  // unichar id for the corresponding position in best_choice.
187  // best_choice consisting of only the original letters will
188  // have a rating of 0.0.
189  for (i = 0; i < best_choice->length(); ++i) {
190  BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
191  BLOB_CHOICE_IT lst_it(lst);
192  // TODO(rays/antonova) Put real xheights and y shifts here.
193  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
194  0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
195  ambig_blob_choices.push_back(lst);
196  }
197  }
198  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
199  int wrong_ngram_index;
200  int next_index;
201  int blob_index = 0;
202  for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
203  ++i) {
204  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
205  if (stopper_debug_level > 2) {
206  tprintf("Looking for %s ngrams starting with %s:\n",
207  replace ? "replaceable" : "ambiguous",
208  getUnicharset().debug_str(curr_unichar_id).string());
209  }
210  int num_wrong_blobs = best_choice->state(i);
211  wrong_ngram_index = 0;
212  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
213  if (curr_unichar_id == INVALID_UNICHAR_ID ||
214  curr_unichar_id >= table.size() ||
215  table[curr_unichar_id] == NULL) {
216  continue; // there is no ambig spec for this unichar id
217  }
218  AmbigSpec_IT spec_it(table[curr_unichar_id]);
219  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
220  const AmbigSpec *ambig_spec = spec_it.data();
221  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
222  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
223  ambig_spec->wrong_ngram);
224  if (stopper_debug_level > 2) {
225  tprintf("candidate ngram: ");
227  tprintf("current ngram from spec: ");
229  tprintf("comparison result: %d\n", compare);
230  }
231  if (compare == 0) {
232  // Record the place where we found an ambiguity.
233  if (fixpt != NULL) {
234  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
235  fixpt->push_back(DANGERR_INFO(
236  blob_index, blob_index + num_wrong_blobs, replace,
237  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
238  leftmost_id));
239  if (stopper_debug_level > 1) {
240  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
241  blob_index + num_wrong_blobs, false,
242  getUnicharset().get_isngram(
243  ambig_spec->correct_ngram_id),
244  getUnicharset().id_to_unichar(leftmost_id));
245  }
246  }
247 
248  if (replace) {
249  if (stopper_debug_level > 2) {
250  tprintf("replace ambiguity with %s : ",
251  getUnicharset().id_to_unichar(
252  ambig_spec->correct_ngram_id));
254  ambig_spec->correct_fragments, getUnicharset());
255  }
256  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
257  ambig_spec->correct_ngram_id,
258  best_choice, ratings);
259  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
260  // We found dang ambig - update ambig_blob_choices.
261  if (stopper_debug_level > 2) {
262  tprintf("found ambiguity: ");
264  ambig_spec->correct_fragments, getUnicharset());
265  }
266  ambigs_found = true;
267  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
268  ++tmp_index) {
269  // Add a blob choice for the corresponding fragment of the
270  // ambiguity. These fake blob choices are initialized with
271  // negative ratings (which are not possible for real blob
272  // choices), so that dawg_permute_and_select() considers any
273  // word not consisting of only the original letters a better
274  // choice and stops searching for alternatives once such a
275  // choice is found.
276  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
277  bc_it.add_to_end(new BLOB_CHOICE(
278  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
279  -1, 0, 1, 0, BCC_AMBIG));
280  }
281  }
282  spec_it.forward();
283  } else if (compare == -1) {
284  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
285  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
286  // Add the next unichar id to wrong_ngram and keep looking for
287  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
288  wrong_ngram[++wrong_ngram_index] =
289  best_choice->unichar_id(next_index);
290  num_wrong_blobs += best_choice->state(next_index);
291  } else {
292  break; // no more matching ambigs in this AMBIG_SPEC_LIST
293  }
294  } else {
295  spec_it.forward();
296  }
297  } // end searching AmbigSpec_LIST
298  } // end searching best_choice
299  } // end searching replace and dangerous ambigs
300 
301  // If any ambiguities were found permute the constructed ambig_blob_choices
302  // to see if an alternative dictionary word can be found.
303  if (ambigs_found) {
304  if (stopper_debug_level > 2) {
305  tprintf("\nResulting ambig_blob_choices:\n");
306  for (i = 0; i < ambig_blob_choices.length(); ++i) {
307  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
308  tprintf("\n");
309  }
310  }
311  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
312  ambigs_found = (alt_word->rating() < 0.0);
313  if (ambigs_found) {
314  if (stopper_debug_level >= 1) {
315  tprintf ("Stopper: Possible ambiguous word = %s\n",
316  alt_word->debug_string().string());
317  }
318  if (fixpt != NULL) {
319  // Note: Currently character choices combined from fragments can only
320  // be generated by NoDangerousAmbig(). This code should be updated if
321  // the capability to produce classifications combined from character
322  // fragments is added to other functions.
323  int orig_i = 0;
324  for (i = 0; i < alt_word->length(); ++i) {
325  const UNICHARSET &uchset = getUnicharset();
326  bool replacement_is_ngram =
327  uchset.get_isngram(alt_word->unichar_id(i));
328  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
329  if (replacement_is_ngram) {
330  // we have to extract the leftmost unichar from the ngram.
331  const char *str = uchset.id_to_unichar(leftmost_id);
332  int step = uchset.step(str);
333  if (step) leftmost_id = uchset.unichar_to_id(str, step);
334  }
335  int end_i = orig_i + alt_word->state(i);
336  if (alt_word->state(i) > 1 ||
337  (orig_i + 1 == end_i && replacement_is_ngram)) {
338  // Compute proper blob indices.
339  int blob_start = 0;
340  for (int j = 0; j < orig_i; ++j)
341  blob_start += best_choice->state(j);
342  int blob_end = blob_start;
343  for (int j = orig_i; j < end_i; ++j)
344  blob_end += best_choice->state(j);
345  fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
346  replacement_is_ngram, leftmost_id));
347  if (stopper_debug_level > 1) {
348  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
349  true, replacement_is_ngram,
350  uchset.id_to_unichar(leftmost_id));
351  }
352  }
353  orig_i += alt_word->state(i);
354  }
355  }
356  }
357  delete alt_word;
358  }
359  if (output_ambig_words_file_ != NULL) {
360  fprintf(output_ambig_words_file_, "\n");
361  }
362 
363  ambig_blob_choices.delete_data_pointers();
364  return !ambigs_found;
365 }
366 
368 
370  reject_offset_ = 0.0;
371 }
372 
375 }
376 
377 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
378  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
379  MATRIX *ratings) {
380  int num_blobs_to_replace = 0;
381  int begin_blob_index = 0;
382  int i;
383  // Rating and certainty for the new BLOB_CHOICE are derived from the
384  // replaced choices.
385  float new_rating = 0.0f;
386  float new_certainty = 0.0f;
387  BLOB_CHOICE* old_choice = NULL;
388  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
389  if (i >= wrong_ngram_begin_index) {
390  int num_blobs = werd_choice->state(i);
391  int col = begin_blob_index + num_blobs_to_replace;
392  int row = col + num_blobs - 1;
393  BLOB_CHOICE_LIST* choices = ratings->get(col, row);
394  ASSERT_HOST(choices != NULL);
395  old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
396  ASSERT_HOST(old_choice != NULL);
397  new_rating += old_choice->rating();
398  new_certainty += old_choice->certainty();
399  num_blobs_to_replace += num_blobs;
400  } else {
401  begin_blob_index += werd_choice->state(i);
402  }
403  }
404  new_certainty /= wrong_ngram_size;
405  // If there is no entry in the ratings matrix, add it.
406  MATRIX_COORD coord(begin_blob_index,
407  begin_blob_index + num_blobs_to_replace - 1);
408  if (!coord.Valid(*ratings)) {
409  ratings->IncreaseBandSize(coord.row - coord.col + 1);
410  }
411  if (ratings->get(coord.col, coord.row) == NULL)
412  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
413  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
414  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
415  if (choice != NULL) {
416  // Already there. Upgrade if new rating better.
417  if (new_rating < choice->rating())
418  choice->set_rating(new_rating);
419  if (new_certainty < choice->certainty())
420  choice->set_certainty(new_certainty);
421  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
422  } else {
423  // Need a new choice with the correct_ngram_id.
424  choice = new BLOB_CHOICE(*old_choice);
425  choice->set_unichar_id(correct_ngram_id);
426  choice->set_rating(new_rating);
427  choice->set_certainty(new_certainty);
428  choice->set_classifier(BCC_AMBIG);
429  choice->set_matrix_cell(coord.col, coord.row);
430  BLOB_CHOICE_IT it (new_choices);
431  it.add_to_end(choice);
432  }
433  // Remove current unichar from werd_choice. On the last iteration
434  // set the correct replacement unichar instead of removing a unichar.
435  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
436  ++replaced_count) {
437  if (replaced_count + 1 == wrong_ngram_size) {
438  werd_choice->set_blob_choice(wrong_ngram_begin_index,
439  num_blobs_to_replace, choice);
440  } else {
441  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
442  }
443  }
444  if (stopper_debug_level >= 1) {
445  werd_choice->print("ReplaceAmbig() ");
446  tprintf("Modified blob_choices: ");
447  print_ratings_list("\n", new_choices, getUnicharset());
448  }
449 }
450 
452  int shortest = MAX_INT32;
453  int curr_len = 0;
454  for (int w = 0; w < WordChoice.length(); ++w) {
455  if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
456  curr_len++;
457  } else if (curr_len > 0) {
458  if (curr_len < shortest) shortest = curr_len;
459  curr_len = 0;
460  }
461  }
462  if (curr_len > 0 && curr_len < shortest) {
463  shortest = curr_len;
464  } else if (shortest == MAX_INT32) {
465  shortest = 0;
466  }
467  return shortest;
468 }
469 
471  float Certainty;
472  float WorstCertainty = MAX_FLOAT32;
473  float CertaintyThreshold;
474  FLOAT64 TotalCertainty;
475  FLOAT64 TotalCertaintySquared;
476  FLOAT64 Variance;
477  FLOAT32 Mean, StdDev;
478  int word_length = word.length();
479 
480  if (word_length < 3)
481  return true;
482 
483  TotalCertainty = TotalCertaintySquared = 0.0;
484  for (int i = 0; i < word_length; ++i) {
485  Certainty = word.certainty(i);
486  TotalCertainty += Certainty;
487  TotalCertaintySquared += Certainty * Certainty;
488  if (Certainty < WorstCertainty)
489  WorstCertainty = Certainty;
490  }
491 
492  // Subtract off worst certainty from statistics.
493  word_length--;
494  TotalCertainty -= WorstCertainty;
495  TotalCertaintySquared -= WorstCertainty * WorstCertainty;
496 
497  Mean = TotalCertainty / word_length;
498  Variance = ((word_length * TotalCertaintySquared -
499  TotalCertainty * TotalCertainty) /
500  (word_length * (word_length - 1)));
501  if (Variance < 0.0)
502  Variance = 0.0;
503  StdDev = sqrt(Variance);
504 
505  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
506  if (CertaintyThreshold > stopper_nondict_certainty_base)
507  CertaintyThreshold = stopper_nondict_certainty_base;
508 
509  if (word.certainty() < CertaintyThreshold) {
510  if (stopper_debug_level >= 1)
511  tprintf("Stopper: Non-uniform certainty = %4.1f"
512  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
513  word.certainty(), Mean, StdDev, CertaintyThreshold);
514  return false;
515  } else {
516  return true;
517  }
518 }
519 
520 } // namespace tesseract
void remove_unichar_id(int index)
Definition: ratngs.h:482
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:62
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:369
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:160
T & get(int index) const
double stopper_allowable_character_badness
Definition: dict.h:619
const STRING & unichar_string() const
Definition: ratngs.h:525
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
AmbigType type
Definition: ambigs.h:136
void print() const
Definition: ratngs.h:564
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:174
int state(int index) const
Definition: ratngs.h:317
void set_rating(float newrat)
Definition: ratngs.h:147
XHeightConsistencyEnum
Definition: dict.h:74
int stopper_debug_level
Definition: dict.h:620
T get(ICOORD pos) const
Definition: matrix.h:228
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:133
float certainty() const
Definition: ratngs.h:328
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:373
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
Definition: matrix.h:572
void EndDangerousAmbigs()
Definition: stopper.cpp:367
uinT8 permuter() const
Definition: ratngs.h:344
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:613
WERD_CHOICE * best_choice
Definition: pageres.h:219
bool Valid(const MATRIX &m) const
Definition: matrix.h:610
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:98
int push_back(T object)
#define MAX_FLOAT32
Definition: host.h:57
bool AcceptableResult(WERD_RES *word)
Definition: stopper.cpp:110
const char * string() const
Definition: strngs.cpp:201
bool dangerous_ambig_found() const
Definition: ratngs.h:361
double stopper_certainty_per_char
Definition: dict.h:617
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:154
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
float rating() const
Definition: ratngs.h:79
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:135
void set_certainty(float newrat)
Definition: ratngs.h:150
float FLOAT32
Definition: host.h:44
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103
double FLOAT64
Definition: host.h:45
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
float min_x_height() const
Definition: ratngs.h:334
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:50
void delete_data_pointers()
int length() const
Definition: ratngs.h:301
float rating() const
Definition: ratngs.h:325
bool stopper_no_acceptable_choices
Definition: dict.h:623
void set_matrix_cell(int col, int row)
Definition: ratngs.h:156
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:453
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:470
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
#define tprintf(...)
Definition: tprintf.h:31
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:730
#define MAX_INT32
Definition: host.h:53
int size() const
Definition: genericvector.h:72
float certainty() const
Definition: ratngs.h:82
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:155
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:484
int length() const
Definition: genericvector.h:79
float max_x_height() const
Definition: ratngs.h:337
double stopper_nondict_certainty_base
Definition: dict.h:611
bool TESS_API NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:151
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:166
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:451
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:134
const STRING debug_string() const
Definition: ratngs.h:503
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:377
int stopper_smallword_size
Definition: dict.h:615
#define ASSERT_HOST(x)
Definition: errcode.h:84
int step(const char *str) const
Definition: unicharset.cpp:211
int UNICHAR_ID
Definition: unichar.h:33