40 #pragma warning(disable:4244) // Conversion warnings 41 #pragma warning(disable:4800) // int/bool warnings 57 if (best_choice.
length() == 0)
return false;
64 const char *xht =
"UNKNOWN";
65 switch (xheight_consistency) {
66 case XH_GOOD: xht =
"NORMAL";
break;
69 default: xht =
"UNKNOWN";
71 tprintf(
"\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
73 (is_valid_word ?
'y' :
'n'),
74 (is_case_ok ?
'y' :
'n'),
80 if (reject_offset_ <= 0.0f && !is_valid_word)
return false;
81 if (is_valid_word && is_case_ok) {
90 tprintf(
"Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
94 best_choice.
certainty() > CertaintyThreshold &&
100 tprintf(
"AcceptableChoice() returned false" 101 " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
116 tprintf(
"\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
136 tprintf(
"Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
153 bool fix_replaceable,
156 tprintf(
"\nRunning NoDangerousAmbig() for %s\n",
164 bool ambigs_found =
false;
180 for (
int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
181 bool replace = (fix_replaceable && pass == 0);
189 for (i = 0; i < best_choice->
length(); ++i) {
190 BLOB_CHOICE_LIST *lst =
new BLOB_CHOICE_LIST();
191 BLOB_CHOICE_IT lst_it(lst);
199 int wrong_ngram_index;
202 for (i = 0; i < best_choice->
length(); blob_index += best_choice->
state(i),
206 tprintf(
"Looking for %s ngrams starting with %s:\n",
207 replace ?
"replaceable" :
"ambiguous",
210 int num_wrong_blobs = best_choice->
state(i);
211 wrong_ngram_index = 0;
212 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
213 if (curr_unichar_id == INVALID_UNICHAR_ID ||
214 curr_unichar_id >= table.
size() ||
215 table[curr_unichar_id] == NULL) {
218 AmbigSpec_IT spec_it(table[curr_unichar_id]);
219 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
220 const AmbigSpec *ambig_spec = spec_it.data();
221 wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
227 tprintf(
"current ngram from spec: ");
229 tprintf(
"comparison result: %d\n", compare);
236 blob_index, blob_index + num_wrong_blobs, replace,
240 tprintf(
"fixpt+=(%d %d %d %d %s)\n", blob_index,
241 blob_index + num_wrong_blobs,
false,
250 tprintf(
"replace ambiguity with %s : ",
258 best_choice, ratings);
267 for (
int tmp_index = 0; tmp_index <= wrong_ngram_index;
276 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
283 }
else if (compare == -1) {
285 ((next_index = wrong_ngram_index+1+i) < best_choice->
length())) {
288 wrong_ngram[++wrong_ngram_index] =
290 num_wrong_blobs += best_choice->
state(next_index);
305 tprintf(
"\nResulting ambig_blob_choices:\n");
306 for (i = 0; i < ambig_blob_choices.
length(); ++i) {
312 ambigs_found = (alt_word->
rating() < 0.0);
315 tprintf (
"Stopper: Possible ambiguous word = %s\n",
324 for (i = 0; i < alt_word->
length(); ++i) {
326 bool replacement_is_ngram =
329 if (replacement_is_ngram) {
332 int step = uchset.
step(str);
335 int end_i = orig_i + alt_word->
state(i);
336 if (alt_word->
state(i) > 1 ||
337 (orig_i + 1 == end_i && replacement_is_ngram)) {
340 for (
int j = 0; j < orig_i; ++j)
341 blob_start += best_choice->
state(j);
342 int blob_end = blob_start;
343 for (
int j = orig_i; j < end_i; ++j)
344 blob_end += best_choice->
state(j);
346 replacement_is_ngram, leftmost_id));
348 tprintf(
"fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
349 true, replacement_is_ngram,
353 orig_i += alt_word->
state(i);
359 if (output_ambig_words_file_ != NULL) {
360 fprintf(output_ambig_words_file_,
"\n");
364 return !ambigs_found;
370 reject_offset_ = 0.0;
380 int num_blobs_to_replace = 0;
381 int begin_blob_index = 0;
385 float new_rating = 0.0f;
386 float new_certainty = 0.0f;
388 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
389 if (i >= wrong_ngram_begin_index) {
390 int num_blobs = werd_choice->
state(i);
391 int col = begin_blob_index + num_blobs_to_replace;
392 int row = col + num_blobs - 1;
393 BLOB_CHOICE_LIST* choices = ratings->
get(col, row);
397 new_rating += old_choice->
rating();
398 new_certainty += old_choice->
certainty();
399 num_blobs_to_replace += num_blobs;
401 begin_blob_index += werd_choice->
state(i);
404 new_certainty /= wrong_ngram_size;
407 begin_blob_index + num_blobs_to_replace - 1);
408 if (!coord.
Valid(*ratings)) {
411 if (ratings->
get(coord.
col, coord.
row) == NULL)
412 ratings->
put(coord.
col, coord.
row,
new BLOB_CHOICE_LIST);
413 BLOB_CHOICE_LIST* new_choices = ratings->
get(coord.
col, coord.
row);
415 if (choice != NULL) {
417 if (new_rating < choice->rating())
419 if (new_certainty < choice->certainty())
430 BLOB_CHOICE_IT it (new_choices);
431 it.add_to_end(choice);
435 for (
int replaced_count = 0; replaced_count < wrong_ngram_size;
437 if (replaced_count + 1 == wrong_ngram_size) {
439 num_blobs_to_replace, choice);
445 werd_choice->
print(
"ReplaceAmbig() ");
446 tprintf(
"Modified blob_choices: ");
454 for (
int w = 0; w < WordChoice.
length(); ++w) {
457 }
else if (curr_len > 0) {
458 if (curr_len < shortest) shortest = curr_len;
462 if (curr_len > 0 && curr_len < shortest) {
473 float CertaintyThreshold;
478 int word_length = word.
length();
483 TotalCertainty = TotalCertaintySquared = 0.0;
484 for (
int i = 0; i < word_length; ++i) {
486 TotalCertainty += Certainty;
487 TotalCertaintySquared += Certainty * Certainty;
488 if (Certainty < WorstCertainty)
489 WorstCertainty = Certainty;
494 TotalCertainty -= WorstCertainty;
495 TotalCertaintySquared -= WorstCertainty * WorstCertainty;
497 Mean = TotalCertainty / word_length;
498 Variance = ((word_length * TotalCertaintySquared -
499 TotalCertainty * TotalCertainty) /
500 (word_length * (word_length - 1)));
503 StdDev = sqrt(Variance);
509 if (word.
certainty() < CertaintyThreshold) {
511 tprintf(
"Stopper: Non-uniform certainty = %4.1f" 512 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
void remove_unichar_id(int index)
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
void IncreaseBandSize(int bandwidth)
bool get_isalpha(UNICHAR_ID unichar_id) const
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
double stopper_allowable_character_badness
const STRING & unichar_string() const
UNICHAR_ID unichar_id(int index) const
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
int state(int index) const
void set_rating(float newrat)
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
const UNICHARSET & getUnicharset() const
void EndDangerousAmbigs()
double stopper_phase2_certainty_rejection_offset
WERD_CHOICE * best_choice
bool Valid(const MATRIX &m) const
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
bool AcceptableResult(WERD_RES *word)
const char * string() const
bool dangerous_ambig_found() const
double stopper_certainty_per_char
const UnicharAmbigsVector & dang_ambigs() const
void put(ICOORD pos, const T &thing)
UNICHAR_ID correct_ngram_id
void set_certainty(float newrat)
const UnicharAmbigs & getUnicharAmbigs() const
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
float min_x_height() const
void set_unichar_id(UNICHAR_ID newunichar_id)
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
void delete_data_pointers()
bool stopper_no_acceptable_choices
void set_matrix_cell(int col, int row)
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
WERD_CHOICE_LIST best_choices
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
int UniformCertainties(const WERD_CHOICE &word)
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
const UnicharAmbigsVector & replace_ambigs() const
bool get_isngram(UNICHAR_ID unichar_id) const
float max_x_height() const
double stopper_nondict_certainty_base
bool TESS_API NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
void set_classifier(BlobChoiceClassifier classifier)
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
const char * id_to_unichar(UNICHAR_ID id) const
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
const STRING debug_string() const
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
int stopper_smallword_size
int step(const char *str) const