tesseract  3.05.02
reject.cpp File Reference
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

◆ CLISTIZEH()

CLISTIZEH ( STRING  )

Definition at line 48 of file reject.cpp.

56  {
57 void Tesseract::set_done(WERD_RES *word, inT16 pass) {
58  word->done = word->tess_accepted &&
59  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
60  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
61  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
62  word->best_choice->permuter() == FREQ_DAWG_PERM ||
64  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
65  one_ell_conflict(word, FALSE)) {
66  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
67  word->done = FALSE;
68  }
69  if (word->done && ((!word_from_dict &&
70  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
71  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
72  word->done = FALSE;
73  }
74  if (tessedit_rejection_debug) {
75  tprintf("set_done(): done=%d\n", word->done);
76  word->best_choice->print("");
77  }
78 }
79 
80 
81 /*************************************************************************
82  * make_reject_map()
83  *
84  * Sets the done flag to indicate whether the resylt is acceptable.
85  *
86  * Sets a reject map for the word.
87  *************************************************************************/
88 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
89  int i;
90  int offset;
91 
92  flip_0O(word);
93  check_debug_pt(word, -1); // For trap only
94  set_done(word, pass); // Set acceptance
96  reject_blanks(word);
97  /*
98  0: Rays original heuristic - the baseline
99  */
100  if (tessedit_reject_mode == 0) {
101  if (!word->done)
102  reject_poor_matches(word);
103  } else if (tessedit_reject_mode == 5) {
104  /*
105  5: Reject I/1/l from words where there is no strong contextual confirmation;
106  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
107  and the whole of any words which are very small
108  */
109  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
111  } else {
112  one_ell_conflict(word, TRUE);
113  /*
114  Originally the code here just used the done flag. Now I have duplicated
115  and unpacked the conditions for setting the done flag so that each
116  mechanism can be turned on or off independently. This works WITHOUT
117  affecting the done flag setting.
118  */
119  if (rej_use_tess_accepted && !word->tess_accepted)
121 
122  if (rej_use_tess_blanks &&
123  (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
125 
126  WERD_CHOICE* best_choice = word->best_choice;
127  if (rej_use_good_perm) {
128  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
129  best_choice->permuter() == FREQ_DAWG_PERM ||
130  best_choice->permuter() == USER_DAWG_PERM) &&
131  (!rej_use_sensible_wd ||
132  acceptable_word_string(*word->uch_set,
133  best_choice->unichar_string().string(),
134  best_choice->unichar_lengths().string()) !=
135  AC_UNACCEPTABLE)) {
136  // PASSED TEST
137  } else if (best_choice->permuter() == NUMBER_PERM) {
138  if (rej_alphas_in_number_perm) {
139  for (i = 0, offset = 0;
140  best_choice->unichar_string()[offset] != '\0';
141  offset += best_choice->unichar_lengths()[i++]) {
142  if (word->reject_map[i].accepted() &&
143  word->uch_set->get_isalpha(
144  best_choice->unichar_string().string() + offset,
145  best_choice->unichar_lengths()[i]))
146  word->reject_map[i].setrej_bad_permuter();
147  // rej alpha
148  }
149  }
150  } else {
152  }
153  }
154  /* Ambig word rejection was here once !!*/
155  }
156  } else {
157  tprintf("BAD tessedit_reject_mode\n");
158  err_exit();
159  }
160 
161  if (tessedit_image_border > -1)
162  reject_edge_blobs(word);
163 
164  check_debug_pt (word, 10);
165  if (tessedit_rejection_debug) {
166  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
167  tprintf("Certainty: %f Rating: %f\n",
168  word->best_choice->certainty (), word->best_choice->rating ());
169  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
170  }
171 
172  flip_hyphens(word);
173  check_debug_pt(word, 20);
174 }
175 } // namespace tesseract
BOOL8 tess_accepted
Definition: pageres.h:280
#define TRUE
Definition: capi.h:45
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
short inT16
Definition: host.h:33
void err_exit()
Definition: globaloc.cpp:74
const STRING & unichar_string() const
Definition: ratngs.h:525
inT32 length() const
Definition: strngs.cpp:196
void print() const
Definition: ratngs.h:564
void flip_0O(WERD_RES *word)
void initialise(inT16 length)
Definition: rejctmap.cpp:318
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:434
float y_scale() const
Definition: normalis.h:272
float certainty() const
Definition: ratngs.h:328
uinT8 permuter() const
Definition: ratngs.h:344
WERD_CHOICE * best_choice
Definition: pageres.h:219
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207
void flip_hyphens(WERD_RES *word)
const char * string() const
Definition: strngs.cpp:201
bool dangerous_ambig_found() const
Definition: ratngs.h:361
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178
#define FALSE
Definition: capi.h:46
const int kBlnXHeight
Definition: normalis.h:28
void rej_word_small_xht()
Definition: rejctmap.cpp:416
float rating() const
Definition: ratngs.h:325
void rej_word_bad_permuter()
Definition: rejctmap.cpp:452
#define tprintf(...)
Definition: tprintf.h:31
ROW
Definition: ocrrow.h:32
const UNICHARSET * uch_set
Definition: pageres.h:192
AC_UNACCEPTABLE: Unacceptable word.
Definition: control.h:36
void rej_word_contains_blanks()
Definition: rejctmap.cpp:443
const STRING & unichar_lengths() const
Definition: ratngs.h:532
BOOL8 done
Definition: pageres.h:282
DENORM denorm
Definition: pageres.h:190
REJMAP reject_map
Definition: pageres.h:271

◆ compute_reject_threshold()

float compute_reject_threshold ( WERD_CHOICE * word )

Definition at line 226 of file reject.cpp.

226  {
227  float threshold; // rejection threshold
228  float bestgap = 0.0f; // biggest gap
229  float gapstart; // bottom of gap
230  // super iterator
231  BLOB_CHOICE_IT choice_it; // real iterator
232 
233  int blob_count = word->length();
234  GenericVector<float> ratings;
235  ratings.resize_no_init(blob_count);
236  for (int i = 0; i < blob_count; ++i) {
237  ratings[i] = word->certainty(i);
238  }
239  ratings.sort();
240  gapstart = ratings[0] - 1; // all reject if none better
241  if (blob_count >= 3) {
242  for (int index = 0; index < blob_count - 1; index++) {
243  if (ratings[index + 1] - ratings[index] > bestgap) {
244  bestgap = ratings[index + 1] - ratings[index];
245  // find biggest
246  gapstart = ratings[index];
247  }
248  }
249  }
250  threshold = gapstart + bestgap / 2;
251 
252  return threshold;
253 }
void resize_no_init(int size)
Definition: genericvector.h:66
float certainty() const
Definition: ratngs.h:328
int length() const
Definition: ratngs.h:301

◆ reject_blanks()

void reject_blanks ( WERD_RES * word )

Definition at line 178 of file reject.cpp.

178  {
179  inT16 i;
180  inT16 offset;
181 
182  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
183  offset += word->best_choice->unichar_lengths()[i], i += 1) {
184  if (word->best_choice->unichar_string()[offset] == ' ')
185  //rej unrecognised blobs
186  word->reject_map[i].setrej_tess_failure ();
187  }
188 }
short inT16
Definition: host.h:33
const STRING & unichar_string() const
Definition: ratngs.h:525
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:532
REJMAP reject_map
Definition: pageres.h:271

◆ reject_poor_matches()

void reject_poor_matches ( WERD_RES * word )

Definition at line 207 of file reject.cpp.

207  {
208  float threshold = compute_reject_threshold(word->best_choice);
209  for (int i = 0; i < word->best_choice->length(); ++i) {
210  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
211  word->reject_map[i].setrej_tess_failure();
212  else if (word->best_choice->certainty(i) < threshold)
213  word->reject_map[i].setrej_poor_match();
214  }
215 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
float certainty() const
Definition: ratngs.h:328
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:226
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
REJMAP reject_map
Definition: pageres.h:271