tesseract  3.05.02
tesseractclass.h
Go to the documentation of this file.
1 // File: tesseractclass.h
3 // Description: The Tesseract class. It holds/owns everything needed
4 // to run Tesseract on a single language, and also a set of
5 // sub-Tesseracts to run sub-languages. For thread safety, *every*
6 // global variable goes in here, directly, or indirectly.
7 // This makes it safe to run multiple Tesseracts in different
8 // threads in parallel, and keeps the different language
9 // instances separate.
10 // Author: Ray Smith
11 // Created: Fri Mar 07 08:17:01 PST 2008
12 //
13 // (C) Copyright 2008, Google Inc.
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 // http://www.apache.org/licenses/LICENSE-2.0
18 // Unless required by applicable law or agreed to in writing, software
19 // distributed under the License is distributed on an "AS IS" BASIS,
20 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 // See the License for the specific language governing permissions and
22 // limitations under the License.
23 //
25 
26 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
27 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__
28 
29 #include "allheaders.h"
30 #include "control.h"
31 #include "docqual.h"
32 #include "devanagari_processing.h"
33 #include "genericvector.h"
34 #include "params.h"
35 #include "ocrclass.h"
36 #include "textord.h"
37 #include "wordrec.h"
38 
39 class BLOB_CHOICE_LIST_CLIST;
40 class BLOCK_LIST;
41 struct OSResults;
42 class PAGE_RES;
43 class PAGE_RES_IT;
44 struct Pix;
45 class ROW;
46 class SVMenuNode;
47 class TBOX;
48 class TO_BLOCK_LIST;
49 class WERD;
50 class WERD_CHOICE;
51 class WERD_RES;
52 
53 
54 // Top-level class for all tesseract global instance data.
55 // This class either holds or points to all data used by an instance
56 // of Tesseract, including the memory allocator. When this is
57 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
58 //
59 // NOTE to developers: Do not create cyclic dependencies through this class!
60 // The directory dependency tree must remain a tree! To keep this clean,
61 // lower-level code (eg in ccutil, the bottom level) must never need to
62 // know about the content of a higher-level directory.
63 // The following scheme will grant the easiest access to lower-level
64 // global members without creating a cyclic dependency:
65 //
66 // Class Hierarchy (^ = inheritance):
67 //
68 // CCUtil (ccutil/ccutil.h)
69 // ^ Members include: UNICHARSET
70 // CUtil (cutil/cutil_class.h)
71 // ^ Members include: TBLOB*, TEXTBLOCK*
72 // CCStruct (ccstruct/ccstruct.h)
73 // ^ Members include: Image
74 // Classify (classify/classify.h)
75 // ^ Members include: Dict
76 // WordRec (wordrec/wordrec.h)
77 // ^ Members include: WERD*, DENORM*
78 // Tesseract (ccmain/tesseractclass.h)
79 // Members include: Pix*, CubeRecoContext*,
80 // TesseractCubeCombiner*
81 //
82 // Other important classes:
83 //
84 // TessBaseAPI (api/baseapi.h)
85 // Members include: BLOCK_LIST*, PAGE_RES*,
86 // Tesseract*, ImageThresholder*
87 // Dict (dict/dict.h)
88 // Members include: Image* (private)
89 //
90 // NOTE: that each level contains members that correspond to global
91 // data that is defined (and used) at that level, not necessarily where
92 // the type is defined so for instance:
93 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
94 // goes inside the Textord class, not the cc_util class.
95 
96 namespace tesseract {
97 
98 class ColumnFinder;
99 #ifndef NO_CUBE_BUILD
100 class CharSamp;
101 class CubeLineObject;
102 class CubeObject;
103 class CubeRecoContext;
104 #endif
105 class EquationDetect;
106 class Tesseract;
107 #ifndef NO_CUBE_BUILD
108 class TesseractCubeCombiner;
109 #endif
110 
111 // A collection of various variables for statistics and debugging.
115  doc_blob_quality(0),
116  doc_outline_errs(0),
117  doc_char_quality(0),
118  good_char_count(0),
120  word_count(0),
121  dict_words(0),
122  tilde_crunch_written(false),
123  last_char_was_newline(true),
124  last_char_was_tilde(false),
126 
133  inT32 word_count; // count of words in the document
134  inT32 dict_words; // number of dictionary words in the document
135  STRING dump_words_str; // accumulator used by dump_words()
136  // Flags used by write_results()
141 };
142 
143 // Struct to hold all the pointers to relevant data for processing a word.
144 struct WordData {
145  WordData() : word(NULL), row(NULL), block(NULL), prev_word(NULL) {}  // Empty context: word, row, block and prev_word all start as NULL.
146  explicit WordData(const PAGE_RES_IT& page_res_it)  // Captures the word, row and block the iterator currently refers to.
147  : word(page_res_it.word()), row(page_res_it.row()->row),
148  block(page_res_it.block()->block), prev_word(NULL) {}  // prev_word is left NULL.
149  WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res)  // Builds the context from an explicitly supplied block, row and word.
150  : word(word_res), row(row_in), block(block_in), prev_word(NULL) {}  // prev_word is left NULL here too.
151 
157 };
158 
159 // Definition of a Tesseract WordRecognizer. The WordData provides the context
160 // of row/block. in_word holds an initialized, possibly pre-classified word
161 // that the recognizer may or may not consume (if it does, it sets *in_word=NULL).
162 // The recognizer produces one or more output words in out_words, which may be
163 // the consumed in_word, or may be generated independently.
164 // This API supports either a conventional tesseract classifier, or a
165 // line-level classifier that generates multiple words from a merged input.
166 typedef void (Tesseract::*WordRecognizer)(const WordData& word_data,  // Pointer-to-member-function of Tesseract.
167  WERD_RES** in_word,  // In/out: see the note on *in_word above.
168  PointerVector<WERD_RES>* out_words);  // Out: receives the recognized word(s).
169 
170 class Tesseract : public Wordrec {
171  public:
172  Tesseract();
173  ~Tesseract();
174 
175  // Clear as much used memory as possible without resetting the adaptive
176  // classifier or losing any other classifier data.
177  void Clear();
178  // Clear all memory of adaption for this and all subclassifiers.
180  // Clear the document dictionary for this and all subclassifiers.
182 
183  // Set the equation detector.
184  void SetEquationDetect(EquationDetect* detector);
185 
186  // Simple accessors.
187  const FCOORD& reskew() const {  // Read-only reference to the reskew_ member.
188  return reskew_;
189  }
190  // Destroy any existing pix and return a pointer to the pointer.
192  pixDestroy(&pix_binary_);
193  return &pix_binary_;
194  }
195  Pix* pix_binary() const {  // Returns the stored pix_binary_ pointer as-is; nothing is copied or cloned here.
196  return pix_binary_;
197  }
198  Pix* pix_grey() const {  // Returns the stored pix_grey_ pointer as-is; nothing is copied or cloned here.
199  return pix_grey_;
200  }
201  void set_pix_grey(Pix* grey_pix) {  // Replaces the held grey image with grey_pix.
202  pixDestroy(&pix_grey_);  // Release the previously held pix before overwriting the pointer.
203  pix_grey_ = grey_pix;  // NOTE(review): presumably takes ownership, as set_pix_original documents -- confirm.
204  }
205  Pix* pix_original() const { return pix_original_; }  // Returns the stored pix_original_ pointer as-is.
206  // Takes ownership of the given original_pix, replacing the one held so far.
207  void set_pix_original(Pix* original_pix) {
208  pixDestroy(&pix_original_);  // Release the previously held pix before overwriting the pointer.
209  pix_original_ = original_pix;
210  }
211  // Returns a pointer to a Pix representing the best available (original) image
212  // of the page. Can be of any bit depth, but never color-mapped, as that has
213  // always been dealt with. Note that in grey and color, 0 is black and 255 is
214  // white. If the input was binary, then black is 1 and white is 0.
215  // To tell the difference pixGetDepth() will return 32, 8 or 1.
216  // In any case, the return value is a borrowed Pix, and should not be
217  // deleted or pixDestroyed.
218  Pix* BestPix() const { return pix_original_; }  // As written, this is always the pix_original_ member.
219  void set_pix_thresholds(Pix* thresholds) {  // Replaces the held thresholds image with the given one.
220  pixDestroy(&pix_thresholds_);  // Release the previously held pix before overwriting the pointer.
221  pix_thresholds_ = thresholds;  // NOTE(review): presumably takes ownership, as set_pix_original documents -- confirm.
222  }
223  int source_resolution() const {  // Returns the stored source_resolution_ value.
224  return source_resolution_;
225  }
226  void set_source_resolution(int ppi) {  // Stores ppi as source_resolution_ (the name suggests pixels per inch -- confirm).
227  source_resolution_ = ppi;
228  }
229  int ImageWidth() const {  // Width of the binary image, as reported by pixGetWidth.
230  return pixGetWidth(pix_binary_);  // Queries pix_binary_ specifically, not the grey or original pix.
231  }
232  int ImageHeight() const {  // Height of the binary image, as reported by pixGetHeight.
233  return pixGetHeight(pix_binary_);  // Queries pix_binary_ specifically, not the grey or original pix.
234  }
235  Pix* scaled_color() const {  // Returns the stored scaled_color_ pointer as-is.
236  return scaled_color_;
237  }
238  int scaled_factor() const {  // Returns the stored scaled_factor_ value.
239  return scaled_factor_;
240  }
241  void SetScaledColor(int factor, Pix* color) {  // Stores the scale factor and the color pix together.
242  scaled_factor_ = factor;
243  scaled_color_ = color;  // NOTE(review): unlike the set_pix_* setters, the previous scaled_color_ is not pixDestroy'd here -- confirm who owns it.
244  }
245  const Textord& textord() const {  // Read-only reference to the textord_ member.
246  return textord_;
247  }
249  return &textord_;
250  }
251 
252  bool right_to_left() const {  // Returns the stored right_to_left_ flag.
253  return right_to_left_;
254  }
255  int num_sub_langs() const {  // Number of entries currently in sub_langs_.
256  return sub_langs_.size();
257  }
258  Tesseract* get_sub_lang(int index) const {  // Returns the sub-language instance stored at position index of sub_langs_.
259  return sub_langs_[index];  // index is passed straight to operator[]; any range checking is left to the container.
260  }
261  // Returns true if this instance or any of its sub-languages uses Tesseract
261  // (as opposed to cube), i.e. has an engine mode other than OEM_CUBE_ONLY.
262  bool AnyTessLang() const {
263  if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true;  // This instance's own engine mode.
264  for (int i = 0; i < sub_langs_.size(); ++i) {  // Otherwise check each sub-language in turn.
265  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY)
266  return true;
267  }
268  return false;  // Every language is configured as cube-only.
269  }
270 
271  void SetBlackAndWhitelist();
272 
273  // Perform steps to prepare underlying binary image/other data structures for
274  // page segmentation. Uses the strategy specified in the global variable
275  // pageseg_devanagari_split_strategy to perform splitting while preparing for
276  // page segmentation.
277  void PrepareForPageseg();
278 
279  // Perform steps to prepare underlying binary image/other data structures for
280  // Tesseract OCR. The current segmentation is required by this method.
281  // Uses the strategy specified in the global variable
282  // ocr_devanagari_split_strategy for performing splitting while preparing for
283  // Tesseract ocr.
284  void PrepareForTessOCR(BLOCK_LIST* block_list,
285  Tesseract* osd_tess, OSResults* osr);
286 
287  int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
288  Tesseract* osd_tess, OSResults* osr);
289  void SetupWordScripts(BLOCK_LIST* blocks);
290  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
291  TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
292  Tesseract* osd_tess, OSResults* osr);
294  PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
295  OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
296  Pix** music_mask_pix);
297  // par_control.cpp
298  void PrerecAllWordsPar(const GenericVector<WordData>& words);
299 
301  bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
302  const char* word_config, int pass);
303  // Sets up the words ready for whichever engine is to be run
304  void SetupAllWordsPassN(int pass_n,
305  const TBOX* target_word_box,
306  const char* word_config,
307  PAGE_RES* page_res,
308  GenericVector<WordData>* words);
309  // Sets up the single word ready for whichever engine is to be run.
310  void SetupWordPassN(int pass_n, WordData* word);
311  // Runs word recognition on all the words.
312  bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
313  PAGE_RES_IT* pr_it,
314  GenericVector<WordData>* words);
315  bool recog_all_words(PAGE_RES* page_res,
316  ETEXT_DESC* monitor,
317  const TBOX* target_word_box,
318  const char* word_config,
319  int dopasses);
320  void rejection_passes(PAGE_RES* page_res,
321  ETEXT_DESC* monitor,
322  const TBOX* target_word_box,
323  const char* word_config);
324  void bigram_correction_pass(PAGE_RES *page_res);
325  void blamer_pass(PAGE_RES* page_res);
326  // Sets script positions and detects smallcaps on all output words.
327  void script_pos_pass(PAGE_RES* page_res);
328  // Helper to recognize the word using the given (language-specific) tesseract.
329  // Returns positive if this recognizer found more new best words than the
330  // number kept from best_words.
331  int RetryWithLanguage(const WordData& word_data,
332  WordRecognizer recognizer,
333  WERD_RES** in_word,
334  PointerVector<WERD_RES>* best_words);
335  // Moves good-looking "noise"/diacritics from the reject list to the main
336  // blob list on the current word. Returns true if anything was done, and
337  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
338  bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
339  bool* make_next_word_fuzzy);
340  // Attempts to put noise/diacritic outlines into the blobs that they overlap.
341  // Input: a set of noisy outlines that probably belong to the real_word.
342  // Output: outlines that overlapped blobs are set to NULL and put back into
343  // the word, either in the blobs or in the reject list.
345  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
346  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
347  GenericVector<bool>* overlapped_any_blob,
348  GenericVector<C_BLOB*>* target_blobs);
349  // Attempts to assign non-overlapping outlines to their nearest blobs or
350  // make new blobs out of them.
352  int pass, WERD* real_word, PAGE_RES_IT* pr_it,
353  GenericVector<bool>* word_wanted,
354  GenericVector<C_BLOB*>* target_blobs);
355  // Starting with ok_outlines set to indicate which outlines overlap the blob,
356  // chooses the optimal set (approximately) and returns true if any outlines
357  // are desired, in which case ok_outlines indicates which ones.
358  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
359  PAGE_RES_IT* pr_it, C_BLOB* blob,
360  const GenericVector<C_OUTLINE*>& outlines,
361  int num_outlines,
362  GenericVector<bool>* ok_outlines);
363  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
364  // the inclusion of the outlines, and returns the certainty of the raw choice.
365  float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
366  const GenericVector<C_OUTLINE*>& outlines,
367  int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
368  STRING* best_str);
369  // Classifies the given blob (part of word_data->word->word) as an individual
370  // word, using languages, chopper etc, returning only the certainty of the
371  // best raw choice, and undoing all the work done to fake out the word.
372  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
373  STRING* best_str, float* c2);
374  void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
375  WordData* word_data);
376  void classify_word_pass1(const WordData& word_data,
377  WERD_RES** in_word,
378  PointerVector<WERD_RES>* out_words);
379  void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
380  TBOX &selection_box);
381 
382  void fix_rep_char(PAGE_RES_IT* page_res_it);
383 
385  const char *s,
386  const char *lengths);
387  void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
388  void classify_word_pass2(const WordData& word_data,
389  WERD_RES** in_word,
390  PointerVector<WERD_RES>* out_words);
391  void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
392  WERD_RES* word, WERD_RES* new_word);
393  bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
394  bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
395  // Runs recognition with the test baseline shift and x-height and returns true
396  // if there was an improvement in recognition result.
397  bool TestNewNormalization(int original_misfits, float baseline_shift,
398  float new_x_ht, WERD_RES *word, BLOCK* block,
399  ROW *row);
401 
402  // Set fonts of this word.
403  void set_word_fonts(WERD_RES *word);
404  void font_recognition_pass(PAGE_RES* page_res);
405  void dictionary_correction_pass(PAGE_RES* page_res);
406  BOOL8 check_debug_pt(WERD_RES *word, int location);
407 
409  bool SubAndSuperscriptFix(WERD_RES *word_res);
410  void GetSubAndSuperscriptCandidates(const WERD_RES *word,
411  int *num_rebuilt_leading,
412  ScriptPos *leading_pos,
413  float *leading_certainty,
414  int *num_rebuilt_trailing,
415  ScriptPos *trailing_pos,
416  float *trailing_certainty,
417  float *avg_certainty,
418  float *unlikely_threshold);
419  WERD_RES *TrySuperscriptSplits(int num_chopped_leading,
420  float leading_certainty,
421  ScriptPos leading_pos,
422  int num_chopped_trailing,
423  float trailing_certainty,
424  ScriptPos trailing_pos,
425  WERD_RES *word,
426  bool *is_good,
427  int *retry_leading,
428  int *retry_trailing);
429  bool BelievableSuperscript(bool debug,
430  const WERD_RES &word,
431  float certainty_threshold,
432  int *left_ok,
433  int *right_ok) const;
434 
436 #ifndef NO_CUBE_BUILD
437  bool init_cube_objects(bool load_combiner,
439  // Iterates through tesseract's results and calls cube on each word,
440  // combining the results with the existing tesseract result.
441  void run_cube_combiner(PAGE_RES *page_res);
442  // Recognizes a single word using (only) cube. Compatible with
443  // Tesseract's classify_word_pass1/classify_word_pass2.
444  void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word);
445  // Cube recognizer to recognize a single word as with classify_word_pass1
446  // but also returns the cube object in case the combiner is needed.
448  // Combines the cube and tesseract results for a single word, leaving the
449  // result in tess_word.
450  void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word,
451  WERD_RES* tess_word);
452  // Call cube on the current word, and write the result to word.
453  // Sets up a fake result and returns false if something goes wrong.
454  bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word);
455  void fill_werd_res(const BoxWord& cube_box_word,
456  const char* cube_best_str,
457  WERD_RES* tess_werd_res);
458  bool extract_cube_state(CubeObject* cube_obj, int* num_chars,
459  Boxa** char_boxes, CharSamp*** char_samples);
460  bool create_cube_box_word(Boxa *char_boxes, int num_chars,
461  TBOX word_box, BoxWord* box_word);
462 #endif
463 
465  void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
466  void write_results(PAGE_RES_IT &page_res_it, // full info
467  char newline_type, // type of newline
468  BOOL8 force_eol // override tilde crunch?
469  );
470  void set_unlv_suspects(WERD_RES *word);
471  UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated?
472  BOOL8 acceptable_number_string(const char *s,
473  const char *lengths);
474  inT16 count_alphanums(const WERD_CHOICE &word);
475  inT16 count_alphas(const WERD_CHOICE &word);
477  void read_config_file(const char *filename, SetParamConstraint constraint);
478  // Initialize for potentially a set of languages defined by the language
479  // string and recursively any additional languages required by any language
480  // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
481  // See init_tesseract_internal for args.
482  int init_tesseract(const char *arg0,
483  const char *textbase,
484  const char *language,
485  OcrEngineMode oem,
486  char **configs,
487  int configs_size,
488  const GenericVector<STRING> *vars_vec,
489  const GenericVector<STRING> *vars_values,
490  bool set_only_init_params);
491  int init_tesseract(const char *datapath,  // Convenience overload taking only datapath, language and engine mode.
492  const char *language,
493  OcrEngineMode oem) {
494  return init_tesseract(datapath, NULL, language, oem,  // Forwards to the full overload: datapath goes in as arg0, textbase is NULL,
495  NULL, 0, NULL, NULL, false);  // no config files (NULL, 0), no variable overrides, set_only_init_params = false.
496  }
497  // Common initialization for a single language.
498  // arg0 is the datapath for the tessdata directory, which could be the
499  // path of the tessdata directory with no trailing /, or (if tessdata
500  // lives in the same directory as the executable), the path of the executable,
501  // hence the name arg0.
502  // textbase is an optional output file basename (used only for training)
503  // language is the language code to load.
504  // oem controls which engine(s) will operate on the image
505  // configs (argv) is an array of config filenames to load variables from.
506  // May be NULL.
507  // configs_size (argc) is the number of elements in configs.
508  // vars_vec is an optional vector of variables to set.
509  // vars_values is an optional corresponding vector of values for the variables
510  // in vars_vec.
511  // If set_only_init_params is true, then only the initialization variables
512  // will be set.
513  int init_tesseract_internal(const char *arg0,
514  const char *textbase,
515  const char *language,
516  OcrEngineMode oem,
517  char **configs,
518  int configs_size,
519  const GenericVector<STRING> *vars_vec,
520  const GenericVector<STRING> *vars_values,
521  bool set_only_init_params);
522 
523  // Set the universal_id member of each font to be unique among all
524  // instances of the same font loaded.
525  void SetupUniversalFontIds();
526 
527  int init_tesseract_lm(const char *arg0,
528  const char *textbase,
529  const char *language);
530 
531  void recognize_page(STRING& image_name);
532  void end_tesseract();
533 
534  bool init_tesseract_lang_data(const char *arg0,
535  const char *textbase,
536  const char *language,
537  OcrEngineMode oem,
538  char **configs,
539  int configs_size,
540  const GenericVector<STRING> *vars_vec,
541  const GenericVector<STRING> *vars_values,
542  bool set_only_init_params);
543 
544  void ParseLanguageString(const char* lang_str,
545  GenericVector<STRING>* to_load,
546  GenericVector<STRING>* not_to_load);
547 
550  #ifndef GRAPHICS_DISABLED
551  void pgeditor_main(int width, int height, PAGE_RES* page_res);
552  #endif // GRAPHICS_DISABLED
553  void process_image_event( // action in image win
554  const SVEvent &event);
555  BOOL8 process_cmd_win_event( // UI command semantics
556  inT32 cmd_event, // which menu item?
557  char *new_value // any prompt data
558  );
559  void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
560  void do_re_display(
561  BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
566  // #ifndef GRAPHICS_DISABLED
567  BOOL8 word_dumper(PAGE_RES_IT* pr_it);
568  // #endif // GRAPHICS_DISABLED
569  void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
571  // make rej map for word
572  void make_reject_map(WERD_RES *word, ROW *row, inT16 pass);
573  BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
574  inT16 first_alphanum_index(const char *word,
575  const char *word_lengths);
576  inT16 first_alphanum_offset(const char *word,
577  const char *word_lengths);
578  inT16 alpha_count(const char *word,
579  const char *word_lengths);
580  BOOL8 word_contains_non_1_digit(const char *word,
581  const char *word_lengths);
582  void dont_allow_1Il(WERD_RES *word);
583  inT16 count_alphanums( //how many alphanums
584  WERD_RES *word);
585  void flip_0O(WERD_RES *word);
586  BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
587  BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
589  void nn_match_word( //Match a word
590  WERD_RES *word,
591  ROW *row);
592  void nn_recover_rejects(WERD_RES *word, ROW *row);
593  void set_done( //set done flag
594  WERD_RES *word,
595  inT16 pass);
596  inT16 safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict?
597  void flip_hyphens(WERD_RES *word);
598  void reject_I_1_L(WERD_RES *word);
599  void reject_edge_blobs(WERD_RES *word);
600  void reject_mostly_rejects(WERD_RES *word);
602  BOOL8 word_adaptable( //should we adapt?
603  WERD_RES *word,
604  uinT16 mode);
605 
607  void recog_word_recursive(WERD_RES* word);
608  void recog_word(WERD_RES *word);
609  void split_and_recog_word(WERD_RES* word);
610  void split_word(WERD_RES *word,
611  int split_pt,
612  WERD_RES **right_piece,
613  BlamerBundle **orig_blamer_bundle) const;
614  void join_words(WERD_RES *word,
615  WERD_RES *word2,
616  BlamerBundle *orig_bb) const;
618  BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
619  inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
620  void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
621  inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
622  void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
623  void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
624  void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
625  void fix_fuzzy_spaces( //find fuzzy words
626  ETEXT_DESC *monitor, //progress monitor
627  inT32 word_count, //count of words in doc
628  PAGE_RES *page_res);
629  void dump_words(WERD_RES_LIST &perm, inT16 score,
630  inT16 mode, BOOL8 improved);
632  inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
633  float blob_noise_score(TBLOB *blob);
634  void break_noisiest_blob_word(WERD_RES_LIST &words);
636  GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
638  GARBAGE_LEVEL garbage_level,
639  BOOL8 ok_dict_word);
640  void tilde_crunch(PAGE_RES_IT &page_res_it);
641  void unrej_good_quality_words( //unreject potential
642  PAGE_RES_IT &page_res_it);
643  void doc_and_block_rejection( //reject big chunks
644  PAGE_RES_IT &page_res_it,
645  BOOL8 good_quality_doc);
646  void quality_based_rejection(PAGE_RES_IT &page_res_it,
647  BOOL8 good_quality_doc);
648  void convert_bad_unlv_chs(WERD_RES *word_res);
649  void tilde_delete(PAGE_RES_IT &page_res_it);
650  inT16 word_blob_quality(WERD_RES *word, ROW *row);
651  void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count,
652  inT16 *accepted_match_count);
653  void unrej_good_chs(WERD_RES *word, ROW *row);
654  inT16 count_outline_errs(char c, inT16 outline_count);
656  BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
657  CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
659  BOOL8 noise_outlines(TWERD *word);
661  void
663  PAGE_RES* page_res, // blocks to check
664  //function to call
665  TBOX & selection_box,
666  BOOL8 (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
668  void tess_add_doc_word( //test acceptability
669  WERD_CHOICE *word_choice //after context
670  );
671  void tess_segment_pass_n(int pass_n, WERD_RES *word);
672  bool tess_acceptable_word(WERD_RES *word);
673 
675  // Applies the box file based on the image name fname, and resegments
676  // the words in the block_list (page), with:
677  // blob-mode: one blob per line in the box file, words as input.
678  // word/line-mode: one blob per space-delimited unit after the #, and one word
679  // per line in the box file. (See comment above for box file format.)
680  // If find_segmentation is true, (word/line mode) then the classifier is used
681  // to re-segment words/lines to match the space-delimited truth string for
682  // each box. In this case, the input box may be for a word or even a whole
683  // text line, and the output words will contain multiple blobs corresponding
684  // to the space-delimited input string.
685  // With find_segmentation false, no classifier is needed, but the chopper
686  // can still be used to correctly segment touching characters with the help
687  // of the input boxes.
688  // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
689  // from normal classification, ie. with a word, chopped_word, rebuild_word,
690  // seam_array, denorm, box_word, and best_state, but NO best_choice or
691  // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
692  // Instead, the correct_text member of WERD_RES is set, and this may be later
693  // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
694  // is not required before calling ApplyBoxTraining.
695  PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
696  BLOCK_LIST *block_list);
697 
698  // Any row xheight that is significantly different from the median is set
699  // to the median.
700  void PreenXHeights(BLOCK_LIST *block_list);
701 
702  // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
703  // All fuzzy spaces are removed, and all the words are maximally chopped.
705  BLOCK_LIST *block_list);
706  // Tests the chopper by exhaustively running chop_one_blob.
707  // The word_res will contain filled chopped_word, seam_array, denorm,
708  // box_word and best_state for the maximally chopped word.
709  void MaximallyChopWord(const GenericVector<TBOX>& boxes,
710  BLOCK* block, ROW* row, WERD_RES* word_res);
711  // Gather consecutive blobs that match the given box into the best_state
712  // and corresponding correct_text.
713  // Fights over which box owns which blobs are settled by pre-chopping and
714  // applying the blobs to box or next_box with the least non-overlap.
715  // Returns false if the box was in error, which can only be caused by
716  // failing to find an appropriate blob for a box.
717  // This means that occasionally, blobs may be incorrectly segmented if the
718  // chopper fails to find a suitable chop point.
719  bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
720  const TBOX& box, const TBOX& next_box,
721  const char* correct_text);
722  // Consume all source blobs that strongly overlap the given box,
723  // putting them into a new word, with the correct_text label.
724  // Fights over which box owns which blobs are settled by
725  // applying the blobs to box or next_box with the least non-overlap.
726  // Returns false if the box was in error, which can only be caused by
727  // failing to find an overlapping blob for a box.
728  bool ResegmentWordBox(BLOCK_LIST *block_list,
729  const TBOX& box, const TBOX& next_box,
730  const char* correct_text);
731  // Resegments the words by running the classifier in an attempt to find the
732  // correct segmentation that produces the required string.
733  void ReSegmentByClassification(PAGE_RES* page_res);
734  // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
735  // Returns false if an invalid UNICHAR_ID is encountered.
736  bool ConvertStringToUnichars(const char* utf8,
737  GenericVector<UNICHAR_ID>* class_ids);
738  // Resegments the word to achieve the target_text from the classifier.
739  // Returns false if the re-segmentation fails.
740  // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
741  // applies a full search on the classifier results to find the best classified
742  // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
743  // substitutions ARE used.
744  bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
745  WERD_RES* word_res);
746  // Recursive helper to find a match to the target_text (from text_index
747  // position) in the choices (from choices_pos position).
748  // Choices is an array of GenericVectors, of length choices_length, with each
749  // element representing a starting position in the word, and the
750  // GenericVector holding classification results for a sequence of consecutive
751  // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
753  int choices_pos, int choices_length,
754  const GenericVector<UNICHAR_ID>& target_text,
755  int text_index,
756  float rating, GenericVector<int>* segmentation,
757  float* best_rating, GenericVector<int>* best_segmentation);
758  // Counts up the labelled words and the blobs within.
759  // Deletes all unused or emptied words, counting the unused ones.
760  // Resets W_BOL and W_EOL flags correctly.
761  // Builds the rebuild_word and rebuilds the box_word.
762  void TidyUp(PAGE_RES* page_res);
763  // Logs a bad box by line in the box file and box coords.
764  void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
765  const char *err_msg);
766  // Creates a fake best_choice entry in each WERD_RES with the correct text.
767  void CorrectClassifyWords(PAGE_RES* page_res);
768  // Call LearnWord to extract features for labelled blobs within each word.
769  // Features are stored in an internal buffer.
770  void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
771 
773  // Returns the number of misfit blob tops in this word.
774  int CountMisfitTops(WERD_RES *word_res);
775  // Returns a new x-height in pixels (original image coords) that is
776  // maximally compatible with the result in word_res.
777  // Returns 0.0f if no x-height is found that is better than the current
778  // estimate.
779  float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift);
781  // TODO(ocr-team): Find and remove obsolete parameters.
783  "Take segmentation and labeling from box file");
785  "Conversion of word/line box file to char box file");
787  "Generate training data from boxed chars");
789  "Generate more boxes from boxed chars");
791  "Dump intermediate images made during page segmentation");
793  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
794  " 5=line, 6=word, 7=char"
795  " (Values from PageSegMode enum in publictypes.h)");
797  "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults"
798  " to loading and running only Tesseract (no Cube, no combiner)."
799  " (Values from OcrEngineMode enum in tesseractclass.h)");
801  "Blacklist of chars not to recognize");
803  "Whitelist of chars to recognize");
805  "List of chars to override tessedit_char_blacklist");
807  "Perform training for ambiguities");
810  "Whether to use the top-line splitting process for Devanagari "
811  "documents while performing page-segmentation.");
814  "Whether to use the top-line splitting process for Devanagari "
815  "documents while performing ocr.");
817  "Write all parameters to the given file.");
819  "Generate and print debug information for adaption");
820  INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
821  INT_VAR_H(applybox_debug, 1, "Debug level");
822  INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
824  "Exposure value follows this pattern in the image"
825  " filename. The name of the image files are expected"
826  " to be in the form [lang].[fontname].exp[num].tif");
828  "Learn both character fragments (as is done in the"
829  " special low exposure mode) as well as unfragmented"
830  " characters.");
832  "Each bounding box is assumed to contain ngrams. Only"
833  " learn the ngrams whose outlines overlap horizontally.");
834  BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
835  BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
836  BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
838  "Try to improve fuzzy spaces");
840  "Don't bother with word plausibility");
841  BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
842  BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
844  "Add words to the document dictionary");
845  BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
846  BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
848  "Enable correction based on the word bigram dictionary.");
850  "Enable single word correction based on the dictionary.");
851  INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
852  "correction.");
854  "Remove and conditionally reassign small outlines when they"
855  " confuse layout analysis, determining diacritics vs noise");
856  INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
857  // Worst (min) certainty, for which a diacritic is allowed to make the base
858  // character worse and still be included.
859  double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
860  // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
861  // make the base character worse and still be included.
862  double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
863  // Worst (min) certainty, for which a diacritic is allowed to make a new
864  // stand-alone blob.
865  double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
866  // Factor of certainty margin for adding diacritics to not count as worse.
868  "Scaling on certainty diff from Hingepoint");
869  INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
870  INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
871  INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
872  BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
873  STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
874  STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
875  STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
876  double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
877  double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
879  "good_quality_doc lte outline error limit");
880  double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
881  INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
883  "Adaptation decision algorithm for tess");
885  "Do minimal rejection on pass 1 output");
886  BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
887  BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
889  "Adaptation decision algorithm for tess");
890  BOOL_VAR_H(test_pt, false, "Test for point");
891  double_VAR_H(test_pt_x, 99999.99, "xcoord");
892  double_VAR_H(test_pt_y, 99999.99, "ycoord");
893  INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
895  "Run paragraph detection on the post-text-recognition "
896  "(more accurate)");
897  INT_VAR_H(cube_debug_level, 1, "Print cube debug info.");
898  STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
899  STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
901  "Allow outline errs in unrejection?");
903  "Reduce rejection on good docs");
904  BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
906  "%rej allowed before rej whole doc");
908  "%rej allowed before rej whole block");
910  "%rej allowed before rej whole row");
912  "Number of row rejects in whole word rejects"
913  "which prevents whole row rejection");
915  "Only rej partially rejected words in block rejection");
917  "Only rej partially rejected words in row rejection");
919  "Use word segmentation quality metric");
921  "Use word segmentation quality metric");
923  "Only preserve wds longer than this");
925  "Apply row rejection to good docs");
927  "rej good doc wd if more than this fraction rejected");
929  "Reject all bad quality wds");
932  "Output data to debug file");
933  BOOL_VAR_H(bland_unrej, false, "unrej potential with no chekcs");
935  "good_quality_doc gte good char limit");
937  "Mark v.bad words for tilde crunch");
938  BOOL_VAR_H(hocr_font_info, false,
939  "Add font info to hocr output");
940  BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
941  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
942  double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
943  BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
945  "crunch garbage cert lt this");
946  double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
947  double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
948  double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
949  BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
950  double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
951  double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
952  double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
953  double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
954  double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
956  "Del if word gt xht x this above bl");
957  double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
958  double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
959  INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
960  INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
961  BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
962  BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
964  "Don't pot crunch sensible strings");
965  BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
967  "Don't crunch words with long lower case strings");
969  "Don't crunch words with long lower case strings");
970  INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
971  INT_VAR_H(crunch_debug, 0, "As it says");
973  "How many non-noise blbs either side?");
974  double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
975  BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctation joins");
976  INT_VAR_H(fixsp_done_mode, 1, "What constitues done for spacing");
977  INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
979  "Punct. chs expected WITHIN numbers");
981  "Max allowed deviation of blob top outside of font data");
982  INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
983  INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
984  double_VAR_H(superscript_worse_certainty, 2.0, "How many times worse "
985  "certainty does a superscript position glyph need to be for us "
986  "to try classifying it as a char with a different baseline?");
987  double_VAR_H(superscript_bettered_certainty, 0.97, "What reduction in "
988  "badness do we think sufficient to choose a superscript over "
989  "what we'd thought. For example, a value of 0.6 means we want "
990  "to reduce badness of certainty by 40%");
992  "A superscript scaled down more than this is unbelievably "
993  "small. For example, 0.3 means we expect the font size to "
994  "be no smaller than 30% of the text line font size.");
996  "Maximum top of a character measured as a multiple of x-height "
997  "above the baseline for us to reconsider whether it's a "
998  "subscript.");
1000  "Minimum bottom of a character measured as a multiple of "
1001  "x-height above the baseline for us to reconsider whether it's "
1002  "a superscript.");
1004  "Write block separators in output");
1006  "Write repetition char code");
1007  BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
1008  BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
1009  BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
1010  BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
1011  BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
1012  BOOL_VAR_H(textonly_pdf, false, "Create PDF with only one invisible text layer");
1014  "Output char for unidentified blobs");
1015  INT_VAR_H(suspect_level, 99, "Suspect marker level");
1017  "Min suspect level for rejecting spaces");
1018  INT_VAR_H(suspect_short_words, 2, "Don't Suspect dict wds longer than this");
1019  BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
1020  double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
1021  double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
1022  BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
1023  BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
1025  "Make output have exactly one word per WERD");
1027  "Don't reject ANYTHING AT ALL");
1028  BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
1029  INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
1030  BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
1031  BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
1033  "Aspect ratio dot/hyphen test");
1035  "Aspect ratio dot/hyphen test");
1036  BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
1037  BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
1038  BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
1039  BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
1040  BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
1041  BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
1042  BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
1043  BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
1045  INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
1047  "Allow NN to unrej");
1048  STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
1049  INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
1050  BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
1052  "-1 -> All pages, else specifc page to process");
1053  BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
1054  BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
1055  STRING_VAR_H(file_type, ".tif", "Filename extension");
1056  BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
1058  "Debug level for TessdataManager functions.");
1060  "List of languages to load with this one");
1062  "In multilingual mode use params model of the primary language");
1063  // Min acceptable orientation margin (difference in scores between top and 2nd
1064  // choice in OSResults::orientations) to believe the page orientation.
1066  "Min acceptable orientation margin");
1067  BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
1068  BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model");
1070  "Allow feature extractors to see the original outline");
1072  "Only initialize with the config file. Useful if the instance is "
1073  "not going to be used for OCR but say only for layout analysis.");
1074  BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
1075  BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
1077  "Force using vertical text page mode");
1079  "Fraction of textlines deemed vertical to use vertical page "
1080  "mode");
1082  "Fraction of height used as a minimum gap for aligned blobs.");
1083  INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
1085  "Preserve multiple interword spaces");
1087  "Include page separator string in output text after each "
1088  "image/page.");
1090  "Page separator (default is form feed control character)");
1091 
1092  // The following parameters were deprecated and removed from their original
1093  // locations. The parameters are temporarily kept here to give Tesseract
1094  // users a chance to update their [lang].traineddata and config files
1095  // without introducing failures during Tesseract initialization.
1096  // TODO(ocr-team): remove these parameters from the code once we are
1097  // reasonably sure that Tesseract users have updated their data files.
1098  //
1099  // BEGIN DEPRECATED PARAMETERS
1101  "find horizontal lines such as headers in vertical page mode");
1102  INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
1103  BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
1104  " dawgs (e.g. for non-space delimited languages)");
1105  INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
1106  BOOL_VAR_H(permute_debug, 0, "char permutation debug");
1107  double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
1108  " current best rate to prune other hypotheses");
1110  "Turn on word script consistency permuter");
1112  "incorporate segmentation cost in word rating?");
1114  "Score multipler for script consistency within a word. "
1115  "Being a 'reward' factor, it should be <= 1. "
1116  "Smaller value implies bigger reward.");
1118  "Turn on fixed-length phrasebook search permuter");
1120  "Turn on character type (property) consistency permuter");
1122  "Score multipler for char type consistency within a word. ");
1124  "Score multipler for ngram permuter's best choice"
1125  " (only used in the Han script path).");
1127  "Activate character-level n-gram-based permuter");
1128  BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
1130  "Depth of blob choice lists to explore"
1131  " when fixed length dawgs are on");
1133  "use new state cost heuristics for segmentation state evaluation");
1135  "base factor for adding segmentation cost into word rating."
1136  "It's a multiplying factor, the larger the value above 1, "
1137  "the bigger the effect of segmentation cost.");
1139  "weight associated with char rating in combined cost of state");
1141  "weight associated with width evidence in combined cost of"
1142  " state");
1144  "weight associated with seam cut in combined cost of state");
1146  "max char width-to-height ratio allowed in segmentation");
1148  "Enable new segmentation search path.");
1150  "Maximum character width-to-height ratio for"
1151  "fixed pitch fonts");
1152  // END DEPRECATED PARAMETERS
1153 
1155  FILE *init_recog_training(const STRING &fname);
1156  void recog_training_segmented(const STRING &fname,
1157  PAGE_RES *page_res,
1158  volatile ETEXT_DESC *monitor,
1159  FILE *output_file);
1160  void ambigs_classify_and_output(const char *label,
1161  PAGE_RES_IT* pr_it,
1162  FILE *output_file);
1163 
1164 #ifndef NO_CUBE_BUILD
      // Accessor for the cube recognition context: returns the cube_cntxt_
      // pointer member as-is. Only declared when NO_CUBE_BUILD is not defined,
      // matching the guard around the cube_cntxt_ member itself.
1165  inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }
1166 #endif
1167 
1168  private:
1169  // The filename of a backup config file. If not null, then we currently
1170  // have a temporary debug config file loaded, and backup_config_file_
1171  // will be loaded, and set to null when debug is complete.
1172  const char* backup_config_file_;
1173  // The filename of a config file to read when processing a debug word.
1174  STRING word_config_;
1175  // Image used for input to layout analysis and tesseract recognition.
1176  // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
1177  Pix* pix_binary_;
1178  // Unmodified image used for input to cube. Always valid.
1179  Pix* cube_binary_;
1180  // Grey-level input image if the input was not binary, otherwise NULL.
1181  Pix* pix_grey_;
1182  // Original input image. Color if the input was color.
1183  Pix* pix_original_;
1184  // Thresholds that were used to generate the thresholded image from grey.
1185  Pix* pix_thresholds_;
1186  // Input image resolution after any scaling. The resolution is not well
1187  // transmitted by operations on Pix, so we keep an independent record here.
1188  int source_resolution_;
1189  // The shiro-rekha splitter object which is used to split top-lines in
1190  // Devanagari words to provide a better word and grapheme segmentation.
1191  ShiroRekhaSplitter splitter_;
1192  // Page segmentation/layout
1193  Textord textord_;
1194  // True if the primary language uses right_to_left reading order.
1195  bool right_to_left_;
      // NOTE(review): the next two members are undocumented here. Presumably
      // the color image and scale factor stored by
      // SetScaledColor(int factor, Pix* color) - confirm against the setter.
1196  Pix* scaled_color_;
1197  int scaled_factor_;
      // NOTE(review): undocumented. Presumably the page deskew vector and the
      // vector that undoes it (reskew) - confirm where these are assigned.
1198  FCOORD deskew_;
1199  FCOORD reskew_;
      // NOTE(review): undocumented; TesseractStats is declared outside this
      // part of the listing, so its contents cannot be described from here.
1200  TesseractStats stats_;
1201  // Sub-languages to be tried in addition to this.
1202  GenericVector<Tesseract*> sub_langs_;
1203  // Most recently used Tesseract out of this and sub_langs_. The default
1204  // language for the next word.
1205  Tesseract* most_recently_used_;
1206  // The size of the font table, ie max possible font id + 1.
1207  int font_table_size_;
1208 #ifndef NO_CUBE_BUILD
1209  // Cube objects.
1210  CubeRecoContext* cube_cntxt_;
1211  TesseractCubeCombiner *tess_cube_combiner_;
1212 #endif
1213  // Equation detector. Note: this pointer is NOT owned by the class.
1214  EquationDetect* equ_detect_;
1215 };
1216 
1217 } // namespace tesseract
1218 
1219 
1220 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:982
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager)
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
CubeObject * cube_recognize_word(BLOCK *block, WERD_RES *word)
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1885
void SetScaledColor(int factor, Pix *color)
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:79
BOOL8 word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:729
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:716
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
bool tessedit_preserve_blk_rej_perfect_wds
double tessedit_reject_block_percent
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:145
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
Pix * scaled_color() const
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:240
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:509
BOOL8 word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:717
bool right_to_left() const
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
int source_resolution() const
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:789
void process_image_event(const SVEvent &event)
Definition: pgedit.cpp:565
void set_pix_thresholds(Pix *thresholds)
void recognize_page(STRING &image_name)
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1493
short inT16
Definition: host.h:33
double tessedit_reject_row_percent
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1554
#define STRING_VAR_H(name, val, comment)
Definition: params.h:271
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:692
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
bool create_cube_box_word(Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:916
const Textord & textord() const
void SetupWordScripts(BLOCK_LIST *blocks)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1408
bool tessedit_resegment_from_line_boxes
char * tessedit_write_params_to_file
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language)
Definition: tessedit.cpp:465
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:131
bool textord_tabfind_vertical_horizontal_mix
double superscript_bettered_certainty
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:69
bool cube_recognize(CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:145
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:526
double rej_whole_of_mostly_reject_word_fract
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:337
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:615
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
const FCOORD & reskew() const
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:422
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:546
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:182
void set_pix_grey(Pix *grey_pix)
#define INT_VAR_H(name, val, comment)
Definition: params.h:265
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:283
bool tessedit_preserve_row_rej_perfect_wds
BOOL8 process_cmd_win_event(inT32 cmd_event, char *new_value)
Definition: pgedit.cpp:397
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:664
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:582
bool SubAndSuperscriptFix(WERD_RES *word_res)
double textord_tabfind_aligned_gap_fraction
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:394
CRUNCH_MODE
Definition: pageres.h:145
Textord * mutable_textord()
void nn_match_word(WERD_RES *word, ROW *row)
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1629
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:115
double heuristic_segcost_rating_base
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:57
bool tessedit_enable_bigram_correction
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:569
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:81
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1668
bool crunch_early_convert_bad_unlv_chs
bool applybox_learn_chars_and_char_frags_mode
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1196
bool extract_cube_state(CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
double segment_reward_ngram_best_choice
unsigned char BOOL8
Definition: host.h:46
double superscript_scaledown_ratio
void fill_werd_res(const BoxWord &cube_box_word, const char *cube_best_str, WERD_RES *tess_werd_res)
void run_cube_combiner(PAGE_RES *page_res)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:39
inT16 count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:408
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:239
void PreenXHeights(BLOCK_LIST *block_list)
inT16 count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:398
void SetEquationDetect(EquationDetect *detector)
void set_source_resolution(int ppi)
TessdataManager tessdata_manager
Definition: ccutil.h:69
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:46
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1350
BOOL8 word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:761
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:830
void SetupUniversalFontIds()
Definition: tessedit.cpp:444
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:66
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:268
double superscript_worse_certainty
CubeRecoContext * GetCubeRecoContext()
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:503
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:641
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:263
double tessedit_good_doc_still_rowrej_wd
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:449
void set_done(WERD_RES *word, inT16 pass)
BOOL8 acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:419
unsigned short uinT16
Definition: host.h:34
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:482
double tessedit_reject_doc_percent
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48
Definition: werd.h:60
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1111
#define FALSE
Definition: capi.h:46
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:120
void CorrectClassifyWords(PAGE_RES *page_res)
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196
CMD_EVENTS mode
Definition: pgedit.cpp:116
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1035
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:343
Pix * pix_binary() const
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:1963
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:970
void TidyUp(PAGE_RES *page_res)
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:165
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:875
double tessedit_whole_wd_rej_row_percent
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:191
Definition: blobs.h:395
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:45
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:448
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2020
GARBAGE_LEVEL
Definition: docqual.h:25
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:899
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:785
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:982
bool AnyTessLang() const
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:143
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
inT16 first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:469
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:168
PointerVector< WERD_RES > lang_words
int inT32
Definition: host.h:35
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:308
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:573
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:946
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
Definition: ocrrow.h:32
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68
FILE * init_recog_training(const STRING &fname)
double segsearch_max_fixed_pitch_char_wh_ratio
Definition: strngs.h:44
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:204
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:254
double textord_tabfind_vertical_text_ratio
Definition: points.h:189
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
Definition: ocrblock.h:30
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1387
Pix * BestPix() const
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:616
Definition: blobs.h:261
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:293
WordData(const PAGE_RES_IT &page_res_it)
int scaled_factor() const
void flip_0O(WERD_RES *word)
Definition: reject.cpp:673
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:160
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:88
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
Definition: output.cpp:130
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:684
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:144
void set_pix_original(Pix *original_pix)
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1238
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
Pix * pix_original() const
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:36
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1274
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
int language_model_fixed_length_choices_depth
char * ok_repeated_ch_non_alphanum_wds
bool textord_tabfind_force_vertical_text
Tesseract * get_sub_lang(int index) const
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:594
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:760
Definition: rect.h:30
void ReSegmentByClassification(PAGE_RES *page_res)
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:101
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:290
Pix * pix_grey() const
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1772
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1442
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:960
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:495
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
Definition: reject.cpp:292
BOOL8 word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:922
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:596
#define double_VAR_H(name, val, comment)
Definition: params.h:274
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:680
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:305
int num_sub_langs() const
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:508
void nn_recover_rejects(WERD_RES *word, ROW *row)
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:535
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
SetParamConstraint
Definition: params.h:36
void cube_combine_word(CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:237
int UNICHAR_ID
Definition: unichar.h:33
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:257