26 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__ 27 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__ 29 #include "allheaders.h" 39 class BLOB_CHOICE_LIST_CLIST;
101 class CubeLineObject;
103 class CubeRecoContext;
105 class EquationDetect;
107 #ifndef NO_CUBE_BUILD 108 class TesseractCubeCombiner;
192 pixDestroy(&pix_binary_);
202 pixDestroy(&pix_grey_);
203 pix_grey_ = grey_pix;
208 pixDestroy(&pix_original_);
209 pix_original_ = original_pix;
218 Pix*
BestPix()
const {
return pix_original_; }
220 pixDestroy(&pix_thresholds_);
221 pix_thresholds_ = thresholds;
224 return source_resolution_;
227 source_resolution_ = ppi;
230 return pixGetWidth(pix_binary_);
233 return pixGetHeight(pix_binary_);
236 return scaled_color_;
239 return scaled_factor_;
242 scaled_factor_ = factor;
243 scaled_color_ = color;
253 return right_to_left_;
256 return sub_langs_.size();
259 return sub_langs_[index];
264 for (
int i = 0; i < sub_langs_.size(); ++i) {
291 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
295 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
296 Pix** music_mask_pix);
302 const char* word_config,
int pass);
305 const TBOX* target_word_box,
306 const char* word_config,
317 const TBOX* target_word_box,
318 const char* word_config,
322 const TBOX* target_word_box,
323 const char* word_config);
339 bool* make_next_word_fuzzy);
373 STRING* best_str,
float* c2);
380 TBOX &selection_box);
386 const char *lengths);
411 int *num_rebuilt_leading,
413 float *leading_certainty,
414 int *num_rebuilt_trailing,
416 float *trailing_certainty,
417 float *avg_certainty,
418 float *unlikely_threshold);
420 float leading_certainty,
422 int num_chopped_trailing,
423 float trailing_certainty,
428 int *retry_trailing);
431 float certainty_threshold,
433 int *right_ok)
const;
436 #ifndef NO_CUBE_BUILD 456 const char* cube_best_str,
459 Boxa** char_boxes,
CharSamp*** char_samples);
473 const char *lengths);
483 const char *textbase,
484 const char *language,
490 bool set_only_init_params);
492 const char *language,
495 NULL, 0, NULL, NULL,
false);
514 const char *textbase,
515 const char *language,
521 bool set_only_init_params);
528 const char *textbase,
529 const char *language);
535 const char *textbase,
536 const char *language,
542 bool set_only_init_params);
550 #ifndef GRAPHICS_DISABLED 552 #endif // GRAPHICS_DISABLED 575 const char *word_lengths);
577 const char *word_lengths);
579 const char *word_lengths);
581 const char *word_lengths);
645 BOOL8 good_quality_doc);
647 BOOL8 good_quality_doc);
652 inT16 *accepted_match_count);
665 TBOX & selection_box,
696 BLOCK_LIST *block_list);
705 BLOCK_LIST *block_list);
720 const TBOX& box,
const TBOX& next_box,
721 const char* correct_text);
729 const TBOX& box,
const TBOX& next_box,
730 const char* correct_text);
753 int choices_pos,
int choices_length,
765 const char *err_msg);
783 "Take segmentation and labeling from box file");
785 "Conversion of word/line box file to char box file");
787 "Generate training data from boxed chars");
789 "Generate more boxes from boxed chars");
791 "Dump intermediate images made during page segmentation");
793 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 794 " 5=line, 6=word, 7=char" 795 " (Values from PageSegMode enum in publictypes.h)");
797 "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" 798 " to loading and running only Tesseract (no Cube, no combiner)." 799 " (Values from OcrEngineMode enum in tesseractclass.h)");
801 "Blacklist of chars not to recognize");
803 "Whitelist of chars to recognize");
805 "List of chars to override tessedit_char_blacklist");
807 "Perform training for ambiguities");
810 "Whether to use the top-line splitting process for Devanagari " 811 "documents while performing page-segmentation.");
814 "Whether to use the top-line splitting process for Devanagari " 815 "documents while performing ocr.");
817 "Write all parameters to the given file.");
819 "Generate and print debug information for adaption");
824 "Exposure value follows this pattern in the image" 825 " filename. The name of the image files are expected" 826 " to be in the form [lang].[fontname].exp[num].tif");
828 "Learn both character fragments (as is done in the" 829 " special low exposure mode) as well as unfragmented" 832 "Each bounding box is assumed to contain ngrams. Only" 833 " learn the ngrams whose outlines overlap horizontally.");
838 "Try to improve fuzzy spaces");
840 "Don't bother with word plausibility");
844 "Add words to the document dictionary");
848 "Enable correction based on the word bigram dictionary.");
850 "Enable single word correction based on the dictionary.");
854 "Remove and conditionally reassign small outlines when they" 855 " confuse layout analysis, determining diacritics vs noise");
868 "Scaling on certainty diff from Hingepoint");
879 "good_quality_doc lte outline error limit");
883 "Adaptation decision algorithm for tess");
885 "Do minimal rejection on pass 1 output");
889 "Adaptation decision algorithm for tess");
895 "Run paragraph detection on the post-text-recognition " 901 "Allow outline errs in unrejection?");
903 "Reduce rejection on good docs");
906 "%rej allowed before rej whole doc");
908 "%rej allowed before rej whole block");
910 "%rej allowed before rej whole row");
912 "Number of row rejects in whole word rejects" 913 "which prevents whole row rejection");
915 "Only rej partially rejected words in block rejection");
917 "Only rej partially rejected words in row rejection");
919 "Use word segmentation quality metric");
921 "Use word segmentation quality metric");
923 "Only preserve wds longer than this");
925 "Apply row rejection to good docs");
927 "rej good doc wd if more than this fraction rejected");
929 "Reject all bad quality wds");
932 "Output data to debug file");
935 "good_quality_doc gte good char limit");
937 "Mark v.bad words for tilde crunch");
939 "Add font info to hocr output");
945 "crunch garbage cert lt this");
956 "Del if word gt xht x this above bl");
964 "Don't pot crunch sensible strings");
967 "Don't crunch words with long lower case strings");
969 "Don't crunch words with long lower case strings");
973 "How many non-noise blbs either side?");
979 "Punct. chs expected WITHIN numbers");
981 "Max allowed deviation of blob top outside of font data");
985 "certainty does a superscript position glyph need to be for us " 986 "to try classifying it as a char with a different baseline?");
988 "badness do we think sufficient to choose a superscript over " 989 "what we'd thought. For example, a value of 0.6 means we want " 990 "to reduce badness of certainty by 40%");
992 "A superscript scaled down more than this is unbelievably " 993 "small. For example, 0.3 means we expect the font size to " 994 "be no smaller than 30% of the text line font size.");
996 "Maximum top of a character measured as a multiple of x-height " 997 "above the baseline for us to reconsider whether it's a " 1000 "Minimum bottom of a character measured as a multiple of " 1001 "x-height above the baseline for us to reconsider whether it's " 1004 "Write block separators in output");
1006 "Write repetition char code");
1014 "Output char for unidentified blobs");
1017 "Min suspect level for rejecting spaces");
1025 "Make output have exactly one word per WERD");
1027 "Don't reject ANYTHING AT ALL");
1033 "Aspect ratio dot/hyphen test");
1035 "Aspect ratio dot/hyphen test");
1047 "Allow NN to unrej");
1052 "-1 -> All pages, else specifc page to process");
1058 "Debug level for TessdataManager functions.");
1060 "List of languages to load with this one");
1062 "In multilingual mode use params model of the primary language");
1066 "Min acceptable orientation margin");
1070 "Allow feature extractors to see the original outline");
1072 "Only initialize with the config file. Useful if the instance is " 1073 "not going to be used for OCR but say only for layout analysis.");
1077 "Force using vertical text page mode");
1079 "Fraction of textlines deemed vertical to use vertical page " 1082 "Fraction of height used as a minimum gap for aligned blobs.");
1085 "Preserve multiple interword spaces");
1087 "Include page separator string in output text after each " 1090 "Page separator (default is form feed control character)");
1101 "find horizontal lines such as headers in vertical page mode");
1104 " dawgs (e.g. for non-space delimited languages)");
1108 " current best rate to prune other hypotheses");
1110 "Turn on word script consistency permuter");
1112 "incorporate segmentation cost in word rating?");
1114 "Score multipler for script consistency within a word. " 1115 "Being a 'reward' factor, it should be <= 1. " 1116 "Smaller value implies bigger reward.");
1118 "Turn on fixed-length phrasebook search permuter");
1120 "Turn on character type (property) consistency permuter");
1122 "Score multipler for char type consistency within a word. ");
1124 "Score multipler for ngram permuter's best choice" 1125 " (only used in the Han script path).");
1127 "Activate character-level n-gram-based permuter");
1130 "Depth of blob choice lists to explore" 1131 " when fixed length dawgs are on");
1133 "use new state cost heuristics for segmentation state evaluation");
1135 "base factor for adding segmentation cost into word rating." 1136 "It's a multiplying factor, the larger the value above 1, " 1137 "the bigger the effect of segmentation cost.");
1139 "weight associated with char rating in combined cost of state");
1141 "weight associated with width evidence in combined cost of" 1144 "weight associated with seam cut in combined cost of state");
1146 "max char width-to-height ratio allowed in segmentation");
1148 "Enable new segmentation search path.");
1150 "Maximum character width-to-height ratio for" 1151 "fixed pitch fonts");
1164 #ifndef NO_CUBE_BUILD 1172 const char* backup_config_file_;
1185 Pix* pix_thresholds_;
1188 int source_resolution_;
1195 bool right_to_left_;
1207 int font_table_size_;
1208 #ifndef NO_CUBE_BUILD 1220 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager)
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
CubeObject * cube_recognize_word(BLOCK *block, WERD_RES *word)
bool preserve_interword_spaces
void set_word_fonts(WERD_RES *word)
bool tessedit_use_reject_spaces
void SetScaledColor(int factor, Pix *color)
void tess_add_doc_word(WERD_CHOICE *word_choice)
char * numeric_punctuation
BOOL8 word_bln_display(PAGE_RES_IT *pr_it)
void script_pos_pass(PAGE_RES *page_res)
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
bool tessedit_preserve_blk_rej_perfect_wds
double tessedit_reject_block_percent
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
Pix * scaled_color() const
bool tessedit_reject_bad_qual_wds
double suspect_accept_rating
double tessedit_upper_flip_hyphen
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
double crunch_poor_garbage_rate
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
bool permute_chartype_word
BOOL8 word_blank_and_set_display(PAGE_RES_IT *pr_its)
bool right_to_left() const
int tessedit_test_adaption_mode
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
int source_resolution() const
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
void process_image_event(const SVEvent &event)
bool tessedit_fix_fuzzy_spaces
double crunch_small_outlines_size
int tessedit_ocr_engine_mode
double heuristic_max_char_wh_ratio
void set_pix_thresholds(Pix *thresholds)
void recognize_page(STRING &image_name)
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
int tessedit_pageseg_mode
double tessedit_reject_row_percent
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
#define STRING_VAR_H(name, val, comment)
void blamer_pass(PAGE_RES *page_res)
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
void recog_word_recursive(WERD_RES *word)
bool create_cube_box_word(Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
const Textord & textord() const
void SetupWordScripts(BLOCK_LIST *blocks)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
bool tessedit_dump_pageseg_images
bool tessedit_resegment_from_line_boxes
char * tessedit_write_params_to_file
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language)
bool tessedit_debug_fonts
bool interactive_display_mode
bool tessedit_redo_xheight
bool suspect_constrain_1Il
inT16 count_outline_errs(char c, inT16 outline_count)
bool textord_tabfind_vertical_horizontal_mix
int x_ht_acceptance_tolerance
double superscript_bettered_certainty
bool tess_acceptable_word(WERD_RES *word)
bool cube_recognize(CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void dont_allow_1Il(WERD_RES *word)
double rej_whole_of_mostly_reject_word_fract
void pgeditor_main(int width, int height, PAGE_RES *page_res)
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool poly_allow_detailed_fx
char * tessedit_char_whitelist
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
const FCOORD & reskew() const
bool tessedit_enable_dict_correction
void tilde_crunch(PAGE_RES_IT &page_res_it)
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
char * tessedit_load_sublangs
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
void set_pix_grey(Pix *grey_pix)
#define INT_VAR_H(name, val, comment)
bool tessedit_consistent_reps
UNICHAR_ID get_rep_char(WERD_RES *word)
bool tessedit_preserve_row_rej_perfect_wds
BOOL8 process_cmd_win_event(inT32 cmd_event, char *new_value)
void convert_bad_unlv_chs(WERD_RES *word_res)
bool tilde_crunch_written
double bestrate_pruning_factor
double segment_reward_chartype
bool tessedit_override_permuter
int debug_fix_space_level
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
double crunch_del_high_word
BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
bool tessedit_zero_rejection
bool SubAndSuperscriptFix(WERD_RES *word_res)
double textord_tabfind_aligned_gap_fraction
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
bool tessedit_dont_rowrej_good_wds
char * tessedit_char_unblacklist
Textord * mutable_textord()
Pix ** mutable_pix_binary()
void nn_match_word(WERD_RES *word, ROW *row)
bool rej_use_tess_accepted
void fix_rep_char(PAGE_RES_IT *page_res_it)
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
double heuristic_segcost_rating_base
bool crunch_early_merge_tess_fails
void read_config_file(const char *filename, SetParamConstraint constraint)
bool tessedit_enable_bigram_correction
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool tessedit_init_config_only
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
double crunch_terrible_rating
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
bool segment_segcost_rating
bool crunch_terrible_garbage
bool tessedit_word_for_word
bool tessedit_matcher_log
inT16 word_outline_errs(WERD_RES *word)
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool crunch_early_convert_bad_unlv_chs
bool enable_noise_removal
bool applybox_learn_chars_and_char_frags_mode
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
bool extract_cube_state(CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
bool tessedit_minimal_rejection
double segment_reward_ngram_best_choice
bool tessedit_train_from_boxes
bool tessedit_enable_doc_dict
double superscript_scaledown_ratio
inT32 adaption_word_number
double min_orientation_margin
double crunch_del_low_word
void fill_werd_res(const BoxWord &cube_box_word, const char *cube_best_str, WERD_RES *tess_werd_res)
bool tessedit_dump_choices
void run_cube_combiner(PAGE_RES *page_res)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
inT16 count_alphanums(const WERD_CHOICE &word)
char * chs_trailing_punct1
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
double noise_cert_disjoint
bool tessedit_write_block_separators
bool tessedit_debug_block_rejection
int tessedit_preserve_min_wd_len
double crunch_del_min_width
void PreenXHeights(BLOCK_LIST *block_list)
bool tessedit_row_rej_good_docs
bool load_fixed_length_dawgs
bool rej_1Il_trust_permuter_type
inT16 count_alphas(const WERD_CHOICE &word)
void SetEquationDetect(EquationDetect *detector)
bool tessedit_fix_hyphens
void set_source_resolution(int ppi)
TessdataManager tessdata_manager
void recog_word(WERD_RES *word)
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
BOOL8 word_display(PAGE_RES_IT *pr_it)
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
void SetupUniversalFontIds()
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
#define BOOL_VAR_H(name, val, comment)
bool textord_equation_detect
int crunch_pot_indicators
double superscript_worse_certainty
CubeRecoContext * GetCubeRecoContext()
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
void reject_edge_blobs(WERD_RES *word)
double tessedit_good_doc_still_rowrej_wd
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
bool tessedit_ambigs_training
void set_done(WERD_RES *word, inT16 pass)
BOOL8 acceptable_number_string(const char *s, const char *lengths)
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
void SetBlackAndWhitelist()
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
double tessedit_reject_doc_percent
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
double crunch_pot_poor_rate
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
double superscript_min_y_bottom
void unrej_good_chs(WERD_RES *word, ROW *row)
void CorrectClassifyWords(PAGE_RES *page_res)
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
bool tessedit_dont_blkrej_good_wds
double heuristic_weight_seamcut
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
bool write_results_empty_block
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
inT16 doc_good_char_quality
void font_recognition_pass(PAGE_RES *page_res)
int quality_min_initial_alphas_reqd
bool tessedit_make_boxes_from_boxes
inT16 failure_count(WERD_RES *word)
void TidyUp(PAGE_RES *page_res)
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
double tessedit_whole_wd_rej_row_percent
bool crunch_leave_ok_strings
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
int crunch_long_repetitions
void ResetDocumentDictionary()
void reject_I_1_L(WERD_RES *word)
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
bool tessedit_use_primary_params_model
void ResetAdaptiveClassifier()
void bigram_correction_pass(PAGE_RES *page_res)
bool tessedit_debug_quality_metrics
double noise_cert_basechar
int crunch_leave_lc_strings
bool tessedit_rejection_debug
void dictionary_correction_pass(PAGE_RES *page_res)
bool last_char_was_newline
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool ngram_permuter_activated
bool tessedit_test_adaption
bool tessedit_adaption_debug
BOOL8 noise_outlines(TWERD *word)
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
inT16 first_alphanum_index(const char *word, const char *word_lengths)
int paragraph_debug_level
void SetupWordPassN(int pass_n, WordData *word)
double suspect_rating_per_ch
PointerVector< WERD_RES > lang_words
bool crunch_leave_accept_strings
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
bool tessedit_create_boxfile
void reject_mostly_rejects(WERD_RES *word)
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
int tessedit_bigram_debug
double tessedit_lower_flip_hyphen
bool unlv_tilde_crunching
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
FILE * init_recog_training(const STRING &fname)
double heuristic_weight_width
bool crunch_include_numerals
int crunch_leave_uc_strings
double segsearch_max_fixed_pitch_char_wh_ratio
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
double textord_tabfind_vertical_text_ratio
double crunch_pot_poor_cert
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
inT16 word_blob_quality(WERD_RES *word, ROW *row)
double crunch_poor_garbage_cert
bool tessedit_good_quality_unrej
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
bool debug_acceptable_wds
double heuristic_weight_rating
char * chs_trailing_punct2
bool tessedit_write_images
void flip_hyphens(WERD_RES *word)
bool tessedit_resegment_from_boxes
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
WordData(const PAGE_RES_IT &page_res_it)
int scaled_factor() const
void flip_0O(WERD_RES *word)
Assume a single uniform block of text. (Default.)
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
int CountMisfitTops(WERD_RES *word_res)
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
int fixsp_non_noise_limit
void split_and_recog_word(WERD_RES *word)
int pageseg_devanagari_split_strategy
void set_pix_original(Pix *original_pix)
bool applybox_learn_ngrams_mode
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
bool textord_tabfind_vertical_text
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
Pix * pix_original() const
void PrerecAllWordsPar(const GenericVector< WordData > &words)
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)
bool rej_alphas_in_number_perm
bool tessedit_write_rep_codes
int tessdata_manager_debug_level
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
inT16 safe_dict_word(const WERD_RES *werd_res)
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
int tessedit_tess_adaption_mode
bool tessedit_prefer_joined_punct
int language_model_fixed_length_choices_depth
double subscript_max_y_top
char * ok_repeated_ch_non_alphanum_wds
double quality_outline_pc
int ocr_devanagari_split_strategy
bool textord_tabfind_force_vertical_text
double fixsp_small_outlines_size
bool docqual_excuse_outline_errs
Tesseract * get_sub_lang(int index) const
void tilde_delete(PAGE_RES_IT &page_res_it)
float blob_noise_score(TBLOB *blob)
void ReSegmentByClassification(PAGE_RES *page_res)
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
bool enable_new_segsearch
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
bool tessedit_timing_debug
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
char * applybox_exposure_pattern
bool textord_tabfind_show_vlines
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
BOOL8 check_debug_pt(WERD_RES *word, int location)
bool tessedit_zero_kelvin_rejection
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
char * tessedit_char_blacklist
bool tessedit_unrej_any_wd
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
inT16 alpha_count(const char *word, const char *word_lengths)
int tessedit_image_border
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
bool tessedit_debug_doc_rejection
bool paragraph_text_based
BOOL8 word_dumper(PAGE_RES_IT *pr_it)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
bool tessedit_minimal_rej_pass1
#define double_VAR_H(name, val, comment)
double segment_reward_script
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
void set_unlv_suspects(WERD_RES *word)
int num_sub_langs() const
bool tessedit_display_outwords
bool textord_use_cjk_fp_model
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
void nn_recover_rejects(WERD_RES *word, ROW *row)
bool permute_fixed_length_dawg
char * conflict_set_I_l_1
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
void cube_combine_word(CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word)
bool tessedit_create_hocr
bool rej_1Il_use_dict_word
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
SVMenuNode * build_menu_new()