44 #include "allheaders.h" 59 "Take segmentation and labeling from box file",
61 BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
62 "Conversion of word/line box file to char box file",
65 "Generate training data from boxed chars", this->params()),
67 "Generate more boxes from boxed chars", this->params()),
69 "Dump intermediate images made during page segmentation",
75 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 76 " 5=line, 6=word, 7=char" 77 " (Values from PageSegMode enum in publictypes.h)",
80 "Which OCR engine(s) to run (Tesseract, Cube, both)." 81 " Defaults to loading and running only Tesseract" 82 " (no Cube,no combiner)." 83 " Values from OcrEngineMode enum in tesseractclass.h)",
86 "Blacklist of chars not to recognize", this->params()),
88 "Whitelist of chars to recognize", this->params()),
90 "List of chars to override tessedit_char_blacklist",
93 "Perform training for ambiguities", this->params()),
96 "Whether to use the top-line splitting process for Devanagari " 97 "documents while performing page-segmentation.",
101 "Whether to use the top-line splitting process for Devanagari " 102 "documents while performing ocr.",
105 "Write all parameters to the given file.", this->params()),
107 "Generate and print debug" 108 " information for adaption",
110 INT_MEMBER(bidi_debug, 0,
"Debug level for BiDi", this->params()),
111 INT_MEMBER(applybox_debug, 1,
"Debug level", this->params()),
112 INT_MEMBER(applybox_page, 0,
"Page number to apply boxes from",
115 "Exposure value follows" 116 " this pattern in the image filename. The name of the image" 117 " files are expected to be in the form" 118 " [lang].[fontname].exp[num].tif",
120 BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
121 "Learn both character fragments (as is done in the" 122 " special low exposure mode) as well as unfragmented" 127 " is assumed to contain ngrams. Only learn the ngrams" 128 " whose outlines overlap horizontally.",
130 BOOL_MEMBER(tessedit_display_outwords, false,
"Draw output words",
132 BOOL_MEMBER(tessedit_dump_choices, false,
"Dump char choices",
134 BOOL_MEMBER(tessedit_timing_debug, false,
"Print timing stats",
137 "Try to improve fuzzy spaces", this->params()),
139 "Don't bother with word plausibility", this->params()),
140 BOOL_MEMBER(tessedit_fix_hyphens, true,
"Crunch double hyphens?",
142 BOOL_MEMBER(tessedit_redo_xheight, true,
"Check/Correct x-height",
145 "Add words to the document dictionary", this->params()),
146 BOOL_MEMBER(tessedit_debug_fonts, false,
"Output font info per char",
148 BOOL_MEMBER(tessedit_debug_block_rejection, false,
"Block and Row stats",
150 BOOL_MEMBER(tessedit_enable_bigram_correction, true,
151 "Enable correction based on the word bigram dictionary.",
153 BOOL_MEMBER(tessedit_enable_dict_correction, false,
154 "Enable single word correction based on the dictionary.",
157 "Amount of debug output for bigram correction.",
160 "Remove and conditionally reassign small outlines when they" 161 " confuse layout analysis, determining diacritics vs noise",
163 INT_MEMBER(debug_noise_removal, 0,
"Debug reassignment of small outlines",
169 "Hingepoint for base char certainty", this->params()),
173 "Hingepoint for disjoint certainty", this->params()),
177 "Threshold for new punc char certainty", this->params()),
180 "Scaling on certainty diff from Hingepoint",
182 INT_MEMBER(noise_maxperblob, 8,
"Max diacritics to apply to a blob",
184 INT_MEMBER(noise_maxperword, 16,
"Max diacritics to apply to a word",
186 INT_MEMBER(debug_x_ht_level, 0,
"Reestimate debug", this->params()),
187 BOOL_MEMBER(debug_acceptable_wds, false,
"Dump word pass/fail chk",
189 STRING_MEMBER(chs_leading_punct,
"('`\"",
"Leading punctuation",
191 STRING_MEMBER(chs_trailing_punct1,
").,;:?!",
"1st Trailing punctuation",
193 STRING_MEMBER(chs_trailing_punct2,
")'`\"",
"2nd Trailing punctuation",
196 "good_quality_doc lte rejection limit", this->params()),
198 "good_quality_doc gte good blobs limit", this->params()),
200 "good_quality_doc lte outline error limit", this->params()),
202 "good_quality_doc gte good char limit", this->params()),
203 INT_MEMBER(quality_min_initial_alphas_reqd, 2,
"alphas in a good word",
206 "Adaptation decision algorithm for tess", this->params()),
208 "Do minimal rejection on pass 1 output", this->params()),
209 BOOL_MEMBER(tessedit_test_adaption, false,
"Test adaption criteria",
211 BOOL_MEMBER(tessedit_matcher_log, false,
"Log matcher activity",
214 "Adaptation decision algorithm for tess", this->params()),
215 BOOL_MEMBER(test_pt, false,
"Test for point", this->params()),
216 double_MEMBER(test_pt_x, 99999.99,
"xcoord", this->params()),
217 double_MEMBER(test_pt_y, 99999.99,
"ycoord", this->params()),
218 INT_MEMBER(paragraph_debug_level, 0,
"Print paragraph debug info.",
221 "Run paragraph detection on the post-text-recognition " 224 INT_MEMBER(cube_debug_level, 0,
"Print cube debug info.", this->params()),
225 STRING_MEMBER(outlines_odd,
"%| ",
"Non standard number of outlines",
227 STRING_MEMBER(outlines_2,
"ij!?%\":;",
"Non standard number of outlines",
230 "Allow outline errs in unrejection?", this->params()),
232 "Reduce rejection on good docs", this->params()),
233 BOOL_MEMBER(tessedit_use_reject_spaces, true,
"Reject spaces?",
236 "%rej allowed before rej whole doc", this->params()),
238 "%rej allowed before rej whole block", this->params()),
240 "%rej allowed before rej whole row", this->params()),
242 "Number of row rejects in whole word rejects" 243 "which prevents whole row rejection",
245 BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
246 "Only rej partially rejected words in block rejection",
248 BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
249 "Only rej partially rejected words in row rejection",
252 "Use word segmentation quality metric", this->params()),
254 "Use word segmentation quality metric", this->params()),
256 "Only preserve wds longer than this", this->params()),
258 "Apply row rejection to good docs", this->params()),
260 "rej good doc wd if more than this fraction rejected",
263 "Reject all bad quality wds", this->params()),
264 BOOL_MEMBER(tessedit_debug_doc_rejection, false,
"Page stats",
267 "Output data to debug file", this->params()),
268 BOOL_MEMBER(bland_unrej, false,
"unrej potential with no chekcs",
271 "good_quality_doc gte good char limit", this->params()),
273 "Mark v.bad words for tilde crunch", this->params()),
274 BOOL_MEMBER(hocr_font_info, false,
"Add font info to hocr output",
276 BOOL_MEMBER(crunch_early_merge_tess_fails, true,
"Before word crunch?",
278 BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
279 "Take out ~^ early?", this->params()),
280 double_MEMBER(crunch_terrible_rating, 80.0,
"crunch rating lt this",
282 BOOL_MEMBER(crunch_terrible_garbage, true,
"As it says", this->params()),
284 "crunch garbage cert lt this", this->params()),
286 "crunch garbage rating lt this", this->params()),
287 double_MEMBER(crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this",
289 double_MEMBER(crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this",
291 BOOL_MEMBER(crunch_pot_garbage, true,
"POTENTIAL crunch garbage",
293 double_MEMBER(crunch_del_rating, 60,
"POTENTIAL crunch rating lt this",
295 double_MEMBER(crunch_del_cert, -10.0,
"POTENTIAL crunch cert lt this",
297 double_MEMBER(crunch_del_min_ht, 0.7,
"Del if word ht lt xht x this",
299 double_MEMBER(crunch_del_max_ht, 3.0,
"Del if word ht gt xht x this",
302 "Del if word width lt xht x this", this->params()),
304 "Del if word gt xht x this above bl", this->params()),
306 "Del if word gt xht x this below bl", this->params()),
307 double_MEMBER(crunch_small_outlines_size, 0.6,
"Small if lt xht x this",
309 INT_MEMBER(crunch_rating_max, 10,
"For adj length in rating per ch",
312 "How many potential indicators needed", this->params()),
313 BOOL_MEMBER(crunch_leave_ok_strings, true,
"Don't touch sensible strings",
315 BOOL_MEMBER(crunch_accept_ok, true,
"Use acceptability in okstring",
318 "Don't pot crunch sensible strings", this->params()),
319 BOOL_MEMBER(crunch_include_numerals, false,
"Fiddle alpha figures",
322 "Don't crunch words with long lower case strings",
325 "Don't crunch words with long lower case strings",
328 "Crunch words with long repetitions", this->params()),
329 INT_MEMBER(crunch_debug, 0,
"As it says", this->params()),
331 "How many non-noise blbs either side?", this->params()),
332 double_MEMBER(fixsp_small_outlines_size, 0.28,
"Small if lt xht x this",
335 "Reward punctation joins", this->params()),
336 INT_MEMBER(fixsp_done_mode, 1,
"What constitues done for spacing",
338 INT_MEMBER(debug_fix_space_level, 0,
"Contextual fixspace debug",
341 "Punct. chs expected WITHIN numbers", this->params()),
343 "Max allowed deviation of blob top outside of font data",
346 "Min change in xht before actually trying it", this->params()),
348 "Debug level for sub & superscript fixer", this->params()),
350 superscript_worse_certainty, 2.0,
351 "How many times worse " 352 "certainty does a superscript position glyph need to be for " 353 "us to try classifying it as a char with a different " 357 superscript_bettered_certainty, 0.97,
359 "badness do we think sufficient to choose a superscript " 360 "over what we'd thought. For example, a value of 0.6 means " 361 "we want to reduce badness of certainty by at least 40%",
364 "A superscript scaled down more than this is unbelievably " 365 "small. For example, 0.3 means we expect the font size to " 366 "be no smaller than 30% of the text line font size.",
369 "Maximum top of a character measured as a multiple of " 370 "x-height above the baseline for us to reconsider whether " 374 "Minimum bottom of a character measured as a multiple of " 375 "x-height above the baseline for us to reconsider whether " 376 "it's a superscript.",
378 BOOL_MEMBER(tessedit_write_block_separators, false,
379 "Write block separators in output", this->params()),
380 BOOL_MEMBER(tessedit_write_rep_codes, false,
"Write repetition char code",
382 BOOL_MEMBER(tessedit_write_unlv, false,
"Write .unlv output file",
384 BOOL_MEMBER(tessedit_create_txt, false,
"Write .txt output file",
386 BOOL_MEMBER(tessedit_create_hocr, false,
"Write .html hOCR output file",
388 BOOL_MEMBER(tessedit_create_tsv, false,
"Write .tsv output file",
390 BOOL_MEMBER(tessedit_create_pdf, false,
"Write .pdf output file",
392 BOOL_MEMBER(textonly_pdf, false,
"Create PDF with only one invisible text layer",
395 "Output char for unidentified blobs", this->params()),
396 INT_MEMBER(suspect_level, 99,
"Suspect marker level", this->params()),
398 "Min suspect level for rejecting spaces", this->params()),
400 "Don't suspect dict wds longer than this", this->params()),
401 BOOL_MEMBER(suspect_constrain_1Il, false,
"UNLV keep 1Il chars rejected",
404 "Don't touch bad rating limit", this->params()),
405 double_MEMBER(suspect_accept_rating, -999.9,
"Accept good rating limit",
408 "Only reject tess failures", this->params()),
409 BOOL_MEMBER(tessedit_zero_rejection, false,
"Don't reject ANYTHING",
412 "Make output have exactly one word per WERD", this->params()),
414 "Don't reject ANYTHING AT ALL", this->params()),
416 "Force all rep chars the same", this->params()),
417 INT_MEMBER(tessedit_reject_mode, 0,
"Rejection algorithm",
419 BOOL_MEMBER(tessedit_rejection_debug, false,
"Adaption debug",
421 BOOL_MEMBER(tessedit_flip_0O, true,
"Contextual 0O O0 flips",
424 "Aspect ratio dot/hyphen test", this->params()),
426 "Aspect ratio dot/hyphen test", this->params()),
428 "Use DOC dawg in 11l conf. detector", this->params()),
429 BOOL_MEMBER(rej_1Il_use_dict_word, false,
"Use dictword test",
431 BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
"Don't double check",
433 BOOL_MEMBER(rej_use_tess_accepted, true,
"Individual rejection control",
435 BOOL_MEMBER(rej_use_tess_blanks, true,
"Individual rejection control",
437 BOOL_MEMBER(rej_use_good_perm, true,
"Individual rejection control",
439 BOOL_MEMBER(rej_use_sensible_wd, false,
"Extend permuter check",
441 BOOL_MEMBER(rej_alphas_in_number_perm, false,
"Extend permuter check",
444 "if >this fract", this->params()),
445 INT_MEMBER(tessedit_image_border, 2,
"Rej blbs near image edge limit",
448 "Allow NN to unrej", this->params()),
449 STRING_MEMBER(conflict_set_I_l_1,
"Il1[]",
"Il1 conflict set",
451 INT_MEMBER(min_sane_x_ht_pixels, 8,
"Reject any x-ht lt or eq than this",
453 BOOL_MEMBER(tessedit_create_boxfile, false,
"Output text with boxes",
457 " , else specifc page to process",
460 "Capture the image from the IPE", this->params()),
461 BOOL_MEMBER(interactive_display_mode, false,
"Run interactively?",
463 STRING_MEMBER(file_type,
".tif",
"Filename extension", this->params()),
464 BOOL_MEMBER(tessedit_override_permuter, true,
"According to dict_word",
468 " TessdataManager functions.",
471 "List of languages to load with this one", this->params()),
472 BOOL_MEMBER(tessedit_use_primary_params_model, false,
473 "In multilingual mode use params model of the" 477 "Min acceptable orientation margin", this->params()),
478 BOOL_MEMBER(textord_tabfind_show_vlines, false,
"Debug line finding",
483 "Allow feature extractors to see the original outline",
486 "Only initialize with the config file. Useful if the " 487 "instance is not going to be used for OCR but say only " 488 "for layout analysis.",
490 BOOL_MEMBER(textord_equation_detect, false,
"Turn on equation detector",
493 "Enable vertical detection", this->params()),
494 BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
495 "Force using vertical text page mode", this->params()),
497 textord_tabfind_vertical_text_ratio, 0.5,
498 "Fraction of textlines deemed vertical to use vertical page " 502 textord_tabfind_aligned_gap_fraction, 0.75,
503 "Fraction of height used as a minimum gap for aligned blobs.",
505 INT_MEMBER(tessedit_parallelize, 0,
"Run in parallel where possible",
508 "Preserve multiple interword spaces", this->params()),
510 "Include page separator string in output text after each " 514 "Page separator (default is form feed control character)",
526 BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
527 "find horizontal lines such as headers in vertical page mode",
529 INT_MEMBER(tessedit_ok_mode, 5,
"Acceptance decision algorithm",
532 "Load fixed length dawgs" 533 " (e.g. for non-space delimited languages)",
535 INT_MEMBER(segment_debug, 0,
"Debug the whole segmentation process",
537 BOOL_MEMBER(permute_debug, 0,
"Debug char permutation process",
540 "Multiplying factor of" 541 " current best rate to prune other hypotheses",
544 "Turn on word script consistency permuter", this->params()),
546 "incorporate segmentation cost in word rating?",
549 "Score multipler for script consistency within a word. " 550 "Being a 'reward' factor, it should be <= 1. " 551 "Smaller value implies bigger reward.",
554 "Turn on fixed-length phrasebook search permuter",
557 "Turn on character type (property) consistency permuter",
560 "Score multipler for char type consistency within a word. ",
563 "Score multipler for ngram permuter's best choice" 564 " (only used in the Han script path).",
567 "Activate character-level n-gram-based permuter",
569 BOOL_MEMBER(permute_only_top, false,
"Run only the top choice permuter",
571 INT_MEMBER(language_model_fixed_length_choices_depth, 3,
572 "Depth of blob choice lists to explore" 573 " when fixed length dawgs are on",
576 "use new state cost heuristics for segmentation state" 580 "base factor for adding segmentation cost into word rating." 581 "It's a multiplying factor, the larger the value above 1, " 582 "the bigger the effect of segmentation cost.",
585 "weight associated with char rating in combined cost of" 589 "weight associated with width evidence in combined cost of" 593 "weight associated with seam cut in combined cost of state",
596 "max char width-to-height ratio allowed in segmentation",
599 "Enable new segmentation search path.", this->params()),
601 "Maximum character width-to-height ratio for" 602 " fixed-pitch fonts",
606 backup_config_file_(NULL),
611 pix_thresholds_(NULL),
612 source_resolution_(0),
614 right_to_left_(false),
619 most_recently_used_(this),
621 #ifndef NO_CUBE_BUILD
623 tess_cube_combiner_(NULL),
630 pixDestroy(&pix_original_);
632 sub_langs_.delete_data_pointers();
633 #ifndef NO_CUBE_BUILD 635 if (cube_cntxt_ != NULL) {
639 if (tess_cube_combiner_ != NULL) {
640 delete tess_cube_combiner_;
641 tess_cube_combiner_ = NULL;
647 pixDestroy(&pix_binary_);
648 pixDestroy(&cube_binary_);
649 pixDestroy(&pix_grey_);
650 pixDestroy(&pix_thresholds_);
651 pixDestroy(&scaled_color_);
652 deskew_ =
FCOORD(1.0f, 0.0f);
653 reskew_ =
FCOORD(1.0f, 0.0f);
656 for (
int i = 0; i < sub_langs_.size(); ++i)
657 sub_langs_[i]->
Clear();
661 equ_detect_ = detector;
668 for (
int i = 0; i < sub_langs_.size(); ++i) {
669 sub_langs_[i]->ResetAdaptiveClassifierInternal();
676 for (
int i = 0; i < sub_langs_.size(); ++i) {
677 sub_langs_[i]->getDict().ResetDocumentDictionary();
687 for (
int i = 0; i < sub_langs_.size(); ++i) {
688 sub_langs_[i]->unicharset.set_black_and_whitelist(
698 pixDestroy(&cube_binary_);
704 for (
int i = 0; i < sub_langs_.size(); ++i) {
707 static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
708 if (pageseg_strategy > max_pageseg_strategy)
709 max_pageseg_strategy = pageseg_strategy;
711 pixDestroy(&sub_langs_[i]->cube_binary_);
712 sub_langs_[i]->cube_binary_ = pixClone(
pix_binary());
713 pixDestroy(&sub_langs_[i]->pix_binary_);
714 sub_langs_[i]->pix_binary_ = pixClone(
pix_binary());
720 if (splitter_.
Split(
true)) {
722 pixDestroy(&pix_binary_);
738 for (
int i = 0; i < sub_langs_.size(); ++i) {
741 static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
742 if (ocr_strategy > max_ocr_strategy)
743 max_ocr_strategy = ocr_strategy;
749 bool split_for_ocr = splitter_.
Split(
false);
752 pixDestroy(&pix_binary_);
753 pix_binary_ = pixClone(splitter_.
orig_pix());
758 BLOCK block(
"",
TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
759 pixGetHeight(pix_binary_));
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
void ResetDocumentDictionary()
void set_orig_pix(Pix *pix)
void set_ocr_split_strategy(SplitStrategy strategy)
void set_pageseg_split_strategy(SplitStrategy strategy)
char * tessedit_char_whitelist
#define INT_INIT_MEMBER(name, val, comment, vec)
#define STRING_MEMBER(name, val, comment, vec)
char * tessedit_char_unblacklist
void ResetAdaptiveClassifierInternal()
void set_segmentation_block_list(BLOCK_LIST *block_list)
#define BOOL_INIT_MEMBER(name, val, comment, vec)
void set_use_cjk_fp_model(bool flag)
void SetEquationDetect(EquationDetect *detector)
void SetBlackAndWhitelist()
#define BOOL_MEMBER(name, val, comment, vec)
#define INT_MEMBER(name, val, comment, vec)
void extract_edges(Pix *pix, BLOCK *block)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
void ResetDocumentDictionary()
#define double_MEMBER(name, val, comment, vec)
void ResetAdaptiveClassifier()
void SetLangTesseract(Tesseract *lang_tesseract)
Assume a single uniform block of text. (Default.)
int pageseg_devanagari_split_strategy
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
int ocr_devanagari_split_strategy
C_BLOB_LIST * blob_list()
get blobs
bool HasDifferentSplitStrategies() const
char * tessedit_char_blacklist
bool Split(bool split_for_pageseg)
bool textord_use_cjk_fp_model