33 #define PERFECT_WERDS 999 34 #define MAXSPACING 128 51 BLOCK_RES_IT block_res_it;
52 ROW_RES_IT row_res_it;
53 WERD_RES_IT word_res_it_from;
54 WERD_RES_IT word_res_it_to;
56 WERD_RES_LIST fuzzy_space_words;
58 BOOL8 prevent_null_wd_fixsp;
63 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
64 block_res_it.forward()) {
65 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
66 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
67 row_res_it.forward()) {
68 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
69 while (!word_res_it_from.at_last()) {
70 word_res = word_res_it_from.data();
71 while (!word_res_it_from.at_last() &&
73 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_NON) ||
74 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_SP))) {
76 block_res_it.data()->block);
77 word_res = word_res_it_from.forward();
79 if (monitor != NULL) {
81 monitor->
progress = 90 + 5 * word_index / word_count;
83 (monitor->
cancel != NULL &&
89 if (!word_res_it_from.at_last()) {
90 word_res_it_to = word_res_it_from;
91 prevent_null_wd_fixsp =
95 word_res_it_to.forward();
97 if (monitor != NULL) {
99 monitor->
progress = 90 + 5 * word_index / word_count;
101 (monitor->
cancel != NULL &&
105 while (!word_res_it_to.at_last () &&
106 (word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_NON) ||
107 word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_SP))) {
111 prevent_null_wd_fixsp =
TRUE;
112 word_res = word_res_it_to.forward();
117 prevent_null_wd_fixsp =
TRUE;
118 if (prevent_null_wd_fixsp) {
119 word_res_it_from = word_res_it_to;
121 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
124 row_res_it.data()->row,
125 block_res_it.data()->block);
126 new_length = fuzzy_space_words.length();
127 word_res_it_from.add_list_before(&fuzzy_space_words);
129 !word_res_it_from.at_last() && new_length > 0;
131 word_res_it_from.forward();
138 block_res_it.data()->block);
149 WERD_RES_LIST current_perm;
154 dump_words(best_perm, best_score, 1, improved);
159 while ((best_score !=
PERFECT_WERDS) && !current_perm.empty()) {
162 dump_words(current_perm, current_score, 2, improved);
163 if (current_score > best_score) {
166 best_score = current_score;
172 dump_words(best_perm, best_score, 3, improved);
178 WERD_RES_IT src_it(&src_list);
179 WERD_RES_IT new_it(&new_list);
183 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
184 src_wd = src_it.data();
189 new_it.add_after_then_move(new_wd);
198 WERD_RES_IT word_it(&words);
203 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
204 word = word_it.data();
206 WordData word_data(block, row, word);
240 WERD_RES_IT word_res_it(&word_res_list);
241 inT16 total_score = 0;
242 inT16 word_count = 0;
243 inT16 done_word_count = 0;
248 inT16 prev_word_score = 0;
253 BOOL8 current_word_ok_so_far;
254 STRING punct_chars =
"!\"`',.:;";
260 word = word_res_it.data();
264 total_score += prev_word_score;
269 prev_char_digit =
FALSE;
270 prev_word_done =
FALSE;
278 current_word_ok_so_far =
FALSE;
280 (prev_char_digit && (
286 total_score += prev_word_score;
289 current_word_ok_so_far = word_done;
292 if (current_word_ok_so_far) {
293 prev_word_done =
TRUE;
294 prev_word_score = word_len;
296 prev_word_done =
FALSE;
302 for (i = 0, prev_char_1 =
FALSE; i < word_len; i++) {
304 if (prev_char_1 || (current_char_1 && (i > 0)))
306 prev_char_1 = current_char_1;
312 for (i = 0, offset = 0, prev_char_punct =
FALSE; i < word_len;
316 if (prev_char_punct || (current_char_punct && i > 0))
318 prev_char_punct = current_char_punct;
322 for (i = 0, offset = 0; i < word_len - 1;
331 word_res_it.forward();
332 }
while (word_res_it.data()->part_of_combo);
333 }
while (!word_res_it.at_first());
334 total_score += prev_word_score;
337 if (done_word_count == word_count)
347 for (i = 0, offset = 0; i < char_position;
373 WERD_RES_IT word_it(&words);
374 WERD_RES_IT prev_word_it(&words);
384 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
385 word = word_it.data();
389 gap = box.
left() - prev_right;
393 prev_right = box.
right();
398 word_it.set_to_list(&words);
400 for (; (prev_right == -
MAX_INT16) || !word_it.at_first();
402 word = word_it.data();
406 gap = box.
left() - prev_right;
407 if (gap <= min_gap) {
408 prev_word = prev_word_it.data();
414 copy_word =
new WERD;
415 *copy_word = *(prev_word->
word);
421 prev_word_it.add_before_then_move(combo);
428 delete word_it.extract();
437 prev_word_it = word_it;
440 prev_right = box.
right();
451 WERD_RES_IT word_res_it(&perm);
456 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
457 word_res_it.forward()) {
458 if (!word_res_it.data()->part_of_combo) {
460 word_res_it.data()->best_choice->unichar_string();
469 tprintf(
"EXTRACTED (%d): \"", score);
472 tprintf(
"TESTED (%d): \"", score);
475 tprintf(
"RETURNED (%d): \"", score);
479 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
480 word_res_it.forward()) {
481 if (!word_res_it.data()->part_of_combo) {
483 word_res_it.data()->best_choice->unichar_string().string(),
484 (int)word_res_it.data()->best_choice->permuter());
488 }
else if (improved) {
490 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
491 word_res_it.forward()) {
492 if (!word_res_it.data()->part_of_combo) {
494 word_res_it.data()->best_choice->unichar_string().string(),
495 (int)word_res_it.data()->best_choice->permuter());
538 WERD_RES_LIST sub_word_list;
539 WERD_RES_IT sub_word_list_it(&sub_word_list);
544 word_res = word_res_it.data();
556 tprintf(
"FP fixspace working on \"%s\"\n",
560 sub_word_list_it.add_after_stay_put(word_res_it.extract());
562 new_length = sub_word_list.length();
563 word_res_it.add_list_before(&sub_word_list);
564 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
565 word_res_it.forward();
572 WERD_RES_IT best_perm_it(&best_perm);
573 WERD_RES_LIST current_perm;
574 WERD_RES_IT current_perm_it(¤t_perm);
581 dump_words(best_perm, best_score, 1, improved);
583 old_word_res = best_perm_it.data();
592 while (best_score !=
PERFECT_WERDS && !current_perm.empty()) {
595 dump_words(current_perm, current_score, 2, improved);
596 if (current_score > best_score) {
599 best_score = current_score;
606 dump_words(best_perm, best_score, 3, improved);
616 WERD_RES_IT word_it(&words);
617 WERD_RES_IT worst_word_it;
618 float worst_noise_score = 9999;
619 int worst_blob_index = -1;
624 C_BLOB_IT rej_cblob_it;
625 C_BLOB_LIST new_blob_list;
626 C_BLOB_IT new_blob_it;
627 C_BLOB_IT new_rej_cblob_it;
629 inT16 start_of_noise_blob;
632 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
634 if (blob_index > -1 && worst_noise_score > noise_score) {
635 worst_noise_score = noise_score;
636 worst_blob_index = blob_index;
637 worst_word_it = word_it;
640 if (worst_blob_index < 0) {
647 word_res = worst_word_it.data();
651 new_blob_it.set_to_list(&new_blob_list);
653 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
654 new_blob_it.add_after_then_move(blob_it.extract());
656 start_of_noise_blob = blob_it.data()->bounding_box().left();
657 delete blob_it.extract();
659 new_word =
new WERD(&new_blob_list, word_res->
word);
667 (!rej_cblob_it.empty() &&
668 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
669 rej_cblob_it.forward()) {
670 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
675 worst_word_it.add_before_then_move(new_word_res);
681 float *worst_noise_score) {
682 float noise_score[512];
704 tprintf(
"FP fixspace Noise metrics for \"%s\": ",
711 noise_score[i] = non_noise_limit;
716 tprintf(
"%1.1f ", noise_score[i]);
725 if (noise_score[i] >= non_noise_limit) {
737 if (noise_score[i] >= non_noise_limit) {
746 if (min_noise_blob > max_noise_blob)
749 *worst_noise_score = small_limit;
751 for (i = min_noise_blob; i <= max_noise_blob; i++) {
752 if (noise_score[i] < *worst_noise_score) {
754 *worst_noise_score = noise_score[i];
762 inT16 outline_count = 0;
764 inT16 largest_outline_dimension = 0;
768 box = ol->bounding_box();
770 max_dimension = box.
height();
772 max_dimension = box.
width();
775 if (largest_outline_dimension < max_dimension)
776 largest_outline_dimension = max_dimension;
779 if (outline_count > 5) {
781 largest_outline_dimension *= 2;
788 largest_outline_dimension /= 2;
791 return largest_outline_dimension;
802 tprintf(
"Blob count: %d (word); %d/%d (rebuild word)\n",
808 if (show_map_detail) {
817 tprintf(
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
831 WERD_RES_IT word_it(&word_res_list);
837 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
838 word = word_it.data();
char * numeric_punctuation
CANCEL_FUNC cancel
for errcode use
const int kBlnBaselineOffset
GenericVector< TBLOB * > blobs
const STRING & unichar_string() const
static WERD_RES * deep_copy(const WERD_RES *src)
void set_blanks(uinT8 new_blanks)
UNICHAR_ID unichar_id(int index) const
void full_print(FILE *fp)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void break_noisiest_blob_word(WERD_RES_LIST &words)
int debug_fix_space_level
void join_on(WERD *other)
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
WERD_CHOICE * best_choice
TBOX bounding_box() const
int c_blob_comparator(const void *blob1p, const void *blob2p)
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
inT16 progress
chars in this buffer(0)
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
void * cancel_this
called whenever progress increases
tesseract::BoxWord * box_word
const char * string() const
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
volatile inT8 ocr_alive
true if not last
void fixspace_dbg(WERD_RES *word)
BOOL8 flag(WERD_FLAGS mask) const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
bool deadline_exceeded() const
void transform_to_next_perm(WERD_RES_LIST &words)
void SetupWordPassN(int pass_n, WordData *word)
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
const UNICHARSET * uch_set
BOOL8 contains(const char c) const
WERD_CHOICE * prev_word_best_choice_
int fixsp_non_noise_limit
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
inT16 safe_dict_word(const WERD_RES *werd_res)
bool tessedit_prefer_joined_punct
const STRING & unichar_lengths() const
double fixsp_small_outlines_size
bool get_isdigit(UNICHAR_ID unichar_id) const
float blob_noise_score(TBLOB *blob)
void copy_on(WERD_RES *word_res)
BOOL8 check_debug_pt(WERD_RES *word, int location)
void set_flag(WERD_FLAGS mask, BOOL8 value)
C_BLOB_LIST * cblob_list()
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
TBOX bounding_box() const
char * conflict_set_I_l_1
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
BLOCK_RES_LIST block_res_list
C_BLOB_LIST * rej_cblob_list()