tesseract  3.05.02
applybox.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: applybox.cpp (Formerly applybox.c)
3  * Description: Re segment rows according to box file data
4  * Author: Phil Cheatle
5  * Created: Wed Nov 24 09:11:23 GMT 1993
6  *
7  * (C) Copyright 1993, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include <ctype.h>
25 #include <string.h>
26 #ifdef __UNIX__
27 #include <assert.h>
28 #include <errno.h>
29 #endif
30 #include "allheaders.h"
31 #include "boxread.h"
32 #include "chopper.h"
33 #include "pageres.h"
34 #include "unichar.h"
35 #include "unicharset.h"
36 #include "tesseractclass.h"
37 #include "genericvector.h"
38 
// Largest number of adjacent chopped blobs that FindSegmentation classifies
// together as a single candidate piece.
40 const int kMaxGroupSize = 4;
// Fraction of the median x-height by which a row's x-height may differ from
// the median before PreenXHeights overwrites it with the median.
43 const double kMaxXHeightDeviationFraction = 0.125;
44 
80 namespace tesseract {
81 
82 static void clear_any_old_text(BLOCK_LIST *block_list) {
83  BLOCK_IT block_it(block_list);
84  for (block_it.mark_cycle_pt();
85  !block_it.cycled_list(); block_it.forward()) {
86  ROW_IT row_it(block_it.data()->row_list());
87  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
88  WERD_IT word_it(row_it.data()->word_list());
89  for (word_it.mark_cycle_pt();
90  !word_it.cycled_list(); word_it.forward()) {
91  word_it.data()->set_text("");
92  }
93  }
94  }
95 }
96 
97 // Applies the box file based on the image name fname, and resegments
98 // the words in the block_list (page), with:
99 // blob-mode: one blob per line in the box file, words as input.
100 // word/line-mode: one blob per space-delimited unit after the #, and one word
101 // per line in the box file. (See comment above for box file format.)
102 // If find_segmentation is true, (word/line mode) then the classifier is used
103 // to re-segment words/lines to match the space-delimited truth string for
104 // each box. In this case, the input box may be for a word or even a whole
105 // text line, and the output words will contain multiple blobs corresponding
106 // to the space-delimited input string.
107 // With find_segmentation false, no classifier is needed, but the chopper
108 // can still be used to correctly segment touching characters with the help
109 // of the input boxes.
110 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
111 // from normal classification, ie. with a word, chopped_word, rebuild_word,
112 // seam_array, denorm, box_word, and best_state, but NO best_choice or
113 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
114 // Instead, the correct_text member of WERD_RES is set, and this may be later
115 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
116 // is not required before calling ApplyBoxTraining.
118  bool find_segmentation,
119  BLOCK_LIST *block_list) {
120  GenericVector<TBOX> boxes;
121  GenericVector<STRING> texts, full_texts;
122  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
123  NULL)) {
124  return NULL; // Can't do it.
125  }
126 
127  int box_count = boxes.size();
128  int box_failures = 0;
129  // Add an empty everything to the end.
130  boxes.push_back(TBOX());
131  texts.push_back(STRING());
132  full_texts.push_back(STRING());
133 
134  // In word mode, we use the boxes to make a word for each box, but
135  // in blob mode we use the existing words and maximally chop them first.
136  PAGE_RES* page_res = find_segmentation ?
137  NULL : SetupApplyBoxes(boxes, block_list);
138  clear_any_old_text(block_list);
139 
140  for (int i = 0; i < boxes.size() - 1; i++) {
141  bool foundit = false;
142  if (page_res != NULL) {
143  if (i == 0) {
144  foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
145  full_texts[i].string());
146  } else {
147  foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
148  boxes[i + 1], full_texts[i].string());
149  }
150  } else {
151  foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
152  texts[i].string());
153  }
154  if (!foundit) {
155  box_failures++;
156  ReportFailedBox(i, boxes[i], texts[i].string(),
157  "FAILURE! Couldn't find a matching blob");
158  }
159  }
160 
161  if (page_res == NULL) {
162  // In word/line mode, we now maximally chop all the words and resegment
163  // them with the classifier.
164  page_res = SetupApplyBoxes(boxes, block_list);
165  ReSegmentByClassification(page_res);
166  }
167  if (applybox_debug > 0) {
168  tprintf("APPLY_BOXES:\n");
169  tprintf(" Boxes read from boxfile: %6d\n", box_count);
170  if (box_failures > 0)
171  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
172  }
173  TidyUp(page_res);
174  return page_res;
175 }
176 
177 // Helper computes median xheight in the image.
178 static double MedianXHeight(BLOCK_LIST *block_list) {
179  BLOCK_IT block_it(block_list);
180  STATS xheights(0, block_it.data()->bounding_box().height());
181  for (block_it.mark_cycle_pt();
182  !block_it.cycled_list(); block_it.forward()) {
183  ROW_IT row_it(block_it.data()->row_list());
184  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
185  xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
186  }
187  }
188  return xheights.median();
189 }
190 
193 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
194  double median_xheight = MedianXHeight(block_list);
195  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
196  // Strip all fuzzy space markers to simplify the PAGE_RES.
197  BLOCK_IT b_it(block_list);
198  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
199  BLOCK* block = b_it.data();
200  ROW_IT r_it(block->row_list());
201  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
202  ROW* row = r_it.data();
203  float diff = fabs(row->x_height() - median_xheight);
204  if (diff > max_deviation) {
205  if (applybox_debug) {
206  tprintf("row xheight=%g, but median xheight = %g\n",
207  row->x_height(), median_xheight);
208  }
209  row->set_x_height(static_cast<float>(median_xheight));
210  }
211  }
212  }
213 }
214 
218  BLOCK_LIST *block_list) {
219  PreenXHeights(block_list);
220  // Strip all fuzzy space markers to simplify the PAGE_RES.
221  BLOCK_IT b_it(block_list);
222  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
223  BLOCK* block = b_it.data();
224  ROW_IT r_it(block->row_list());
225  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
226  ROW* row = r_it.data();
227  WERD_IT w_it(row->word_list());
228  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
229  WERD* word = w_it.data();
230  if (word->cblob_list()->empty()) {
231  delete w_it.extract();
232  } else {
233  word->set_flag(W_FUZZY_SP, false);
234  word->set_flag(W_FUZZY_NON, false);
235  }
236  }
237  }
238  }
239  PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
240  PAGE_RES_IT pr_it(page_res);
241  WERD_RES* word_res;
242  while ((word_res = pr_it.word()) != NULL) {
243  MaximallyChopWord(boxes, pr_it.block()->block,
244  pr_it.row()->row, word_res);
245  pr_it.forward();
246  }
247  return page_res;
248 }
249 
254  BLOCK* block, ROW* row,
255  WERD_RES* word_res) {
// Sets word_res up for recognition and then chops its blobs repeatedly with
// chop_one_blob until no further chop is returned, finishing with fake
// classification results for the resulting blobs.
// NOTE(review): the first line of this signature is not present in this
// listing, and the listing jumps from line 256 to line 261 below, so the
// middle arguments of SetupForRecognition are not visible here.
256  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
261  row, block)) {
// Setup failed: just mirror the chopped word into the rebuild word and stop.
262  word_res->CloneChoppedToRebuild();
263  return;
264  }
265  if (chop_debug) {
266  tprintf("Maximally chopping word at:");
267  word_res->word->bounding_box().print();
268  }
269  GenericVector<BLOB_CHOICE*> blob_choices;
270  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
// One placeholder choice per initial blob, each with a distinct rating
// starting at MAX_INT8 and decreasing by 0.125 per blob.
271  float rating = static_cast<float>(MAX_INT8);
272  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
273  // The rating and certainty are not quite arbitrary. Since
274  // select_blob_to_chop uses the worst certainty to choose, they all have
275  // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
276  // in here, and then divide by e each time they are chopped, which
277  // should guarantee a set of unequal values for the whole tree of blobs
278  // produced, however much chopping is required. The chops are thus only
279  // limited by the ability of the chopper to find suitable chop points,
280  // and not by the value of the certainties.
281  BLOB_CHOICE* choice =
282  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
283  blob_choices.push_back(choice);
284  rating -= 0.125f;
285  }
286  const double e = exp(1.0); // The base of natural logs.
287  int blob_number;
288  int right_chop_index = 0;
// NOTE(review): listing line 289 is absent; the closing brace at line 305 has
// no visible opener, so presumably the missing line opens the condition that
// the following comment describes - confirm against the real file.
290  // We only chop if the language is not fixed pitch like CJK.
291  SEAM* seam = NULL;
// Each returned seam splits blob blob_number: the left piece keeps its choice
// with the rating divided by e, and a new choice for the right piece is
// inserted directly after it.
292  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
293  &blob_number)) != NULL) {
294  word_res->InsertSeam(blob_number, seam);
295  BLOB_CHOICE* left_choice = blob_choices[blob_number];
296  rating = left_choice->rating() / e;
297  left_choice->set_rating(rating);
298  left_choice->set_certainty(-rating);
299  // combine confidence w/ serial #
300  BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
301  rating - 0.125f, -rating, -1,
302  0.0f, 0.0f, 0.0f, BCC_FAKE);
303  blob_choices.insert(right_choice, blob_number + 1);
304  }
305  }
306  word_res->CloneChoppedToRebuild();
// Hand the placeholder choices, one per final blob, to the word.
307  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
308 }
309 
321 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
322  int overlap_area = box1.intersection(box2).area();
323  double miss_metric = box1.area()- overlap_area;
324  miss_metric /= box1.area();
325  miss_metric *= box2.area() - overlap_area;
326  miss_metric /= box2.area();
327  return miss_metric;
328 }
329 
340 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
341  const TBOX& box, const TBOX& next_box,
342  const char* correct_text) {
343  if (applybox_debug > 1) {
344  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
345  }
346  PAGE_RES_IT page_res_it(page_res);
347  WERD_RES* word_res;
348  for (word_res = page_res_it.word(); word_res != NULL;
349  word_res = page_res_it.forward()) {
350  if (!word_res->box_word->bounding_box().major_overlap(box))
351  continue;
352  if (applybox_debug > 1) {
353  tprintf("Checking word box:");
354  word_res->box_word->bounding_box().print();
355  }
356  int word_len = word_res->box_word->length();
357  for (int i = 0; i < word_len; ++i) {
358  TBOX char_box = TBOX();
359  int blob_count = 0;
360  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
361  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
362  if (!blob_box.major_overlap(box))
363  break;
364  if (word_res->correct_text[i + blob_count].length() > 0)
365  break; // Blob is claimed already.
366  double current_box_miss_metric = BoxMissMetric(blob_box, box);
367  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
368  if (applybox_debug > 2) {
369  tprintf("Checking blob:");
370  blob_box.print();
371  tprintf("Current miss metric = %g, next = %g\n",
372  current_box_miss_metric, next_box_miss_metric);
373  }
374  if (current_box_miss_metric > next_box_miss_metric)
375  break; // Blob is a better match for next box.
376  char_box += blob_box;
377  }
378  if (blob_count > 0) {
379  if (applybox_debug > 1) {
380  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
381  }
382  if (!char_box.almost_equal(box, 3) &&
383  (box.x_gap(next_box) < -3 ||
384  (prev_box != NULL && prev_box->x_gap(box) < -3))) {
385  return false;
386  }
387  // We refine just the box_word, best_state and correct_text here.
388  // The rebuild_word is made in TidyUp.
389  // blob_count blobs are put together to match the box. Merge the
390  // box_word boxes, save the blob_count in the state and the text.
391  word_res->box_word->MergeBoxes(i, i + blob_count);
392  word_res->best_state[i] = blob_count;
393  word_res->correct_text[i] = correct_text;
394  if (applybox_debug > 2) {
395  tprintf("%d Blobs match: blob box:", blob_count);
396  word_res->box_word->BlobBox(i).print();
397  tprintf("Matches box:");
398  box.print();
399  tprintf("With next box:");
400  next_box.print();
401  }
402  // Eliminated best_state and correct_text entries for the consumed
403  // blobs.
404  for (int j = 1; j < blob_count; ++j) {
405  word_res->best_state.remove(i + 1);
406  word_res->correct_text.remove(i + 1);
407  }
408  // Assume that no box spans multiple source words, so we are done with
409  // this box.
410  if (applybox_debug > 1) {
411  tprintf("Best state = ");
412  for (int j = 0; j < word_res->best_state.size(); ++j) {
413  tprintf("%d ", word_res->best_state[j]);
414  }
415  tprintf("\n");
416  tprintf("Correct text = [[ ");
417  for (int j = 0; j < word_res->correct_text.size(); ++j) {
418  tprintf("%s ", word_res->correct_text[j].string());
419  }
420  tprintf("]]\n");
421  }
422  return true;
423  }
424  }
425  }
426  if (applybox_debug > 0) {
427  tprintf("FAIL!\n");
428  }
429  return false; // Failure.
430 }
431 
438 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
439  const TBOX& box, const TBOX& next_box,
440  const char* correct_text) {
441  if (applybox_debug > 1) {
442  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
443  }
444  WERD* new_word = NULL;
445  BLOCK_IT b_it(block_list);
446  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
447  BLOCK* block = b_it.data();
448  if (!box.major_overlap(block->bounding_box()))
449  continue;
450  ROW_IT r_it(block->row_list());
451  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
452  ROW* row = r_it.data();
453  if (!box.major_overlap(row->bounding_box()))
454  continue;
455  WERD_IT w_it(row->word_list());
456  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
457  WERD* word = w_it.data();
458  if (applybox_debug > 2) {
459  tprintf("Checking word:");
460  word->bounding_box().print();
461  }
462  if (word->text() != NULL && word->text()[0] != '\0')
463  continue; // Ignore words that are already done.
464  if (!box.major_overlap(word->bounding_box()))
465  continue;
466  C_BLOB_IT blob_it(word->cblob_list());
467  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
468  blob_it.forward()) {
469  C_BLOB* blob = blob_it.data();
470  TBOX blob_box = blob->bounding_box();
471  if (!blob_box.major_overlap(box))
472  continue;
473  double current_box_miss_metric = BoxMissMetric(blob_box, box);
474  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
475  if (applybox_debug > 2) {
476  tprintf("Checking blob:");
477  blob_box.print();
478  tprintf("Current miss metric = %g, next = %g\n",
479  current_box_miss_metric, next_box_miss_metric);
480  }
481  if (current_box_miss_metric > next_box_miss_metric)
482  continue; // Blob is a better match for next box.
483  if (applybox_debug > 2) {
484  tprintf("Blob match: blob:");
485  blob_box.print();
486  tprintf("Matches box:");
487  box.print();
488  tprintf("With next box:");
489  next_box.print();
490  }
491  if (new_word == NULL) {
492  // Make a new word with a single blob.
493  new_word = word->shallow_copy();
494  new_word->set_text(correct_text);
495  w_it.add_to_end(new_word);
496  }
497  C_BLOB_IT new_blob_it(new_word->cblob_list());
498  new_blob_it.add_to_end(blob_it.extract());
499  }
500  }
501  }
502  }
503  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
504  return new_word != NULL;
505 }
506 
510  PAGE_RES_IT pr_it(page_res);
511  WERD_RES* word_res;
512  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
513  WERD* word = word_res->word;
514  if (word->text() == NULL || word->text()[0] == '\0')
515  continue; // Ignore words that have no text.
516  // Convert the correct text to a vector of UNICHAR_ID
517  GenericVector<UNICHAR_ID> target_text;
518  if (!ConvertStringToUnichars(word->text(), &target_text)) {
519  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
520  word->text());
521  pr_it.DeleteCurrentWord();
522  continue;
523  }
524  if (!FindSegmentation(target_text, word_res)) {
525  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
526  word->text());
527  pr_it.DeleteCurrentWord();
528  continue;
529  }
530  }
531 }
532 
535 bool Tesseract::ConvertStringToUnichars(const char* utf8,
536  GenericVector<UNICHAR_ID>* class_ids) {
537  for (int step = 0; *utf8 != '\0'; utf8 += step) {
538  const char* next_space = strchr(utf8, ' ');
539  if (next_space == NULL)
540  next_space = utf8 + strlen(utf8);
541  step = next_space - utf8;
542  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
543  if (class_id == INVALID_UNICHAR_ID) {
544  return false;
545  }
546  while (utf8[step] == ' ')
547  ++step;
548  class_ids->push_back(class_id);
549  }
550  return true;
551 }
552 
560  WERD_RES* word_res) {
561  // Classify all required combinations of blobs and save results in choices.
562  int word_length = word_res->box_word->length();
564  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
565  for (int i = 0; i < word_length; ++i) {
566  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
567  BLOB_CHOICE_LIST* match_result = classify_piece(
568  word_res->seam_array, i, i + j - 1, "Applybox",
569  word_res->chopped_word, word_res->blamer_bundle);
570  if (applybox_debug > 2) {
571  tprintf("%d+%d:", i, j);
572  print_ratings_list("Segment:", match_result, unicharset);
573  }
574  choices[i].push_back(match_result);
575  }
576  }
577  // Search the segmentation graph for the target text. Must be an exact
578  // match. Using wildcards makes it difficult to find the correct
579  // segmentation even when it is there.
580  word_res->best_state.clear();
581  GenericVector<int> search_segmentation;
582  float best_rating = 0.0f;
583  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
584  &search_segmentation, &best_rating, &word_res->best_state);
585  for (int i = 0; i < word_length; ++i)
586  choices[i].delete_data_pointers();
587  delete [] choices;
588  if (word_res->best_state.empty()) {
589  // Build the original segmentation and if it is the same length as the
590  // truth, assume it will do.
591  int blob_count = 1;
592  for (int s = 0; s < word_res->seam_array.size(); ++s) {
593  SEAM* seam = word_res->seam_array[s];
594  if (!seam->HasAnySplits()) {
595  word_res->best_state.push_back(blob_count);
596  blob_count = 1;
597  } else {
598  ++blob_count;
599  }
600  }
601  word_res->best_state.push_back(blob_count);
602  if (word_res->best_state.size() != target_text.size()) {
603  word_res->best_state.clear(); // No good. Original segmentation bad size.
604  return false;
605  }
606  }
607  word_res->correct_text.clear();
608  for (int i = 0; i < target_text.size(); ++i) {
609  word_res->correct_text.push_back(
610  STRING(unicharset.id_to_unichar(target_text[i])));
611  }
612  return true;
613 }
614 
630  int choices_pos, int choices_length,
631  const GenericVector<UNICHAR_ID>& target_text,
632  int text_index,
633  float rating, GenericVector<int>* segmentation,
634  float* best_rating,
635  GenericVector<int>* best_segmentation) {
637  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
638  // Rating of matching choice or worst choice if no match.
639  float choice_rating = 0.0f;
640  // Find the corresponding best BLOB_CHOICE.
641  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
642  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
643  choice_it.forward()) {
644  BLOB_CHOICE* choice = choice_it.data();
645  choice_rating = choice->rating();
646  UNICHAR_ID class_id = choice->unichar_id();
647  if (class_id == target_text[text_index]) {
648  break;
649  }
650  // Search ambigs table.
651  if (class_id < table.size() && table[class_id] != NULL) {
652  AmbigSpec_IT spec_it(table[class_id]);
653  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
654  spec_it.forward()) {
655  const AmbigSpec *ambig_spec = spec_it.data();
656  // We'll only do 1-1.
657  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
658  ambig_spec->correct_ngram_id == target_text[text_index])
659  break;
660  }
661  if (!spec_it.cycled_list())
662  break; // Found an ambig.
663  }
664  }
665  if (choice_it.cycled_list())
666  continue; // No match.
667  segmentation->push_back(length);
668  if (choices_pos + length == choices_length &&
669  text_index + 1 == target_text.size()) {
670  // This is a complete match. If the rating is good record a new best.
671  if (applybox_debug > 2) {
672  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
673  rating + choice_rating, *best_rating, segmentation->size(),
674  best_segmentation->size());
675  }
676  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
677  *best_segmentation = *segmentation;
678  *best_rating = rating + choice_rating;
679  }
680  } else if (choices_pos + length < choices_length &&
681  text_index + 1 < target_text.size()) {
682  if (applybox_debug > 3) {
683  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
684  target_text[text_index],
685  unicharset.id_to_unichar(target_text[text_index]),
686  choice_it.data()->unichar_id() == target_text[text_index]
687  ? "Match" : "Ambig",
688  choices_pos, length);
689  }
690  SearchForText(choices, choices_pos + length, choices_length, target_text,
691  text_index + 1, rating + choice_rating, segmentation,
692  best_rating, best_segmentation);
693  if (applybox_debug > 3) {
694  tprintf("End recursion for %d=%s\n", target_text[text_index],
695  unicharset.id_to_unichar(target_text[text_index]));
696  }
697  }
698  segmentation->truncate(segmentation->size() - 1);
699  }
700 }
701 
706 void Tesseract::TidyUp(PAGE_RES* page_res) {
707  int ok_blob_count = 0;
708  int bad_blob_count = 0;
709  int ok_word_count = 0;
710  int unlabelled_words = 0;
711  PAGE_RES_IT pr_it(page_res);
712  WERD_RES* word_res;
713  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
714  int ok_in_word = 0;
715  int blob_count = word_res->correct_text.size();
716  WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
717  word_choice->set_permuter(TOP_CHOICE_PERM);
718  for (int c = 0; c < blob_count; ++c) {
719  if (word_res->correct_text[c].length() > 0) {
720  ++ok_in_word;
721  }
722  // Since we only need a fake word_res->best_choice, the actual
723  // unichar_ids do not matter. Which is fortunate, since TidyUp()
724  // can be called while training Tesseract, at the stage where
725  // unicharset is not meaningful yet.
727  INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
728  }
729  if (ok_in_word > 0) {
730  ok_blob_count += ok_in_word;
731  bad_blob_count += word_res->correct_text.size() - ok_in_word;
732  word_res->LogNewRawChoice(word_choice);
733  word_res->LogNewCookedChoice(1, false, word_choice);
734  } else {
735  ++unlabelled_words;
736  if (applybox_debug > 0) {
737  tprintf("APPLY_BOXES: Unlabelled word at :");
738  word_res->word->bounding_box().print();
739  }
740  pr_it.DeleteCurrentWord();
741  delete word_choice;
742  }
743  }
744  pr_it.restart_page();
745  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
746  // Denormalize back to a BoxWord.
747  word_res->RebuildBestState();
748  word_res->SetupBoxWord();
749  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
750  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
751  }
752  if (applybox_debug > 0) {
753  tprintf(" Found %d good blobs.\n", ok_blob_count);
754  if (bad_blob_count > 0) {
755  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
756  bad_blob_count, ok_word_count);
757  }
758  if (unlabelled_words > 0)
759  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
760  }
761 }
762 
764 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
765  const char *box_ch, const char *err_msg) {
766  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
767  boxfile_lineno + 1, box_ch,
768  box.left(), box.bottom(), box.right(), box.top(), err_msg);
769 }
770 
773  PAGE_RES_IT pr_it(page_res);
774  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
775  word_res = pr_it.forward()) {
776  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
777  word_res->correct_text.size());
778  for (int i = 0; i < word_res->correct_text.size(); ++i) {
779  // The part before the first space is the real ground truth, and the
780  // rest is the bounding box location and page number.
781  GenericVector<STRING> tokens;
782  word_res->correct_text[i].split(' ', &tokens);
783  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
784  choice->append_unichar_id_space_allocated(char_id,
785  word_res->best_state[i],
786  0.0f, 0.0f);
787  }
788  word_res->ClearWordChoices();
789  word_res->LogNewRawChoice(choice);
790  word_res->LogNewCookedChoice(1, false, choice);
791  }
792 }
793 
796 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
797  PAGE_RES_IT pr_it(page_res);
798  int word_count = 0;
799  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
800  word_res = pr_it.forward()) {
801  LearnWord(fontname.string(), word_res);
802  ++word_count;
803  }
804  tprintf("Generated training data for %d words\n", word_count);
805 }
806 
807 
808 } // namespace tesseract
ROW * row
Definition: pageres.h:127
BLOCK * block
Definition: pageres.h:99
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:409
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
TBOX bounding_box() const
Definition: ocrrow.h:85
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:43
int IntCastRounded(double x)
Definition: helpers.h:172
const TBOX & bounding_box() const
Definition: boxword.h:82
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
WERD_RES * word() const
Definition: pageres.h:736
Definition: werd.h:36
void insert(T t, int index)
#define MAX_INT8
Definition: host.h:51
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
void set_rating(float newrat)
Definition: ratngs.h:147
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
void set_permuter(uinT8 perm)
Definition: ratngs.h:373
TBOX bounding_box() const
Definition: stepblob.cpp:250
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
void set_x_height(float new_xheight)
Definition: ocrrow.h:64
void remove(int index)
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
ROW_RES * row() const
Definition: pageres.h:739
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
int length() const
Definition: boxword.h:85
WERD_RES * restart_page()
Definition: pageres.h:683
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
int push_back(T object)
void PreenXHeights(BLOCK_LIST *block_list)
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
inT16 bottom() const
Definition: rect.h:61
tesseract::BoxWord * box_word
Definition: pageres.h:250
void SetupBoxWord()
Definition: pageres.cpp:843
const char * string() const
Definition: strngs.cpp:201
bool HasAnySplits() const
Definition: seam.h:67
const int kMaxGroupSize
Definition: applybox.cpp:40
ROW_RES * next_row() const
Definition: pageres.h:748
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:154
void RebuildBestState()
Definition: pageres.cpp:800
Definition: werd.h:60
float rating() const
Definition: ratngs.h:79
Definition: werd.h:35
void CorrectClassifyWords(PAGE_RES *page_res)
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
void set_certainty(float newrat)
Definition: ratngs.h:150
WERD_RES * forward()
Definition: pageres.h:716
BlamerBundle * blamer_bundle
Definition: pageres.h:230
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
WERD * shallow_copy()
Definition: werd.cpp:352
void TidyUp(PAGE_RES *page_res)
inT16 left() const
Definition: rect.h:68
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
TWERD * chopped_word
Definition: pageres.h:201
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
Dict & getDict()
Definition: classify.h:65
void truncate(int size)
void print() const
Definition: rect.h:270
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
int x_gap(const TBOX &box) const
Definition: rect.h:217
ROW_RES * prev_row() const
Definition: pageres.h:730
inT32 area() const
Definition: rect.h:118
BLOCK_RES * block() const
Definition: pageres.h:742
void MergeBoxes(int start, int end)
Definition: boxword.cpp:134
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int NumBlobs() const
Definition: blobs.h:425
GenericVector< STRING > correct_text
Definition: pageres.h:259
void ClearWordChoices()
Definition: pageres.cpp:1174
#define tprintf(...)
Definition: tprintf.h:31
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872
Definition: ocrrow.h:32
Definition: strngs.h:44
int size() const
Definition: genericvector.h:72
Definition: ocrblock.h:30
const UNICHARSET * uch_set
Definition: pageres.h:192
Pix * BestPix() const
const TBOX & BlobBox(int index) const
Definition: boxword.h:86
inT16 top() const
Definition: rect.h:54
const char * text() const
Definition: werd.h:125
int length() const
Definition: genericvector.h:79
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
Definition: seam.h:44
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:143
bool classify_bln_numeric_mode
Definition: classify.h:500
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void ReSegmentByClassification(PAGE_RES *page_res)
WERD_LIST * word_list()
Definition: ocrrow.h:52
float x_height() const
Definition: ocrrow.h:61
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
UNICHARSET unicharset
Definition: ccutil.h:70
WERD * word
Definition: pageres.h:175
void DeleteCurrentWord()
Definition: pageres.cpp:1450
GenericVector< int > best_state
Definition: pageres.h:255
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void CloneChoppedToRebuild()
Definition: pageres.cpp:828
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
bool empty() const
Definition: genericvector.h:84
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:373
Definition: statistc.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:50
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
TBOX bounding_box() const
Definition: werd.cpp:160
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:450
void set_text(const char *new_text)
Definition: werd.h:126
int UNICHAR_ID
Definition: unichar.h:33