tesseract  3.05.02
tfacepp.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tfacepp.cpp (Formerly tface++.c)
3  * Description: C++ side of the C/C++ Tess/Editor interface.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 15:39:23 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #pragma warning(disable:4305) // int/float warnings
23 #pragma warning(disable:4800) // int/bool warnings
24 #endif
25 
26 #include <math.h>
27 
28 #include "blamer.h"
29 #include "errcode.h"
30 #include "ratngs.h"
31 #include "reject.h"
32 #include "tesseractclass.h"
33 #include "werd.h"
34 
35 #define MAX_UNDIVIDED_LENGTH 24
36 
37 
38 
39 /**********************************************************************
40  * recog_word
41  *
42  * Convert the word to tess form and pass it to the tess segmenter.
43  * Convert the output back to editor form.
44  **********************************************************************/
45 namespace tesseract {
47  if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
49  if (classify_debug_level) tprintf("No truth for word - skipping\n");
50  word->tess_failed = true;
51  return;
52  }
55  word->SetupBoxWord();
56  if (word->best_choice->length() != word->box_word->length()) {
57  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
58  "Strlen=%d; #Blobs=%d\n",
59  word->best_choice->debug_string().string(),
60  word->best_choice->length(), word->box_word->length());
61  }
62  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
63  // Check that the ratings matrix size matches the sum of all the
64  // segmentation states.
65  if (!word->StatesAllValid()) {
66  tprintf("Not all words have valid states relative to ratings matrix!!");
67  word->DebugWordChoices(true, NULL);
68  ASSERT_HOST(word->StatesAllValid());
69  }
71  /* Override the permuter type if a straight dictionary check disagrees. */
72  uinT8 perm_type = word->best_choice->permuter();
73  if ((perm_type != SYSTEM_DAWG_PERM) &&
74  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
75  uinT8 real_dict_perm_type = dict_word(*word->best_choice);
76  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
77  (real_dict_perm_type == FREQ_DAWG_PERM) ||
78  (real_dict_perm_type == USER_DAWG_PERM)) &&
80  word->best_choice->unichar_lengths().string()) > 0)) {
81  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
82  }
83  }
85  perm_type != word->best_choice->permuter()) {
86  tprintf("Permuter Type Flipped from %d to %d\n",
87  perm_type, word->best_choice->permuter());
88  }
89  }
90  // Factored out from control.cpp
91  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
92  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
93  static_cast<int>(strspn(word->best_choice->unichar_string().string(),
94  " ")) == word->best_choice->length()) {
95  word->tess_failed = true;
96  word->reject_map.initialise(word->box_word->length());
98  } else {
99  word->tess_failed = false;
100  }
101 }
102 
103 
104 /**********************************************************************
105  * recog_word_recursive
106  *
107  * Words with more than MAX_UNDIVIDED_LENGTH blobs go to split_and_recog_word.
108  * Others go to cc_recog; best_choice is then discarded if too long, or space-padded.
109  **********************************************************************/
111  int word_length = word->chopped_word->NumBlobs(); // no of blobs
112  if (word_length > MAX_UNDIVIDED_LENGTH) {
113  return split_and_recog_word(word);
114  }
115  cc_recog(word);
116  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
117 
118  // Do sanity checks and minor fixes on best_choice.
119  if (word->best_choice->length() > word_length) {
120  word->best_choice->make_bad(); // should never happen
121  tprintf("recog_word: Discarded long string \"%s\""
122  " (%d characters vs %d blobs)\n",
123  word->best_choice->unichar_string().string(),
124  word->best_choice->length(), word_length);
125  tprintf("Word is at:");
126  word->word->bounding_box().print();
127  }
128  if (word->best_choice->length() < word_length) {
129  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
130  while (word->best_choice->length() < word_length) {
131  word->best_choice->append_unichar_id(space_id, 1, 0.0,
132  word->best_choice->certainty());
133  }
134  }
135 }
136 
137 
138 /**********************************************************************
139  * split_and_recog_word
140  *
141  * Split the word into 2 smaller pieces at the largest gap.
142  * Recognize the pieces and stick the results back together.
143  **********************************************************************/
145  // Find the biggest blob gap in the chopped_word.
146  int bestgap = -MAX_INT32;
147  int split_index = 0;
148  for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
149  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
150  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
151  int gap = blob_box.left() - prev_box.right();
152  if (gap > bestgap) {
153  bestgap = gap;
154  split_index = b;
155  }
156  }
157  ASSERT_HOST(split_index > 0);
158 
159  WERD_RES *word2 = NULL;
160  BlamerBundle *orig_bb = NULL;
161  split_word(word, split_index, &word2, &orig_bb);
162 
163  // Recognize the first part of the word.
164  recog_word_recursive(word);
165  // Recognize the second part of the word.
166  recog_word_recursive(word2);
167 
168  join_words(word, word2, orig_bb);
169 }
170 
171 
172 /**********************************************************************
173  * split_word
174  *
175  * Split a given WERD_RES in place into two smaller words for recognition.
176  * split_pt is the index of the first blob to go in the second word.
177  * The underlying word is left alone, only the TWERD (and subsequent data)
178  * are split up. orig_blamer_bundle is set to the original blamer bundle,
179  * and will now be owned by the caller. New blamer bundles are forged for the
180  * two pieces.
181  **********************************************************************/
183  int split_pt,
184  WERD_RES **right_piece,
185  BlamerBundle **orig_blamer_bundle) const {
186  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
187 
188  // Save a copy of the blamer bundle so we can try to reconstruct it below.
189  BlamerBundle *orig_bb =
190  word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
191 
192  WERD_RES *word2 = new WERD_RES(*word);
193 
194  // blow away the copied chopped_word, as we want to work with
195  // the blobs from the input chopped_word so seam_arrays can be merged.
196  TWERD *chopped = word->chopped_word;
197  TWERD *chopped2 = new TWERD;
198  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
199  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
200  chopped2->blobs.push_back(chopped->blobs[i]);
201  }
202  chopped->blobs.truncate(split_pt);
203  word->chopped_word = NULL;
204  delete word2->chopped_word;
205  word2->chopped_word = NULL;
206 
207  const UNICHARSET &unicharset = *word->uch_set;
208  word->ClearResults();
209  word2->ClearResults();
210  word->chopped_word = chopped;
211  word2->chopped_word = chopped2;
214 
215  // Try to adjust the blamer bundle.
216  if (orig_bb != NULL) {
217  // TODO(rays) Looks like a leak to me.
218  // orig_bb should take, rather than copy.
219  word->blamer_bundle = new BlamerBundle();
220  word2->blamer_bundle = new BlamerBundle();
221  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
222  word2->chopped_word->blobs[0]->bounding_box().left(),
224  word->blamer_bundle, word2->blamer_bundle);
225  }
226 
227  *right_piece = word2;
228  *orig_blamer_bundle = orig_bb;
229 }
230 
231 
232 /**********************************************************************
233  * join_words
234  *
235  * The opposite of split_word():
236  * join word2 (including any recognized data / seam array / etc)
237  * onto the right of word and then delete word2.
238  * Also, if orig_bb is provided, stitch it back into word.
239  **********************************************************************/
241  WERD_RES *word2,
242  BlamerBundle *orig_bb) const {
243  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
244  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
245  // Tack the word2 outputs onto the end of the word outputs.
246  word->chopped_word->blobs += word2->chopped_word->blobs;
247  word->rebuild_word->blobs += word2->rebuild_word->blobs;
248  word2->chopped_word->blobs.clear();
249  word2->rebuild_word->blobs.clear();
250  TPOINT split_pt;
251  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
252  split_pt.y = (prev_box.top() + prev_box.bottom() +
253  blob_box.top() + blob_box.bottom()) / 4;
254  // Move the word2 seams onto the end of the word1 seam_array.
255  // Since the seam list is one element short, an empty seam marking the
256  // end of the last blob in the first word is needed first.
257  word->seam_array.push_back(new SEAM(0.0f, split_pt));
258  word->seam_array += word2->seam_array;
259  word2->seam_array.truncate(0);
260  // Fix widths and gaps.
261  word->blob_widths += word2->blob_widths;
262  word->blob_gaps += word2->blob_gaps;
263  // Fix the ratings matrix.
264  int rat1 = word->ratings->dimension();
265  int rat2 = word2->ratings->dimension();
266  word->ratings->AttachOnCorner(word2->ratings);
267  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
268  word->best_state += word2->best_state;
269  // Append the word choices.
270  *word->raw_choice += *word2->raw_choice;
271 
272  // How many alt choices from each should we try to get?
273  const int kAltsPerPiece = 2;
274  // When do we start throwing away extra alt choices?
275  const int kTooManyAltChoices = 100;
276 
277  // Construct the cartesian product of the best_choices of word(1) and word2.
278  WERD_CHOICE_LIST joined_choices;
279  WERD_CHOICE_IT jc_it(&joined_choices);
280  WERD_CHOICE_IT bc1_it(&word->best_choices);
281  WERD_CHOICE_IT bc2_it(&word2->best_choices);
282  int num_word1_choices = word->best_choices.length();
283  int total_joined_choices = num_word1_choices;
284  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
285  // word2 choices, and put them in the joined_choices list. The 1st word2
286  // choice gets added to the original word1 choices in-place after we have
287  // finished with them.
288  int bc2_index = 1;
289  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
290  if (total_joined_choices >= kTooManyAltChoices &&
291  bc2_index > kAltsPerPiece)
292  break;
293  int bc1_index = 0;
294  for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
295  ++bc1_index, bc1_it.forward()) {
296  if (total_joined_choices >= kTooManyAltChoices &&
297  bc1_index > kAltsPerPiece)
298  break;
299  WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
300  *wc += *bc2_it.data();
301  jc_it.add_after_then_move(wc);
302  ++total_joined_choices;
303  }
304  }
305  // Now that we've filled in as many alternates as we want, paste the best
306  // choice for word2 onto the original word alt_choices.
307  bc1_it.move_to_first();
308  bc2_it.move_to_first();
309  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
310  *bc1_it.data() += *bc2_it.data();
311  }
312  bc1_it.move_to_last();
313  bc1_it.add_list_after(&joined_choices);
314 
315  // Restore the pointer to original blamer bundle and combine blamer
316  // information recorded in the splits.
317  if (orig_bb != NULL) {
318  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
320  delete word->blamer_bundle;
321  word->blamer_bundle = orig_bb;
322  }
323  word->SetupBoxWord();
324  word->reject_map.initialise(word->box_word->length());
325  delete word2;
326 }
327 
328 
329 } // namespace tesseract
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:240
void ClearResults()
Definition: pageres.cpp:1141
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:334
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
TWERD * rebuild_word
Definition: pageres.h:244
int dimension() const
Definition: matrix.h:530
const STRING & unichar_string() const
Definition: ratngs.h:525
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void initialise(inT16 length)
Definition: rejctmap.cpp:318
Definition: blobs.h:50
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:182
void set_permuter(uinT8 perm)
Definition: ratngs.h:373
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:471
float certainty() const
Definition: ratngs.h:328
unsigned char uinT8
Definition: host.h:32
void rej_word_tess_failure()
Definition: rejctmap.cpp:425
int length() const
Definition: boxword.h:85
T & back() const
uinT8 permuter() const
Definition: ratngs.h:344
WERD_CHOICE * best_choice
Definition: pageres.h:219
TBOX bounding_box() const
Definition: blobs.cpp:482
inT16 y
Definition: blobs.h:72
BOOL8 tess_failed
Definition: pageres.h:272
int push_back(T object)
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
bool StatesAllValid()
Definition: pageres.cpp:449
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:46
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
inT16 bottom() const
Definition: rect.h:61
tesseract::BoxWord * box_word
Definition: pageres.h:250
void SetupBoxWord()
Definition: pageres.cpp:843
const char * string() const
Definition: strngs.cpp:201
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:441
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:225
GenericVector< int > blob_gaps
Definition: pageres.h:208
MATRIX * ratings
Definition: pageres.h:215
BlamerBundle * blamer_bundle
Definition: pageres.h:230
inT16 left() const
Definition: rect.h:68
TWERD * chopped_word
Definition: pageres.h:201
Definition: blobs.h:395
void truncate(int size)
void print() const
Definition: rect.h:270
int length() const
Definition: ratngs.h:301
inT16 x
Definition: blobs.h:71
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:35
int NumBlobs() const
Definition: blobs.h:425
#define tprintf(...)
Definition: tprintf.h:31
bool wordrec_debug_blamer
Definition: wordrec.h:167
#define MAX_INT32
Definition: host.h:53
const UNICHARSET * uch_set
Definition: pageres.h:192
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:128
inT16 top() const
Definition: rect.h:54
void cc_recog(WERD_RES *word)
Definition: tface.cpp:113
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:144
const STRING & unichar_lengths() const
Definition: ratngs.h:532
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:547
Definition: seam.h:44
bool wordrec_skip_no_truth_words
Definition: wordrec.h:166
GenericVector< int > blob_widths
Definition: pageres.h:205
Definition: rect.h:30
inT16 right() const
Definition: rect.h:75
void reserve(int size)
UNICHARSET unicharset
Definition: ccutil.h:70
WERD * word
Definition: pageres.h:175
GenericVector< int > best_state
Definition: pageres.h:255
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:495
const STRING debug_string() const
Definition: ratngs.h:503
bool empty() const
Definition: genericvector.h:84
#define ASSERT_HOST(x)
Definition: errcode.h:84
TBOX bounding_box() const
Definition: werd.cpp:160
REJMAP reject_map
Definition: pageres.h:271
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
int UNICHAR_ID
Definition: unichar.h:33