tesseract  3.05.02
tesseract_cube_combiner.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tesseract_cube_combiner.cpp
3  * Description: Implementation of the Tesseract & Cube results combiner Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // The TesseractCubeCombiner class provides the functionality of combining
21 // the recognition results of Tesseract and Cube at the word level
22 
23 #include <algorithm>
24 #include <wctype.h>
25 
27 
28 #include "cube_object.h"
29 #include "cube_reco_context.h"
30 #include "cube_utils.h"
31 #include "neural_net.h"
32 #include "tesseractclass.h"
33 #include "word_altlist.h"
34 
35 namespace tesseract {
36 
38  cube_cntxt_ = cube_cntxt;
39  combiner_net_ = NULL;
40 }
41 
43  if (combiner_net_ != NULL) {
44  delete combiner_net_;
45  combiner_net_ = NULL;
46  }
47 }
48 
50  ASSERT_HOST(cube_cntxt_);
51  // Compute the path of the combiner net
52  string data_path;
53  cube_cntxt_->GetDataFilePath(&data_path);
54  string net_file_name = data_path + cube_cntxt_->Lang() +
55  ".tesseract_cube.nn";
56 
57  // Return false if file does not exist
58  FILE *fp = fopen(net_file_name.c_str(), "rb");
59  if (fp == NULL)
60  return false;
61  else
62  fclose(fp);
63 
64  // Load and validate net
65  combiner_net_ = NeuralNet::FromFile(net_file_name);
66  if (combiner_net_ == NULL) {
67  tprintf("Could not read combiner net file %s", net_file_name.c_str());
68  return false;
69  } else if (combiner_net_->out_cnt() != 2) {
70  tprintf("Invalid combiner net file %s! Output count != 2\n",
71  net_file_name.c_str());
72  delete combiner_net_;
73  combiner_net_ = NULL;
74  return false;
75  }
76  return true;
77 }
78 
79 // Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally
80 // strips punc and/or normalizes case and then converts back
81 string TesseractCubeCombiner::NormalizeString(const string &str,
82  bool remove_punc,
83  bool norm_case) {
84  // convert to UTF32
85  string_32 str32;
86  CubeUtils::UTF8ToUTF32(str.c_str(), &str32);
87  // strip punc and normalize
88  string_32 new_str32;
89  for (int idx = 0; idx < str32.length(); idx++) {
90  // if no punc removal is required or not a punctuation character
91  if (!remove_punc || iswpunct(str32[idx]) == 0) {
92  char_32 norm_char = str32[idx];
93  // normalize case if required
94  if (norm_case && iswalpha(norm_char)) {
95  norm_char = towlower(norm_char);
96  }
97  new_str32.push_back(norm_char);
98  }
99  }
100  // convert back to UTF8
101  string new_str;
102  CubeUtils::UTF32ToUTF8(new_str32.c_str(), &new_str);
103  return new_str;
104 }
105 
106 // Compares 2 strings optionally ignoring punctuation
107 int TesseractCubeCombiner::CompareStrings(const string &str1,
108  const string &str2,
109  bool ignore_punc,
110  bool ignore_case) {
111  if (!ignore_punc && !ignore_case) {
112  return str1.compare(str2);
113  }
114  string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case);
115  string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case);
116  return norm_str1.compare(norm_str2);
117 }
118 
119 // Check if a string is a valid Tess dict word or not
120 bool TesseractCubeCombiner::ValidWord(const string &str) {
121  return (cube_cntxt_->TesseractObject()->getDict().valid_word(str.c_str())
122  > 0);
123 }
124 
125 // Public method for computing the combiner features. The agreement
126 // output parameter will be true if both answers are identical,
127 // and false otherwise.
129  int tess_confidence,
130  CubeObject *cube_obj,
131  WordAltList *cube_alt_list,
132  vector<double> *features,
133  bool *agreement) {
134  features->clear();
135  *agreement = false;
136  if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0)
137  return false;
138 
139  // Get Cube's best string; return false if empty
140  char_32 *cube_best_str32 = cube_alt_list->Alt(0);
141  if (cube_best_str32 == NULL || CubeUtils::StrLen(cube_best_str32) < 1)
142  return false;
143  string cube_best_str;
144  int cube_best_cost = cube_alt_list->AltCost(0);
145  int cube_best_bigram_cost = 0;
146  bool cube_best_bigram_cost_valid = true;
147  if (cube_cntxt_->Bigrams())
148  cube_best_bigram_cost = cube_cntxt_->Bigrams()->
149  Cost(cube_best_str32, cube_cntxt_->CharacterSet());
150  else
151  cube_best_bigram_cost_valid = false;
152  CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str);
153 
154  // Get Tesseract's UTF32 string
155  string_32 tess_str32;
156  CubeUtils::UTF8ToUTF32(tess_str.c_str(), &tess_str32);
157 
158  // Compute agreement flag
159  *agreement = (tess_str.compare(cube_best_str) == 0);
160 
161  // Get Cube's second best string; if empty, return false
162  char_32 *cube_next_best_str32;
163  string cube_next_best_str;
164  int cube_next_best_cost = WORST_COST;
165  if (cube_alt_list->AltCount() > 1) {
166  cube_next_best_str32 = cube_alt_list->Alt(1);
167  if (cube_next_best_str32 == NULL ||
168  CubeUtils::StrLen(cube_next_best_str32) == 0) {
169  return false;
170  }
171  cube_next_best_cost = cube_alt_list->AltCost(1);
172  CubeUtils::UTF32ToUTF8(cube_next_best_str32, &cube_next_best_str);
173  }
174  // Rank of Tesseract's top result in Cube's alternate list
175  int tess_rank = 0;
176  for (tess_rank = 0; tess_rank < cube_alt_list->AltCount(); tess_rank++) {
177  string alt_str;
178  CubeUtils::UTF32ToUTF8(cube_alt_list->Alt(tess_rank), &alt_str);
179  if (alt_str == tess_str)
180  break;
181  }
182 
183  // Cube's cost for tesseract's result. Note that this modifies the
184  // state of cube_obj, including its alternate list by calling RecognizeWord()
185  int tess_cost = cube_obj->WordCost(tess_str.c_str());
186  // Cube's bigram cost of Tesseract's string
187  int tess_bigram_cost = 0;
188  int tess_bigram_cost_valid = true;
189  if (cube_cntxt_->Bigrams())
190  tess_bigram_cost = cube_cntxt_->Bigrams()->
191  Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet());
192  else
193  tess_bigram_cost_valid = false;
194 
195  // Tesseract confidence
196  features->push_back(tess_confidence);
197  // Cube cost of Tesseract string
198  features->push_back(tess_cost);
199  // Cube Rank of Tesseract string
200  features->push_back(tess_rank);
201  // length of Tesseract OCR string
202  features->push_back(tess_str.length());
203  // Tesseract OCR string in dictionary
204  features->push_back(ValidWord(tess_str));
205  if (tess_bigram_cost_valid) {
206  // bigram cost of Tesseract string
207  features->push_back(tess_bigram_cost);
208  }
209  // Cube tess_cost of Cube best string
210  features->push_back(cube_best_cost);
211  // Cube tess_cost of Cube next best string
212  features->push_back(cube_next_best_cost);
213  // length of Cube string
214  features->push_back(cube_best_str.length());
215  // Cube string in dictionary
216  features->push_back(ValidWord(cube_best_str));
217  if (cube_best_bigram_cost_valid) {
218  // bigram cost of Cube string
219  features->push_back(cube_best_bigram_cost);
220  }
221  // case-insensitive string comparison, including punctuation
222  int compare_nocase_punc = CompareStrings(cube_best_str,
223  tess_str, false, true);
224  features->push_back(compare_nocase_punc == 0);
225  // case-sensitive string comparison, ignoring punctuation
226  int compare_case_nopunc = CompareStrings(cube_best_str,
227  tess_str, true, false);
228  features->push_back(compare_case_nopunc == 0);
229  // case-insensitive string comparison, ignoring punctuation
230  int compare_nocase_nopunc = CompareStrings(cube_best_str,
231  tess_str, true, true);
232  features->push_back(compare_nocase_nopunc == 0);
233  return true;
234 }
235 
236 // The CubeObject parameter is used for 2 purposes: 1) to retrieve
237 // cube's alt list, and 2) to compute cube's word cost for the
238 // tesseract result. The call to CubeObject::WordCost() modifies
239 // the object's alternate list, so previous state will be lost.
241  CubeObject *cube_obj) {
242  // If no combiner is loaded or the cube object is undefined,
243  // tesseract wins with probability 1.0
244  if (combiner_net_ == NULL || cube_obj == NULL) {
245  tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
246  "Cube objects not initialized; defaulting to Tesseract\n");
247  return 1.0;
248  }
249 
250  // Retrieve the alternate list from the CubeObject's current state.
251  // If the alt list empty, tesseract wins with probability 1.0
252  WordAltList *cube_alt_list = cube_obj->AlternateList();
253  if (cube_alt_list == NULL)
254  cube_alt_list = cube_obj->RecognizeWord();
255  if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
256  tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
257  "Cube returned no results; defaulting to Tesseract\n");
258  return 1.0;
259  }
260  return CombineResults(tess_res, cube_obj, cube_alt_list);
261 }
262 
263 // The alt_list parameter is expected to have been extracted from the
264 // CubeObject that recognized the word to be combined. The cube_obj
265 // parameter passed may be either same instance or a separate instance to
266 // be used only by the combiner. In both cases, its alternate
267 // list will be modified by an internal call to RecognizeWord().
269  CubeObject *cube_obj,
270  WordAltList *cube_alt_list) {
271  // If no combiner is loaded or the cube object is undefined, or the
272  // alt list is empty, tesseract wins with probability 1.0
273  if (combiner_net_ == NULL || cube_obj == NULL ||
274  cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
275  tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
276  "Cube result cannot be retrieved; defaulting to Tesseract\n");
277  return 1.0;
278  }
279 
280  // Tesseract result string, tesseract confidence, and cost of
281  // tesseract result according to cube
282  string tess_str = tess_res->best_choice->unichar_string().string();
283  // Map certainty [-20.0, 0.0] to confidence [0, 100]
284  int tess_confidence = MIN(100, MAX(1, static_cast<int>(
285  100 + (5 * tess_res->best_choice->certainty()))));
286 
287  // Compute the combiner features. If feature computation fails or
288  // answers are identical, tesseract wins with probability 1.0
289  vector<double> features;
290  bool agreement;
291  bool combiner_success = ComputeCombinerFeatures(tess_str, tess_confidence,
292  cube_obj, cube_alt_list,
293  &features, &agreement);
294  if (!combiner_success || agreement)
295  return 1.0;
296 
297  // Classify combiner feature vector and return output (probability
298  // of tesseract class).
299  double net_out[2];
300  if (!combiner_net_->FeedForward(&features[0], net_out))
301  return 1.0;
302  return net_out[1];
303 }
304 }
static NeuralNet * FromFile(const string file_name)
Definition: neural_net.cpp:210
const STRING & unichar_string() const
Definition: ratngs.h:525
bool FeedForward(const Type *inputs, Type *outputs)
Definition: neural_net.cpp:88
TesseractCubeCombiner(CubeRecoContext *cube_cntxt)
#define MIN(x, y)
Definition: ndminx.h:28
float certainty() const
Definition: ratngs.h:328
int WordCost(const char *str)
char_32 * Alt(int alt_idx)
Definition: word_altlist.h:41
WERD_CHOICE * best_choice
Definition: pageres.h:219
const char * string() const
Definition: strngs.cpp:201
#define WORST_COST
Definition: cube_const.h:30
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:272
tesseract::Tesseract * TesseractObject() const
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
WordAltList * RecognizeWord(LangModel *lang_mod=NULL)
const string & Lang() const
int out_cnt() const
Definition: neural_net.h:50
Dict & getDict()
Definition: classify.h:65
CharSet * CharacterSet() const
#define MAX(x, y)
Definition: ndminx.h:24
signed int char_32
Definition: string_32.h:40
#define tprintf(...)
Definition: tprintf.h:31
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:730
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj)
basic_string< char_32 > string_32
Definition: string_32.h:41
bool GetDataFilePath(string *path) const
int AltCost(int alt_idx) const
Definition: altlist.h:41
bool ValidWord(const std::string &str)
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:256
int AltCount() const
Definition: altlist.h:39
bool ComputeCombinerFeatures(const std::string &tess_res, int tess_confidence, CubeObject *cube_obj, WordAltList *cube_alt_list, std::vector< double > *features, bool *agreement)
#define ASSERT_HOST(x)
Definition: errcode.h:84
CharBigrams * Bigrams() const
WordAltList * AlternateList() const
Definition: cube_object.h:119