tesseract  3.05.02
word_unigrams.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: word_unigrams.cpp
3  * Description: Implementation of the Word Unigrams Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <math.h>
21 #include <string>
22 #include <vector>
23 #include <algorithm>
24 
25 #include "const.h"
26 #include "cube_utils.h"
27 #include "ndminx.h"
28 #include "word_unigrams.h"
29 
30 namespace tesseract {
31 
33  costs_ = NULL;
34  words_ = NULL;
35  word_cnt_ = 0;
36 }
37 
39  if (words_ != NULL) {
40  if (words_[0] != NULL) {
41  delete []words_[0];
42  }
43 
44  delete []words_;
45  words_ = NULL;
46  }
47 
48  if (costs_ != NULL) {
49  delete []costs_;
50  }
51 }
52 
57 WordUnigrams *WordUnigrams::Create(const string &data_file_path,
58  const string &lang) {
59  string file_name;
60  string str;
61 
62  file_name = data_file_path + lang;
63  file_name += ".cube.word-freq";
64 
65  // load the string into memory
66  if (CubeUtils::ReadFileToString(file_name, &str) == false) {
67  return NULL;
68  }
69 
70  // split into lines
71  vector<string> str_vec;
72  CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
73  if (str_vec.size() < 2) {
74  return NULL;
75  }
76 
77  // allocate memory
78  WordUnigrams *word_unigrams_obj = new WordUnigrams();
79 
80  int full_len = str.length();
81  int word_cnt = str_vec.size() / 2;
82  word_unigrams_obj->words_ = new char*[word_cnt];
83  word_unigrams_obj->costs_ = new int[word_cnt];
84 
85  word_unigrams_obj->words_[0] = new char[full_len];
86 
87  // construct sorted list of words and costs
88  word_unigrams_obj->word_cnt_ = 0;
89  char *char_buff = word_unigrams_obj->words_[0];
90  word_cnt = 0;
91  int max_cost = 0;
92 
93  for (int wrd = 0; wrd < str_vec.size(); wrd += 2) {
94  word_unigrams_obj->words_[word_cnt] = char_buff;
95 
96  strcpy(char_buff, str_vec[wrd].c_str());
97  char_buff += (str_vec[wrd].length() + 1);
98 
99  if (sscanf(str_vec[wrd + 1].c_str(), "%d",
100  word_unigrams_obj->costs_ + word_cnt) != 1) {
101  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
102  "word unigram data.\n");
103  delete word_unigrams_obj;
104  return NULL;
105  }
106  // update max cost
107  max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
108  word_cnt++;
109  }
110  word_unigrams_obj->word_cnt_ = word_cnt;
111 
112  // compute the not-in-list-cost by assuming that a word not in the list
113  // [ahmadab]: This can be computed as follows:
114  // - Given that the distribution of words follow Zipf's law:
115  // (F = K / (rank ^ S)), where s is slightly > 1.0
116  // - Number of words in the list is N
117  // - The mean frequency of a word that did not appear in the list is the
118  // area under the rest of the Zipf's curve divided by 2 (the mean)
119  // - The area would be the bound integral from N to infinity =
120  // (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
121  // - Given that cost = -LOG(prob), the cost of an unlisted word would be
122  // = max_cost + 2*LOG(N)
123  word_unigrams_obj->not_in_list_cost_ = max_cost +
124  (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));
125  // success
126  return word_unigrams_obj;
127 }
128 
135 int WordUnigrams::Cost(const char_32 *key_str32,
136  LangModel *lang_mod,
137  CharSet *char_set) const {
138  if (!key_str32)
139  return 0;
140  // convert string to UTF8 to split into space-separated words
141  string key_str;
142  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
143  vector<string> words;
144  CubeUtils::SplitStringUsing(key_str, " \t", &words);
145 
146  // no words => no cost
147  if (words.empty()) {
148  return 0;
149  }
150 
151  // aggregate the costs of all the words
152  int cost = 0;
153  for (int word_idx = 0; word_idx < words.size(); word_idx++) {
154  // convert each word back to UTF32 for analyzing case and punctuation
155  string_32 str32;
156  CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
157  int len = CubeUtils::StrLen(str32.c_str());
158 
159  // strip all trailing punctuation
160  string clean_str;
161  int clean_len = len;
162  bool trunc = false;
163  while (clean_len > 0 &&
164  lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
165  --clean_len;
166  trunc = true;
167  }
168 
169  // If either the original string was not truncated (no trailing
170  // punctuation) or the entire string was removed (all characters
171  // are trailing punctuation), evaluate original word as is;
172  // otherwise, copy all but the trailing punctuation characters
173  char_32 *clean_str32 = NULL;
174  if (clean_len == 0 || !trunc) {
175  clean_str32 = CubeUtils::StrDup(str32.c_str());
176  } else {
177  clean_str32 = new char_32[clean_len + 1];
178  for (int i = 0; i < clean_len; ++i) {
179  clean_str32[i] = str32[i];
180  }
181  clean_str32[clean_len] = '\0';
182  }
183  ASSERT_HOST(clean_str32 != NULL);
184 
185  string str8;
186  CubeUtils::UTF32ToUTF8(clean_str32, &str8);
187  int word_cost = CostInternal(str8.c_str());
188 
189  // if case invariant, get costs of all-upper-case and all-lower-case
190  // versions and return the min cost
191  if (clean_len >= kMinLengthNumOrCaseInvariant &&
192  CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
193  char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
194  if (lower_32) {
195  string lower_8;
196  CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
197  word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
198  delete [] lower_32;
199  }
200  char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
201  if (upper_32) {
202  string upper_8;
203  CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
204  word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
205  delete [] upper_32;
206  }
207  }
208 
209  if (clean_len >= kMinLengthNumOrCaseInvariant) {
210  // if characters are all numeric, incur 0 word cost
211  bool is_numeric = true;
212  for (int i = 0; i < clean_len; ++i) {
213  if (!lang_mod->IsDigit(clean_str32[i]))
214  is_numeric = false;
215  }
216  if (is_numeric)
217  word_cost = 0;
218  }
219  delete [] clean_str32;
220  cost += word_cost;
221  } // word_idx
222 
223  // return the mean cost
224  return static_cast<int>(cost / static_cast<double>(words.size()));
225 }
226 
230 int WordUnigrams::CostInternal(const char *key_str) const {
231  if (strlen(key_str) == 0)
232  return not_in_list_cost_;
233  int hi = word_cnt_ - 1;
234  int lo = 0;
235  while (lo <= hi) {
236  int current = (hi + lo) / 2;
237  int comp = strcmp(key_str, words_[current]);
238  // a match
239  if (comp == 0) {
240  return costs_[current];
241  }
242  if (comp < 0) {
243  // go lower
244  hi = current - 1;
245  } else {
246  // go higher
247  lo = current + 1;
248  }
249  }
250  return not_in_list_cost_;
251 }
252 } // namespace tesseract
int CostInternal(const char *str) const
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:338
#define MIN(x, y)
Definition: ndminx.h:28
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:189
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:369
static WordUnigrams * Create(const string &data_file_path, const string &lang)
static int Prob2Cost(double prob_val)
Definition: cube_utils.cpp:37
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:272
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:220
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:284
virtual bool IsTrailingPunc(char_32 ch)=0
#define MAX(x, y)
Definition: ndminx.h:24
signed int char_32
Definition: string_32.h:40
basic_string< char_32 > string_32
Definition: string_32.h:41
virtual bool IsDigit(char_32 ch)=0
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:256
int Cost(const char_32 *str32, LangModel *lang_mod, CharSet *char_set) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
static char_32 * StrDup(const char_32 *str)
Definition: cube_utils.cpp:90