tesseract  3.05.02
cube_utils.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: cube_utils.cpp
3  * Description: Implementation of the Cube Utilities Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <math.h>
21 #include <string>
22 #include <vector>
23 #include "cube_utils.h"
24 #include "char_set.h"
25 #include "unichar.h"
26 
27 namespace tesseract {
29 }
30 
32 }
33 
37 int CubeUtils::Prob2Cost(double prob_val) {
38  if (prob_val < MIN_PROB) {
39  return MIN_PROB_COST;
40  }
41  return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
42 }
43 
47 double CubeUtils::Cost2Prob(int cost) {
48  return exp(-cost / PROB2COST_SCALE);
49 }
50 
54 int CubeUtils::StrLen(const char_32 *char_32_ptr) {
55  if (char_32_ptr == NULL) {
56  return 0;
57  }
58  int len = -1;
59  while (char_32_ptr[++len]);
60  return len;
61 }
62 
66 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
67  const char_32 *pch1 = str1;
68  const char_32 *pch2 = str2;
69 
70  for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
71  if ((*pch1) != (*pch2)) {
72  return (*pch1) - (*pch2);
73  }
74  }
75 
76  if ((*pch1) == 0) {
77  if ((*pch2) == 0) {
78  return 0;
79  } else {
80  return -1;
81  }
82  } else {
83  return 1;
84  }
85 }
86 
91  int len = StrLen(str32);
92  char_32 *new_str = new char_32[len + 1];
93  memcpy(new_str, str32, len * sizeof(*str32));
94  new_str[len] = 0;
95  return new_str;
96 }
97 
101 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
102  int wid, int hgt) {
103  // get the raw img data from the image
104  unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
105  if (temp_buff == NULL) {
106  return NULL;
107  }
108 
109  // create a char samp from temp buffer
110  CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
111 
112  // clean up temp buffer
113  delete []temp_buff;
114  return char_samp;
115 }
116 
121  // parameter check
122  if (char_samp == NULL) {
123  return NULL;
124  }
125 
126  // get the raw data
127  int stride = char_samp->Stride();
128  int wid = char_samp->Width();
129  int hgt = char_samp->Height();
130 
131  Pix *pix = pixCreate(wid, hgt, 1);
132  if (pix == NULL) {
133  return NULL;
134  }
135 
136  // copy the contents
137  unsigned char *line = char_samp->RawData();
138  for (int y = 0; y < hgt ; y++, line += stride) {
139  for (int x = 0; x < wid; x++) {
140  if (line[x] != 0) {
141  pixSetPixel(pix, x, y, 0);
142  } else {
143  pixSetPixel(pix, x, y, 255);
144  }
145  }
146  }
147 
148  return pix;
149 }
150 
154 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
155  int wid, int hgt) {
156  // skip invalid dimensions
157  if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
158  (left + wid) > pix->w || (top + hgt) > pix->h ||
159  pix->d != 1) {
160  return NULL;
161  }
162 
163  // copy the char img to a temp buffer
164  unsigned char *temp_buff = new unsigned char[wid * hgt];
165  l_int32 w;
166  l_int32 h;
167  l_int32 d;
168  l_int32 wpl;
169  l_uint32 *line;
170  l_uint32 *data;
171 
172  pixGetDimensions(pix, &w, &h, &d);
173  wpl = pixGetWpl(pix);
174  data = pixGetData(pix);
175  line = data + (top * wpl);
176 
177  for (int y = 0, off = 0; y < hgt ; y++) {
178  for (int x = 0; x < wid; x++, off++) {
179  temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255;
180  }
181  line += wpl;
182  }
183  return temp_buff;
184 }
185 
189 bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
190  str->clear();
191  FILE *fp = fopen(file_name.c_str(), "rb");
192  if (fp == NULL) {
193  return false;
194  }
195 
196  // get the size of the size
197  fseek(fp, 0, SEEK_END);
198  int file_size = ftell(fp);
199  if (file_size < 1) {
200  fclose(fp);
201  return false;
202  }
203  // adjust string size
204  str->reserve(file_size);
205  // read the contents
206  rewind(fp);
207  char *buff = new char[file_size];
208  int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
209  if (read_bytes == file_size) {
210  str->append(buff, file_size);
211  }
212  delete []buff;
213  fclose(fp);
214  return (read_bytes == file_size);
215 }
216 
220 void CubeUtils::SplitStringUsing(const string &str,
221  const string &delims,
222  vector<string> *str_vec) {
223  // Optimize the common case where delims is a single character.
224  if (delims[0] != '\0' && delims[1] == '\0') {
225  char c = delims[0];
226  const char* p = str.data();
227  const char* end = p + str.size();
228  while (p != end) {
229  if (*p == c) {
230  ++p;
231  } else {
232  const char* start = p;
233  while (++p != end && *p != c);
234  str_vec->push_back(string(start, p - start));
235  }
236  }
237  return;
238  }
239 
240  string::size_type begin_index, end_index;
241  begin_index = str.find_first_not_of(delims);
242  while (begin_index != string::npos) {
243  end_index = str.find_first_of(delims, begin_index);
244  if (end_index == string::npos) {
245  str_vec->push_back(str.substr(begin_index));
246  return;
247  }
248  str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
249  begin_index = str.find_first_not_of(delims, end_index);
250  }
251 }
252 
256 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
257  str32->clear();
258  int len = strlen(utf8_str);
259  int step = 0;
260  for (int ch = 0; ch < len; ch += step) {
261  step = UNICHAR::utf8_step(utf8_str + ch);
262  if (step > 0) {
263  UNICHAR uni_ch(utf8_str + ch, step);
264  (*str32) += uni_ch.first_uni();
265  }
266  }
267 }
268 
272 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
273  str->clear();
274  for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) {
275  UNICHAR uni_ch((*ch_32));
276  char *utf8 = uni_ch.utf8_str();
277  if (utf8 != NULL) {
278  (*str) += utf8;
279  delete []utf8;
280  }
281  }
282 }
283 
284 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
285  bool all_one_case = true;
286  bool capitalized;
287  bool prev_upper;
288  bool prev_lower;
289  bool first_upper;
290  bool first_lower;
291  bool cur_upper;
292  bool cur_lower;
293 
294  string str8;
295  if (!char_set) {
296  // If cube char_set is missing, use C-locale-dependent functions
297  // on UTF8 characters to determine case properties.
298  first_upper = isupper(str32[0]);
299  first_lower = islower(str32[0]);
300  if (first_upper)
301  capitalized = true;
302  prev_upper = first_upper;
303  prev_lower = first_lower;
304  for (int c = 1; str32[c] != 0; ++c) {
305  cur_upper = isupper(str32[c]);
306  cur_lower = islower(str32[c]);
307  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
308  all_one_case = false;
309  if (cur_upper)
310  capitalized = false;
311  prev_upper = cur_upper;
312  prev_lower = cur_lower;
313  }
314  } else {
315  UNICHARSET *unicharset = char_set->InternalUnicharset();
316  // Use UNICHARSET functions to determine case properties
317  first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
318  first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
319  if (first_upper)
320  capitalized = true;
321  prev_upper = first_upper;
322  prev_lower = first_lower;
323 
324  for (int c = 1; c < StrLen(str32); ++c) {
325  cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
326  cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
327  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
328  all_one_case = false;
329  if (cur_upper)
330  capitalized = false;
331  prev_upper = cur_upper;
332  prev_lower = cur_lower;
333  }
334  }
335  return all_one_case || capitalized;
336 }
337 
338 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
339  if (!char_set) {
340  return NULL;
341  }
342  UNICHARSET *unicharset = char_set->InternalUnicharset();
343  int len = StrLen(str32);
344  char_32 *lower = new char_32[len + 1];
345  for (int i = 0; i < len; ++i) {
346  char_32 ch = str32[i];
347  if (ch == INVALID_UNICHAR_ID) {
348  delete [] lower;
349  return NULL;
350  }
351  // convert upper-case characters to lower-case
352  if (unicharset->get_isupper(char_set->ClassID(ch))) {
353  UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
354  const char_32 *str32_lower = char_set->ClassString(uid_lower);
355  // expect lower-case version of character to be a single character
356  if (!str32_lower || StrLen(str32_lower) != 1) {
357  delete [] lower;
358  return NULL;
359  }
360  lower[i] = str32_lower[0];
361  } else {
362  lower[i] = ch;
363  }
364  }
365  lower[len] = 0;
366  return lower;
367 }
368 
369 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
370  if (!char_set) {
371  return NULL;
372  }
373  UNICHARSET *unicharset = char_set->InternalUnicharset();
374  int len = StrLen(str32);
375  char_32 *upper = new char_32[len + 1];
376  for (int i = 0; i < len; ++i) {
377  char_32 ch = str32[i];
378  if (ch == INVALID_UNICHAR_ID) {
379  delete [] upper;
380  return NULL;
381  }
382  // convert lower-case characters to upper-case
383  if (unicharset->get_islower(char_set->ClassID(ch))) {
384  UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
385  const char_32 *str32_upper = char_set->ClassString(uid_upper);
386  // expect upper-case version of character to be a single character
387  if (!str32_upper || StrLen(str32_upper) != 1) {
388  delete [] upper;
389  return NULL;
390  }
391  upper[i] = str32_upper[0];
392  } else {
393  upper[i] = ch;
394  }
395  }
396  upper[len] = 0;
397  return upper;
398 }
399 } // namespace tesseract
unsigned short Height() const
Definition: bmp_8.h:50
int first_uni() const
Definition: unichar.cpp:97
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:338
#define PROB2COST_SCALE
Definition: cube_const.h:24
unsigned short Width() const
Definition: bmp_8.h:48
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:189
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:369
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
static double Cost2Prob(int cost)
Definition: cube_utils.cpp:47
static int Prob2Cost(double prob_val)
Definition: cube_utils.cpp:37
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:272
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:220
static CharSamp * FromRawData(int left, int top, int wid, int hgt, unsigned char *data)
Definition: char_samp.cpp:266
unsigned char * RawData() const
Definition: bmp_8.h:51
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:284
signed int char_32
Definition: string_32.h:40
#define MIN_PROB_COST
Definition: cube_const.h:26
#define MIN_PROB
Definition: cube_const.h:28
basic_string< char_32 > string_32
Definition: string_32.h:41
unsigned short Stride() const
Definition: bmp_8.h:49
static CharSamp * CharSampleFromPix(Pix *pix, int left, int top, int wid, int hgt)
Definition: cube_utils.cpp:101
UNICHARSET * InternalUnicharset()
Definition: char_set.h:121
static int StrCmp(const char_32 *str1, const char_32 *str2)
Definition: cube_utils.cpp:66
char * utf8_str() const
Definition: unichar.cpp:125
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:256
int ClassID(const char_32 *str) const
Definition: char_set.h:54
static Pix * PixFromCharSample(CharSamp *char_samp)
Definition: cube_utils.cpp:120
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
const char_32 * ClassString(int class_id) const
Definition: char_set.h:104
int UNICHAR_ID
Definition: unichar.h:33
static char_32 * StrDup(const char_32 *str)
Definition: cube_utils.cpp:90