tesseract  3.05.02
tesseract::CubeUtils Class Reference

#include <cube_utils.h>

Public Member Functions

 CubeUtils ()
 
 ~CubeUtils ()
 

Static Public Member Functions

static int Prob2Cost (double prob_val)
 
static double Cost2Prob (int cost)
 
static int StrLen (const char_32 *str)
 
static int StrCmp (const char_32 *str1, const char_32 *str2)
 
static char_32 * StrDup (const char_32 *str)
 
static CharSamp * CharSampleFromPix (Pix *pix, int left, int top, int wid, int hgt)
 
static Pix * PixFromCharSample (CharSamp *char_samp)
 
static bool ReadFileToString (const string &file_name, string *str)
 
static void SplitStringUsing (const string &str, const string &delims, vector< string > *str_vec)
 
static void UTF8ToUTF32 (const char *utf8_str, string_32 *str32)
 
static void UTF32ToUTF8 (const char_32 *utf32_str, string *str)
 
static bool IsCaseInvariant (const char_32 *str32, CharSet *char_set)
 
static char_32 * ToLower (const char_32 *str32, CharSet *char_set)
 
static char_32 * ToUpper (const char_32 *str32, CharSet *char_set)
 

Detailed Description

Definition at line 35 of file cube_utils.h.

Constructor & Destructor Documentation

◆ CubeUtils()

tesseract::CubeUtils::CubeUtils ( )

Definition at line 28 of file cube_utils.cpp.

28  {
29 }

◆ ~CubeUtils()

tesseract::CubeUtils::~CubeUtils ( )

Definition at line 31 of file cube_utils.cpp.

31  {
32 }

Member Function Documentation

◆ CharSampleFromPix()

CharSamp * tesseract::CubeUtils::CharSampleFromPix ( Pix *  pix,
int  left,
int  top,
int  wid,
int  hgt 
)
static

creates a char samp from a specified portion of the image

Definition at line 101 of file cube_utils.cpp.

102  {
103  // get the raw img data from the image
104  unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
105  if (temp_buff == NULL) {
106  return NULL;
107  }
108 
109  // create a char samp from temp buffer
110  CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
111 
112  // clean up temp buffer
113  delete []temp_buff;
114  return char_samp;
115 }
static CharSamp * FromRawData(int left, int top, int wid, int hgt, unsigned char *data)
Definition: char_samp.cpp:266

◆ Cost2Prob()

double tesseract::CubeUtils::Cost2Prob ( int  cost)
static

converts a cost to probability

Definition at line 47 of file cube_utils.cpp.

47  {
48  return exp(-cost / PROB2COST_SCALE);
49 }
#define PROB2COST_SCALE
Definition: cube_const.h:24

◆ IsCaseInvariant()

bool tesseract::CubeUtils::IsCaseInvariant ( const char_32 *  str32,
CharSet *  char_set 
)
static

Definition at line 284 of file cube_utils.cpp.

284  {
285  bool all_one_case = true;
286  bool capitalized;
287  bool prev_upper;
288  bool prev_lower;
289  bool first_upper;
290  bool first_lower;
291  bool cur_upper;
292  bool cur_lower;
293 
294  string str8;
295  if (!char_set) {
296  // If cube char_set is missing, use C-locale-dependent functions
297  // on UTF8 characters to determine case properties.
298  first_upper = isupper(str32[0]);
299  first_lower = islower(str32[0]);
300  if (first_upper)
301  capitalized = true;
302  prev_upper = first_upper;
303  prev_lower = first_lower;
304  for (int c = 1; str32[c] != 0; ++c) {
305  cur_upper = isupper(str32[c]);
306  cur_lower = islower(str32[c]);
307  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
308  all_one_case = false;
309  if (cur_upper)
310  capitalized = false;
311  prev_upper = cur_upper;
312  prev_lower = cur_lower;
313  }
314  } else {
315  UNICHARSET *unicharset = char_set->InternalUnicharset();
316  // Use UNICHARSET functions to determine case properties
317  first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
318  first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
319  if (first_upper)
320  capitalized = true;
321  prev_upper = first_upper;
322  prev_lower = first_lower;
323 
324  for (int c = 1; c < StrLen(str32); ++c) {
325  cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
326  cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
327  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
328  all_one_case = false;
329  if (cur_upper)
330  capitalized = false;
331  prev_upper = cur_upper;
332  prev_lower = cur_lower;
333  }
334  }
335  return all_one_case || capitalized;
336 }
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456

◆ PixFromCharSample()

Pix * tesseract::CubeUtils::PixFromCharSample ( CharSamp *  char_samp)
static

create a B/W image from a char_sample

Definition at line 120 of file cube_utils.cpp.

120  {
121  // parameter check
122  if (char_samp == NULL) {
123  return NULL;
124  }
125 
126  // get the raw data
127  int stride = char_samp->Stride();
128  int wid = char_samp->Width();
129  int hgt = char_samp->Height();
130 
131  Pix *pix = pixCreate(wid, hgt, 1);
132  if (pix == NULL) {
133  return NULL;
134  }
135 
136  // copy the contents
137  unsigned char *line = char_samp->RawData();
138  for (int y = 0; y < hgt ; y++, line += stride) {
139  for (int x = 0; x < wid; x++) {
140  if (line[x] != 0) {
141  pixSetPixel(pix, x, y, 0);
142  } else {
143  pixSetPixel(pix, x, y, 255);
144  }
145  }
146  }
147 
148  return pix;
149 }

◆ Prob2Cost()

int tesseract::CubeUtils::Prob2Cost ( double  prob_val)
static

converts a probability to a cost (the negative log of the probability, scaled by PROB2COST_SCALE)

Definition at line 37 of file cube_utils.cpp.

37  {
38  if (prob_val < MIN_PROB) {
39  return MIN_PROB_COST;
40  }
41  return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
42 }
#define PROB2COST_SCALE
Definition: cube_const.h:24
#define MIN_PROB_COST
Definition: cube_const.h:26
#define MIN_PROB
Definition: cube_const.h:28

◆ ReadFileToString()

bool tesseract::CubeUtils::ReadFileToString ( const string &  file_name,
string *  str 
)
static

read file contents to a string

Definition at line 189 of file cube_utils.cpp.

189  {
190  str->clear();
191  FILE *fp = fopen(file_name.c_str(), "rb");
192  if (fp == NULL) {
193  return false;
194  }
195 
196  // get the size of the file
197  fseek(fp, 0, SEEK_END);
198  int file_size = ftell(fp);
199  if (file_size < 1) {
200  fclose(fp);
201  return false;
202  }
203  // adjust string size
204  str->reserve(file_size);
205  // read the contents
206  rewind(fp);
207  char *buff = new char[file_size];
208  int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
209  if (read_bytes == file_size) {
210  str->append(buff, file_size);
211  }
212  delete []buff;
213  fclose(fp);
214  return (read_bytes == file_size);
215 }

◆ SplitStringUsing()

void tesseract::CubeUtils::SplitStringUsing ( const string &  str,
const string &  delims,
vector< string > *  str_vec 
)
static

splits a string into a vector of substrings based on the specified delimiter characters

Definition at line 220 of file cube_utils.cpp.

222  {
223  // Optimize the common case where delims is a single character.
224  if (delims[0] != '\0' && delims[1] == '\0') {
225  char c = delims[0];
226  const char* p = str.data();
227  const char* end = p + str.size();
228  while (p != end) {
229  if (*p == c) {
230  ++p;
231  } else {
232  const char* start = p;
233  while (++p != end && *p != c);
234  str_vec->push_back(string(start, p - start));
235  }
236  }
237  return;
238  }
239 
240  string::size_type begin_index, end_index;
241  begin_index = str.find_first_not_of(delims);
242  while (begin_index != string::npos) {
243  end_index = str.find_first_of(delims, begin_index);
244  if (end_index == string::npos) {
245  str_vec->push_back(str.substr(begin_index));
246  return;
247  }
248  str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
249  begin_index = str.find_first_not_of(delims, end_index);
250  }
251 }

◆ StrCmp()

int tesseract::CubeUtils::StrCmp ( const char_32 *  str1,
const char_32 *  str2 
)
static

compares two char_32 strings

Definition at line 66 of file cube_utils.cpp.

66  {
67  const char_32 *pch1 = str1;
68  const char_32 *pch2 = str2;
69 
70  for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
71  if ((*pch1) != (*pch2)) {
72  return (*pch1) - (*pch2);
73  }
74  }
75 
76  if ((*pch1) == 0) {
77  if ((*pch2) == 0) {
78  return 0;
79  } else {
80  return -1;
81  }
82  } else {
83  return 1;
84  }
85 }
signed int char_32
Definition: string_32.h:40

◆ StrDup()

char_32 * tesseract::CubeUtils::StrDup ( const char_32 *  str32)
static

Duplicates a 32-bit char buffer

Definition at line 90 of file cube_utils.cpp.

90  {
91  int len = StrLen(str32);
92  char_32 *new_str = new char_32[len + 1];
93  memcpy(new_str, str32, len * sizeof(*str32));
94  new_str[len] = 0;
95  return new_str;
96 }
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
signed int char_32
Definition: string_32.h:40

◆ StrLen()

int tesseract::CubeUtils::StrLen ( const char_32 *  char_32_ptr)
static

computes the length of a NULL terminated char_32 string

Definition at line 54 of file cube_utils.cpp.

54  {
55  if (char_32_ptr == NULL) {
56  return 0;
57  }
58  int len = -1;
59  while (char_32_ptr[++len]);
60  return len;
61 }

◆ ToLower()

char_32 * tesseract::CubeUtils::ToLower ( const char_32 *  str32,
CharSet *  char_set 
)
static

Definition at line 338 of file cube_utils.cpp.

338  {
339  if (!char_set) {
340  return NULL;
341  }
342  UNICHARSET *unicharset = char_set->InternalUnicharset();
343  int len = StrLen(str32);
344  char_32 *lower = new char_32[len + 1];
345  for (int i = 0; i < len; ++i) {
346  char_32 ch = str32[i];
347  if (ch == INVALID_UNICHAR_ID) {
348  delete [] lower;
349  return NULL;
350  }
351  // convert upper-case characters to lower-case
352  if (unicharset->get_isupper(char_set->ClassID(ch))) {
353  UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
354  const char_32 *str32_lower = char_set->ClassString(uid_lower);
355  // expect lower-case version of character to be a single character
356  if (!str32_lower || StrLen(str32_lower) != 1) {
357  delete [] lower;
358  return NULL;
359  }
360  lower[i] = str32_lower[0];
361  } else {
362  lower[i] = ch;
363  }
364  }
365  lower[len] = 0;
366  return lower;
367 }
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
signed int char_32
Definition: string_32.h:40
int UNICHAR_ID
Definition: unichar.h:33

◆ ToUpper()

char_32 * tesseract::CubeUtils::ToUpper ( const char_32 *  str32,
CharSet *  char_set 
)
static

Definition at line 369 of file cube_utils.cpp.

369  {
370  if (!char_set) {
371  return NULL;
372  }
373  UNICHARSET *unicharset = char_set->InternalUnicharset();
374  int len = StrLen(str32);
375  char_32 *upper = new char_32[len + 1];
376  for (int i = 0; i < len; ++i) {
377  char_32 ch = str32[i];
378  if (ch == INVALID_UNICHAR_ID) {
379  delete [] upper;
380  return NULL;
381  }
382  // convert lower-case characters to upper-case
383  if (unicharset->get_islower(char_set->ClassID(ch))) {
384  UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
385  const char_32 *str32_upper = char_set->ClassString(uid_upper);
386  // expect upper-case version of character to be a single character
387  if (!str32_upper || StrLen(str32_upper) != 1) {
388  delete [] upper;
389  return NULL;
390  }
391  upper[i] = str32_upper[0];
392  } else {
393  upper[i] = ch;
394  }
395  }
396  upper[len] = 0;
397  return upper;
398 }
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
signed int char_32
Definition: string_32.h:40
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
int UNICHAR_ID
Definition: unichar.h:33

◆ UTF32ToUTF8()

void tesseract::CubeUtils::UTF32ToUTF8 ( const char_32 *  utf32_str,
string *  str 
)
static

UTF-32 to UTF-8 conversion functions

Definition at line 272 of file cube_utils.cpp.

272  {
273  str->clear();
274  for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) {
275  UNICHAR uni_ch((*ch_32));
276  char *utf8 = uni_ch.utf8_str();
277  if (utf8 != NULL) {
278  (*str) += utf8;
279  delete []utf8;
280  }
281  }
282 }
signed int char_32
Definition: string_32.h:40

◆ UTF8ToUTF32()

void tesseract::CubeUtils::UTF8ToUTF32 ( const char *  utf8_str,
string_32 *  str32 
)
static

UTF-8 to UTF-32 conversion functions

Definition at line 256 of file cube_utils.cpp.

256  {
257  str32->clear();
258  int len = strlen(utf8_str);
259  int step = 0;
260  for (int ch = 0; ch < len; ch += step) {
261  step = UNICHAR::utf8_step(utf8_str + ch);
262  if (step > 0) {
263  UNICHAR uni_ch(utf8_str + ch, step);
264  (*str32) += uni_ch.first_uni();
265  }
266  }
267 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

The documentation for this class was generated from the following files: