tesseract  3.05.02
tesseract::CharBigrams Class Reference

#include <char_bigrams.h>

Public Member Functions

 CharBigrams ()
 
 ~CharBigrams ()
 
int Cost (const char_32 *str, CharSet *char_set) const
 

Static Public Member Functions

static CharBigrams * Create (const string &data_file_path, const string &lang)
 

Protected Member Functions

int PairCost (char_32 ch1, char_32 ch2) const
 
int MeanCostWithSpaces (const char_32 *char_32_ptr) const
 

Detailed Description

Definition at line 56 of file char_bigrams.h.

Constructor & Destructor Documentation

◆ CharBigrams()

tesseract::CharBigrams::CharBigrams ( )

Definition at line 32 of file char_bigrams.cpp.

32  {
33  memset(&bigram_table_, 0, sizeof(bigram_table_));
34 }

◆ ~CharBigrams()

tesseract::CharBigrams::~CharBigrams ( )

Definition at line 36 of file char_bigrams.cpp.

36  {
37  if (bigram_table_.char_bigram != NULL) {
38  for (int ch1 = 0; ch1 <= bigram_table_.max_char; ch1++) {
39  CharBigram *char_bigram = bigram_table_.char_bigram + ch1;
40 
41  if (char_bigram->bigram != NULL) {
42  delete []char_bigram->bigram;
43  }
44  }
45  delete []bigram_table_.char_bigram;
46  }
47 }

Member Function Documentation

◆ Cost()

int tesseract::CharBigrams::Cost ( const char_32 * str,
CharSet * char_set 
) const

Definition at line 155 of file char_bigrams.cpp.

155  {
156  if (!char_32_ptr || char_32_ptr[0] == 0) {
157  return bigram_table_.worst_cost;
158  }
159  int cost = MeanCostWithSpaces(char_32_ptr);
160  if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant &&
161  CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) {
162  char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set);
163  if (lower_32 && lower_32[0] != 0) {
164  int cost_lower = MeanCostWithSpaces(lower_32);
165  cost = MIN(cost, cost_lower);
166  }
167  delete [] lower_32;
168  char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set);
169  if (upper_32 && upper_32[0] != 0) {
170  int cost_upper = MeanCostWithSpaces(upper_32);
171  cost = MIN(cost, cost_upper);
172  }
173  delete [] upper_32;
174  }
175  return cost;
176 }
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:338
#define MIN(x, y)
Definition: ndminx.h:28
int MeanCostWithSpaces(const char_32 *char_32_ptr) const
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:369
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:284
signed int char_32
Definition: string_32.h:40

◆ Create()

CharBigrams * tesseract::CharBigrams::Create ( const string &  data_file_path,
const string &  lang 
)
static

Definition at line 49 of file char_bigrams.cpp.

50  {
51  string file_name;
52  string str;
53 
54  file_name = data_file_path + lang;
55  file_name += ".cube.bigrams";
56 
57  // load the string into memory
58  if (!CubeUtils::ReadFileToString(file_name, &str)) {
59  return NULL;
60  }
61 
62  // construct a new object
63  CharBigrams *char_bigrams_obj = new CharBigrams();
64  CharBigramTable *table = &char_bigrams_obj->bigram_table_;
65 
66  table->total_cnt = 0;
67  table->max_char = -1;
68  table->char_bigram = NULL;
69 
70  // split into lines
71  vector<string> str_vec;
72  CubeUtils::SplitStringUsing(str, "\r\n", &str_vec);
73 
74  for (int big = 0; big < str_vec.size(); big++) {
75  char_32 ch1;
76  char_32 ch2;
77  int cnt;
78  if (sscanf(str_vec[big].c_str(), "%d %x %x", &cnt, &ch1, &ch2) != 3) {
79  fprintf(stderr, "Cube ERROR (CharBigrams::Create): invalid format "
80  "reading line: %s\n", str_vec[big].c_str());
81  delete char_bigrams_obj;
82  return NULL;
83  }
84 
85  // expand the bigram table
86  if (ch1 > table->max_char) {
87  CharBigram *char_bigram = new CharBigram[ch1 + 1];
88 
89  if (table->char_bigram != NULL && table->max_char >= 0) {
90  memcpy(char_bigram, table->char_bigram,
91  (table->max_char + 1) * sizeof(*char_bigram));
92 
93  delete []table->char_bigram;
94  }
95  table->char_bigram = char_bigram;
96 
97  // init
98  for (int new_big = table->max_char + 1; new_big <= ch1; new_big++) {
99  table->char_bigram[new_big].total_cnt = 0;
100  table->char_bigram[new_big].max_char = -1;
101  table->char_bigram[new_big].bigram = NULL;
102  }
103  table->max_char = ch1;
104  }
105 
106  if (ch2 > table->char_bigram[ch1].max_char) {
107  Bigram *bigram = new Bigram[ch2 + 1];
108 
109  if (table->char_bigram[ch1].bigram != NULL &&
110  table->char_bigram[ch1].max_char >= 0) {
111  memcpy(bigram, table->char_bigram[ch1].bigram,
112  (table->char_bigram[ch1].max_char + 1) * sizeof(*bigram));
113  delete []table->char_bigram[ch1].bigram;
114  }
115  table->char_bigram[ch1].bigram = bigram;
116 
117  // init
118  for (int new_big = table->char_bigram[ch1].max_char + 1;
119  new_big <= ch2; new_big++) {
120  table->char_bigram[ch1].bigram[new_big].cnt = 0;
121  }
122  table->char_bigram[ch1].max_char = ch2;
123  }
124 
125  table->char_bigram[ch1].bigram[ch2].cnt = cnt;
126  table->char_bigram[ch1].total_cnt += cnt;
127  table->total_cnt += cnt;
128  }
129 
130  // compute costs (-log probs)
131  table->worst_cost = static_cast<int>(
132  -PROB2COST_SCALE * log(0.5 / table->total_cnt));
133  for (char_32 ch1 = 0; ch1 <= table->max_char; ch1++) {
134  for (char_32 ch2 = 0; ch2 <= table->char_bigram[ch1].max_char; ch2++) {
135  int cnt = table->char_bigram[ch1].bigram[ch2].cnt;
136  table->char_bigram[ch1].bigram[ch2].cost =
137  static_cast<int>(-PROB2COST_SCALE *
138  log(MAX(0.5, static_cast<double>(cnt)) /
139  table->total_cnt));
140  }
141  }
142  return char_bigrams_obj;
143 }
#define PROB2COST_SCALE
Definition: cube_const.h:24
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:189
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:220
#define MAX(x, y)
Definition: ndminx.h:24
signed int char_32
Definition: string_32.h:40

◆ MeanCostWithSpaces()

int tesseract::CharBigrams::MeanCostWithSpaces ( const char_32 * char_32_ptr) const
protected

Definition at line 178 of file char_bigrams.cpp.

178  {
179  if (!char_32_ptr)
180  return bigram_table_.worst_cost;
181  int len = CubeUtils::StrLen(char_32_ptr);
182  int cost = 0;
183  int c = 0;
184  cost = PairCost(' ', char_32_ptr[0]);
185  for (c = 1; c < len; c++) {
186  cost += PairCost(char_32_ptr[c - 1], char_32_ptr[c]);
187  }
188  cost += PairCost(char_32_ptr[len - 1], ' ');
189  return static_cast<int>(cost / static_cast<double>(len + 1));
190 }
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
int PairCost(char_32 ch1, char_32 ch2) const

◆ PairCost()

int tesseract::CharBigrams::PairCost ( char_32  ch1,
char_32  ch2 
) const
protected

Definition at line 145 of file char_bigrams.cpp.

145  {
146  if (ch1 > bigram_table_.max_char) {
147  return bigram_table_.worst_cost;
148  }
149  if (ch2 > bigram_table_.char_bigram[ch1].max_char) {
150  return bigram_table_.worst_cost;
151  }
152  return bigram_table_.char_bigram[ch1].bigram[ch2].cost;
153 }

The documentation for this class was generated from the following files: