tesseract  3.05.02
char_set.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: char_set.h
3  * Description: Declaration of a Character Set Class
4  * Author: Ahmad Abdulkader
5  * Created: 2007
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // The CharSet class encapsulates the list of 32-bit strings/characters that
21 // Cube supports for a specific language. The char set is loaded from the
22 // .unicharset file corresponding to a specific language.
23 // Each string has a corresponding int class-id that gets used throughout Cube.
24 // The class provides conversion back and forth between the class-id
25 // and its corresponding 32-bit string. This is done using a hash table that
26 // maps the string to the class id.
27 
28 #ifndef CHAR_SET_H
29 #define CHAR_SET_H
30 
31 #include <string.h>
32 #include <string>
33 #include <algorithm>
34 
35 #include "string_32.h"
36 #include "tessdatamanager.h"
37 #include "unicharset.h"
38 #include "cube_const.h"
39 
40 namespace tesseract {
41 
42 class CharSet {
43  public:
44  CharSet();
45  ~CharSet();
46 
47  // Returns true if Cube is sharing Tesseract's unicharset.
48  inline bool SharedUnicharset() { return (unicharset_map_ == NULL); }
49 
50  // Returns the class id corresponding to a 32-bit string. Returns -1
51  // if the string is not supported. This is done by hashing the
52  // string and then looking up the string in the hash-bin if there
53  // are collisions.
54  inline int ClassID(const char_32 *str) const {
55  int hash_val = Hash(str);
56  if (hash_bin_size_[hash_val] == 0)
57  return -1;
58  for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
59  if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0)
60  return hash_bins_[hash_val][bin];
61  }
62  return -1;
63  }
64  // Same as above but using a 32-bit char instead of a string
65  inline int ClassID(char_32 ch) const {
66  int hash_val = Hash(ch);
67  if (hash_bin_size_[hash_val] == 0)
68  return -1;
69  for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
70  if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch &&
71  class_strings_[hash_bins_[hash_val][bin]]->length() == 1) {
72  return hash_bins_[hash_val][bin];
73  }
74  }
75  return -1;
76  }
77  // Retrieve the unicharid in Tesseract's unicharset corresponding
78  // to a 32-bit string. When Tesseract and Cube share the same
79  // unicharset, this will just be the class id.
80  inline int UnicharID(const char_32 *str) const {
81  int class_id = ClassID(str);
82  if (class_id == INVALID_UNICHAR_ID)
83  return INVALID_UNICHAR_ID;
84  int unichar_id;
85  if (unicharset_map_)
86  unichar_id = unicharset_map_[class_id];
87  else
88  unichar_id = class_id;
89  return unichar_id;
90  }
91  // Same as above but using a 32-bit char instead of a string
92  inline int UnicharID(char_32 ch) const {
93  int class_id = ClassID(ch);
94  if (class_id == INVALID_UNICHAR_ID)
95  return INVALID_UNICHAR_ID;
96  int unichar_id;
97  if (unicharset_map_)
98  unichar_id = unicharset_map_[class_id];
99  else
100  unichar_id = class_id;
101  return unichar_id;
102  }
103  // Returns the 32-bit string corresponding to a class id
104  inline const char_32 * ClassString(int class_id) const {
105  if (class_id < 0 || class_id >= class_cnt_) {
106  return NULL;
107  }
108  return reinterpret_cast<const char_32 *>(class_strings_[class_id]->c_str());
109  }
110  // Returns the count of supported strings
111  inline int ClassCount() const { return class_cnt_; }
112 
113  // Creates CharSet object by reading the unicharset from the
114  // TessDatamanager, and mapping Cube's unicharset to Tesseract's if
115  // they differ.
116  static CharSet *Create(TessdataManager *tessdata_manager,
117  UNICHARSET *tess_unicharset);
118 
119  // Return the UNICHARSET cube is using for recognition internally --
120  // ClassId() returns unichar_id's in this unicharset.
121  UNICHARSET *InternalUnicharset() { return unicharset_; }
122 
123  private:
124  // Hash table configuration params. Determined emperically on
125  // the supported languages so far (Eng, Ara, Hin). Might need to be
126  // tuned for speed when more languages are supported
127  static const int kHashBins = 3001;
128  static const int kMaxHashSize = 16;
129 
130  // Using djb2 hashing function to hash a 32-bit string
131  // introduced in http://www.cse.yorku.ca/~oz/hash.html
132  static inline int Hash(const char_32 *str) {
133  unsigned long hash = 5381;
134  int c;
135  while ((c = *str++))
136  hash = ((hash << 5) + hash) + c;
137  return (hash%kHashBins);
138  }
139  // Same as above but for a single char
140  static inline int Hash(char_32 ch) {
141  char_32 b[2];
142  b[0] = ch;
143  b[1] = 0;
144  return Hash(b);
145  }
146 
147  // Load the list of supported chars from the given data file
148  // pointer. If tess_unicharset is non-NULL, mapping each Cube class
149  // id to a tesseract unicharid.
150  bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset);
151 
152  // class count
153  int class_cnt_;
154  // hash-bin sizes array
155  int hash_bin_size_[kHashBins];
156  // hash bins
157  int hash_bins_[kHashBins][kMaxHashSize];
158  // supported strings array
159  string_32 **class_strings_;
160  // map from class id to secondary (tesseract's) unicharset's ids
161  int *unicharset_map_;
162  // A unicharset which is filled in with a Tesseract-style UNICHARSET for
163  // cube's data if our unicharset is different from tesseract's.
164  UNICHARSET cube_unicharset_;
165  // This points to either the tess_unicharset we're passed or cube_unicharset_,
166  // depending upon whether we just have one unicharset or one for each
167  // tesseract and cube, respectively.
168  UNICHARSET *unicharset_;
169  // has the char set been initialized flag
170  bool init_;
171 };
172 }
173 
174 #endif // CHAR_SET_H
static CharSet * Create(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset)
Definition: char_set.cpp:54
int UnicharID(const char_32 *str) const
Definition: char_set.h:80
int ClassCount() const
Definition: char_set.h:111
signed int char_32
Definition: string_32.h:40
basic_string< char_32 > string_32
Definition: string_32.h:41
UNICHARSET * InternalUnicharset()
Definition: char_set.h:121
int ClassID(const char_32 *str) const
Definition: char_set.h:54
int UnicharID(char_32 ch) const
Definition: char_set.h:92
bool SharedUnicharset()
Definition: char_set.h:48
int ClassID(char_32 ch) const
Definition: char_set.h:65
const char_32 * ClassString(int class_id) const
Definition: char_set.h:104