tesseract  3.05.02
char_set.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: char_set.cpp
3  * Description: Implementation of a Character Set Class
4  * Author: Ahmad Abdulkader
5  * Created: 2007
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <string>
21 
22 #include "char_set.h"
23 #include "cube_utils.h"
24 #include "tessdatamanager.h"
25 
26 namespace tesseract {
27 
29  class_cnt_ = 0;
30  class_strings_ = NULL;
31  unicharset_map_ = NULL;
32  init_ = false;
33 
34  // init hash table
35  memset(hash_bin_size_, 0, sizeof(hash_bin_size_));
36 }
37 
39  if (class_strings_ != NULL) {
40  for (int cls = 0; cls < class_cnt_; cls++) {
41  if (class_strings_[cls] != NULL) {
42  delete class_strings_[cls];
43  }
44  }
45  delete []class_strings_;
46  class_strings_ = NULL;
47  }
48  delete []unicharset_map_;
49 }
50 
51 // Creates a CharSet object by reading the unicharset from the
52 // TessdataManager, and mapping Cube's unicharset to Tesseract's if
53 // they differ.
55  UNICHARSET *tess_unicharset) {
56  CharSet *char_set = new CharSet();
57 
58  // First look for Cube's unicharset; if not there, use tesseract's
59  bool cube_unicharset_exists;
60  if (!(cube_unicharset_exists =
61  tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) &&
62  !tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) {
63  fprintf(stderr, "Cube ERROR (CharSet::Create): could not find "
64  "either cube or tesseract unicharset\n");
65  return NULL;
66  }
67  FILE *charset_fp = tessdata_manager->GetDataFilePtr();
68  if (!charset_fp) {
69  fprintf(stderr, "Cube ERROR (CharSet::Create): could not load "
70  "a unicharset\n");
71  return NULL;
72  }
73 
74  // If we found a cube unicharset separate from tesseract's, load it and
75  // map its unichars to tesseract's; if only one unicharset exists,
76  // just load it.
77  bool loaded;
78  if (cube_unicharset_exists) {
79  char_set->cube_unicharset_.load_from_file(charset_fp);
80  loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET);
81  loaded = loaded && char_set->LoadSupportedCharList(
82  tessdata_manager->GetDataFilePtr(), tess_unicharset);
83  char_set->unicharset_ = &char_set->cube_unicharset_;
84  } else {
85  loaded = char_set->LoadSupportedCharList(charset_fp, NULL);
86  char_set->unicharset_ = tess_unicharset;
87  }
88  if (!loaded) {
89  delete char_set;
90  return NULL;
91  }
92 
93  char_set->init_ = true;
94  return char_set;
95 }
96 
97 // Load the list of supported chars from the given data file pointer.
98 bool CharSet::LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset) {
99  if (init_)
100  return true;
101 
102  char str_line[256];
103  // init hash table
104  memset(hash_bin_size_, 0, sizeof(hash_bin_size_));
105  // read the char count
106  if (fgets(str_line, sizeof(str_line), fp) == NULL) {
107  fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not "
108  "read char count.\n");
109  return false;
110  }
111  class_cnt_ = atoi(str_line);
112  if (class_cnt_ < 2) {
113  fprintf(stderr, "Cube ERROR (CharSet::InitMemory): invalid "
114  "class count: %d\n", class_cnt_);
115  return false;
116  }
117  // memory for class strings
118  class_strings_ = new string_32*[class_cnt_];
119  // memory for unicharset map
120  if (tess_unicharset) {
121  unicharset_map_ = new int[class_cnt_];
122  }
123 
124  // Read in character strings and add to hash table
125  for (int class_id = 0; class_id < class_cnt_; class_id++) {
126  // Read the class string
127  if (fgets(str_line, sizeof(str_line), fp) == NULL) {
128  fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): "
129  "could not read class string with class_id=%d.\n", class_id);
130  return false;
131  }
132  // Terminate at space if any
133  char *p = strchr(str_line, ' ');
134  if (p != NULL)
135  *p = '\0';
136  // Convert to UTF32 and store
137  string_32 str32;
138  // Convert NULL to a space
139  if (strcmp(str_line, "NULL") == 0) {
140  strcpy(str_line, " ");
141  }
142  CubeUtils::UTF8ToUTF32(str_line, &str32);
143  class_strings_[class_id] = new string_32(str32);
144 
145  // Add to hash-table
146  int hash_val = Hash(reinterpret_cast<const char_32 *>(str32.c_str()));
147  if (hash_bin_size_[hash_val] >= kMaxHashSize) {
148  fprintf(stderr, "Cube ERROR (CharSet::LoadSupportedCharList): hash "
149  "table is full.\n");
150  return false;
151  }
152  hash_bins_[hash_val][hash_bin_size_[hash_val]++] = class_id;
153 
154  if (tess_unicharset != NULL) {
155  // Add class id to unicharset map
156  UNICHAR_ID tess_id = tess_unicharset->unichar_to_id(str_line);
157  if (tess_id == INVALID_UNICHAR_ID) {
158  tess_unicharset->unichar_insert(str_line);
159  tess_id = tess_unicharset->unichar_to_id(str_line);
160  }
161  ASSERT_HOST(tess_id != INVALID_UNICHAR_ID);
162  unicharset_map_[class_id] = tess_id;
163  }
164  }
165  return true;
166 }
167 
168 } // tesseract
static CharSet * Create(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset)
Definition: char_set.cpp:54
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
void TESS_API unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
basic_string< char_32 > string_32
Definition: string_32.h:41
bool SeekToStart(TessdataType tessdata_type)
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:256
#define ASSERT_HOST(x)
Definition: errcode.h:84
int UNICHAR_ID
Definition: unichar.h:33