tesseract  3.05.02
unicharset_extractor.cpp
Go to the documentation of this file.
1 // File: unicharset_extractor.cpp
3 // Description: Unicode character/ligature set extractor.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 // Given a list of box files on the command line, this program generates a file
21 // containing a unicharset, a list of all the characters used by Tesseract.
22 //
23 // The file contains the size of the set on the first line, and then one
24 // unichar per line.
25 
26 #include <stdio.h>
27 #if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3)
28 #include <wchar.h>
29 #include <wctype.h>
30 #define USING_WCTYPE
31 #endif
32 #include <locale.h>
33 
34 #include "boxread.h"
35 #include "rect.h"
36 #include "strngs.h"
37 #include "tessopt.h"
38 #include "unichar.h"
39 #include "unicharset.h"
40 
// Base name of the output file; main() appends it to the output directory.
41 static const char* const kUnicharsetFileName = "unicharset";
42 
43 UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
44  UNICHAR uch(wc);
45  char *unichar = uch.utf8_str();
46  UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar);
47  delete[] unichar;
48  return unichar_id;
49 }
50 
51 // Set character properties using wctype if we have it.
52 // Contributed by piggy@gmail.com.
53 // Modified by Ray to use UNICHAR for unicode conversion
54 // and to check for wctype using autoconf/presence of windows.
55 void set_properties(UNICHARSET *unicharset, const char* const c_string) {
56 #ifdef USING_WCTYPE
57  UNICHAR_ID id;
58  int wc;
59 
60  // Convert the string to a unichar id.
61  id = unicharset->unichar_to_id(c_string);
62 
63  // Set the other_case property to be this unichar id by default.
64  unicharset->set_other_case(id, id);
65 
66  int step = UNICHAR::utf8_step(c_string);
67  if (step == 0)
68  return; // Invalid utf-8.
69 
70  // Get the next Unicode code point in the string.
71  UNICHAR ch(c_string, step);
72  wc = ch.first_uni();
73 
74  /* Copy the properties. */
75  if (iswalpha(wc)) {
76  unicharset->set_isalpha(id, 1);
77  if (iswlower(wc)) {
78  unicharset->set_islower(id, 1);
79  unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
80  towupper(wc)));
81  }
82  if (iswupper(wc)) {
83  unicharset->set_isupper(id, 1);
84  unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
85  towlower(wc)));
86  }
87  }
88  if (iswdigit(wc))
89  unicharset->set_isdigit(id, 1);
90  if(iswpunct(wc))
91  unicharset->set_ispunctuation(id, 1);
92 
93 #endif
94 }
95 
96 int main(int argc, char** argv) {
97  int option;
98  const char* output_directory = ".";
99  STRING unicharset_file_name;
100  // Special characters are now included by default.
101  UNICHARSET unicharset;
102 
103  setlocale(LC_ALL, "");
104 
105  // Print usage
106  if (argc <= 1) {
107  printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
108 #ifdef USING_WCTYPE
109  printf("Character properties using wctype is enabled\n");
110 #else
111  printf("WARNING: Character properties using wctype is DISABLED\n");
112 #endif
113  exit(1);
114 
115  }
116 
117  // Parse arguments
118  while ((option = tessopt(argc, argv, "D" )) != EOF) {
119  switch (option) {
120  case 'D':
121  output_directory = tessoptarg;
122  ++tessoptind;
123  break;
124  }
125  }
126 
127  // Save file name
128  unicharset_file_name = output_directory;
129  unicharset_file_name += "/";
130  unicharset_file_name += kUnicharsetFileName;
131 
132  // Load box files
133  for (; tessoptind < argc; ++tessoptind) {
134  printf("Extracting unicharset from %s\n", argv[tessoptind]);
135 
136  FILE* box_file = fopen(argv[tessoptind], "rb");
137  if (box_file == NULL) {
138  printf("Cannot open box file %s\n", argv[tessoptind]);
139  return -1;
140  }
141 
142  TBOX box;
143  STRING unichar_string;
144  int line_number = 0;
145  while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
146  unicharset.unichar_insert(unichar_string.string());
147  set_properties(&unicharset, unichar_string.string());
148  }
149  }
150 
151  // Write unicharset file
152  if (unicharset.save_to_file(unicharset_file_name.string())) {
153  printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
154  }
155  else {
156  printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
157  return -1;
158  }
159  return 0;
160 }
int main(int argc, char **argv)
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:119
int first_uni() const
Definition: unichar.cpp:97
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:394
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc)
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:425
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:404
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:399
char * tessoptarg
Definition: tessopt.cpp:25
const char * string() const
Definition: strngs.cpp:201
void set_properties(UNICHARSET *unicharset, const char *const c_string)
void TESS_API unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:389
Definition: strngs.h:44
char * utf8_str() const
Definition: unichar.cpp:125
Definition: rect.h:30
int tessoptind
Definition: tessopt.cpp:24
int tessopt(inT32 argc, char *argv[], const char *arglist)
Definition: tessopt.cpp:33
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:409
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
int UNICHAR_ID
Definition: unichar.h:33