tesseract  3.05.02
unicharset.h
Go to the documentation of this file.
1 // File: unicharset.h
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
21 #define TESSERACT_CCUTIL_UNICHARSET_H__
22 
23 #include "errcode.h"
24 #include "genericvector.h"
25 #include "helpers.h"
26 #include "serialis.h"
27 #include "strngs.h"
28 #include "tesscallback.h"
29 #include "unichar.h"
30 #include "unicharmap.h"
31 
32 // Enum holding special values of unichar_id. Every unicharset has these.
33 // Warning! Keep in sync with kSpecialUnicharCodes.
38 
40 };
41 
43  public:
44  // Minimum number of characters used for fragment representation.
45  static const int kMinLen = 6;
46  // Maximum number of characters used for fragment representation.
47  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
48  // Maximum number of fragments per character.
49  static const int kMaxChunks = 5;
50 
51  // Setters and Getters.
52  inline void set_all(const char *unichar, int pos, int total, bool natural) {
53  set_unichar(unichar);
54  set_pos(pos);
55  set_total(total);
56  set_natural(natural);
57  }
58  inline void set_unichar(const char *uch) {
59  strncpy(this->unichar, uch, UNICHAR_LEN);
60  this->unichar[UNICHAR_LEN] = '\0';
61  }
62  inline void set_pos(int p) { this->pos = p; }
63  inline void set_total(int t) { this->total = t; }
64  inline const char* get_unichar() const { return this->unichar; }
65  inline int get_pos() const { return this->pos; }
66  inline int get_total() const { return this->total; }
67 
68  // Returns the string that represents a fragment
69  // with the given unichar, pos and total.
70  static STRING to_string(const char *unichar, int pos, int total,
71  bool natural);
72  // Returns the string that represents this fragment.
73  STRING to_string() const {
74  return to_string(unichar, pos, total, natural);
75  }
76 
77  // Checks whether a fragment has the same unichar,
78  // position and total as the given inputs.
79  inline bool equals(const char *other_unichar,
80  int other_pos, int other_total) const {
81  return (strcmp(this->unichar, other_unichar) == 0 &&
82  this->pos == other_pos && this->total == other_total);
83  }
84  inline bool equals(const CHAR_FRAGMENT *other) const {
85  return this->equals(other->get_unichar(),
86  other->get_pos(),
87  other->get_total());
88  }
89 
90  // Checks whether a given fragment is a continuation of this fragment.
91  // Assumes that the given fragment pointer is not NULL.
92  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
93  return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
94  this->total == fragment->get_total() &&
95  this->pos == fragment->get_pos() + 1);
96  }
97 
98  // Returns true if this fragment is a beginning fragment.
99  inline bool is_beginning() const { return this->pos == 0; }
100 
101  // Returns true if this fragment is an ending fragment.
102  inline bool is_ending() const { return this->pos == this->total-1; }
103 
104  // Returns true if the fragment was a separate component to begin with,
105  // ie did not need chopping to be isolated, but may have been separated
106  // out from a multi-outline blob.
107  inline bool is_natural() const { return natural; }
108  void set_natural(bool value) { natural = value; }
109 
110  // Parses the string to see whether it represents a character fragment
111  // (rather than a regular character). If so, allocates memory for a new
112  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
113  // information. Fragments are of the form:
114  // |m|1|2, meaning chunk 1 of 2 of character m, or
115  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
116  // to divide the parts, as they were already separate connected components.
117  //
118  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
119  // instance, otherwise (if the string does not represent a fragment or it
120  // looks like it does, but parsing it as a fragment fails) returns NULL.
121  //
122  // Note: The caller is responsible for deallocating memory
123  // associated with the returned pointer.
124  static CHAR_FRAGMENT *parse_from_string(const char *str);
125 
126  private:
127  char unichar[UNICHAR_LEN + 1];
128  // True if the fragment was a separate component to begin with,
129  // ie did not need chopping to be isolated, but may have been separated
130  // out from a multi-outline blob.
131  bool natural;
132  inT16 pos; // fragment position in the character
133  inT16 total; // total number of fragments in the character
134 };
135 
136 // The UNICHARSET class is a utility class for Tesseract that holds the
137 // set of characters that are used by the engine. Each character is identified
138 // by a unique number, from 0 to (size - 1).
139 class UNICHARSET {
140  public:
141  // Custom list of characters and their ligature forms (UTF8)
142  // These map to unicode values in the private use area (PUC) and are supported
143  // by only a few font families (e.g. Wyld, Adobe Caslon Pro).
144  static TESS_API const char* kCustomLigatures[][2];
145 
146  // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
148 
149  // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
150  enum Direction {
171  };
172 
173  // Create an empty UNICHARSET
174  UNICHARSET();
175 
176  ~UNICHARSET();
177 
178  // Return the UNICHAR_ID of a given unichar representation within the
179  // UNICHARSET.
180  UNICHAR_ID TESS_API unichar_to_id(const char* const unichar_repr) const;
181 
182  // Return the UNICHAR_ID of a given unichar representation within the
183  // UNICHARSET. Only the first length characters from unichar_repr are used.
184  UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
185 
186  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
187  // while leaving the rest of the string encodable. Returns 0 if the
188  // beginning of the string is not encodable.
189  // WARNING: this function now encodes the whole string for precision.
190  // Use encode_string in preference to repeatedly calling step.
191  int step(const char* str) const;
192 
193  // Return whether the given UTF-8 string is encodable with this UNICHARSET.
194  // If not encodable, write the first byte offset which cannot be converted
195  // into the second (return) argument.
196  bool encodable_string(const char *str, int *first_bad_position) const;
197 
198  // Encodes the given UTF-8 string with this UNICHARSET.
199  // If any part of the string cannot be encoded (because the utf8 can't
200  // be broken up into pieces that are in the unicharset) then:
201  // if give_up_on_failure, stops and returns a partial encoding,
202  // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
203  // Returns true if the encoding succeeds completely, false if there is at
204  // least one failure.
205  // If lengths is not NULL, then it is filled with the corresponding
206  // byte length of each encoded UNICHAR_ID.
207  // If encoded_length is not NULL then on return it contains the length of
208  // str that was encoded. (if give_up_on_failure the location of the first
209  // failure, otherwise strlen(str).)
210  bool encode_string(const char* str, bool give_up_on_failure,
211  GenericVector<UNICHAR_ID>* encoding,
212  GenericVector<char>* lengths,
213  int* encoded_length) const;
214 
215  // Return the unichar representation corresponding to the given UNICHAR_ID
216  // within the UNICHARSET.
217  const char* id_to_unichar(UNICHAR_ID id) const;
218 
219  // Return the UTF8 representation corresponding to the given UNICHAR_ID after
220  // resolving any private encodings internal to Tesseract. This method is
221  // preferable to id_to_unichar for outputting text that will be visible to
222  // external applications.
223  const char* id_to_unichar_ext(UNICHAR_ID id) const;
224 
225  // Return a STRING that reformats the utf8 str into the str followed
226  // by its hex unicodes.
227  static STRING debug_utf8_str(const char* str);
228 
229  // Return a STRING containing debug information on the unichar, including
230  // the id_to_unichar, its hex unicodes and the properties.
231  STRING debug_str(UNICHAR_ID id) const;
232  STRING debug_str(const char * unichar_repr) const {
233  return debug_str(unichar_to_id(unichar_repr));
234  }
235 
236  // Add a unichar representation to the set.
237  void TESS_API unichar_insert(const char* const unichar_repr);
238 
239  // Return true if the given unichar id exists within the set.
240  // Relies on the fact that unichar ids are contiguous in the unicharset.
241  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
242  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
243  unichar_id >= 0;
244  }
245 
246  // Return true if the given unichar representation exists within the set.
247  bool TESS_API contains_unichar(const char* const unichar_repr) const;
248  bool contains_unichar(const char* const unichar_repr, int length) const;
249 
250  // Return true if the given unichar representation corresponds to the given
251  // UNICHAR_ID within the set.
252  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
253 
254  // Delete CHAR_FRAGMENTs stored in properties of unichars array.
256  for (int i = 0; i < size_used; ++i) {
257  if (unichars[i].properties.fragment != NULL) {
258  delete unichars[i].properties.fragment;
259  unichars[i].properties.fragment = NULL;
260  }
261  }
262  }
263 
264  // Clear the UNICHARSET (all the previous data is lost).
265  void clear() {
266  if (script_table != NULL) {
267  for (int i = 0; i < script_table_size_used; ++i)
268  delete[] script_table[i];
269  delete[] script_table;
270  script_table = NULL;
271  script_table_size_used = 0;
272  }
273  if (unichars != NULL) {
275  delete[] unichars;
276  unichars = NULL;
277  }
278  script_table_size_reserved = 0;
279  size_reserved = 0;
280  size_used = 0;
281  ids.clear();
282  top_bottom_set_ = false;
283  script_has_upper_lower_ = false;
284  script_has_xheight_ = false;
285  null_sid_ = 0;
286  common_sid_ = 0;
287  latin_sid_ = 0;
288  cyrillic_sid_ = 0;
289  greek_sid_ = 0;
290  han_sid_ = 0;
291  hiragana_sid_ = 0;
292  katakana_sid_ = 0;
293  default_sid_ = 0;
294  }
295 
296  // Return the size of the set (the number of different UNICHAR it holds).
297  int size() const {
298  return size_used;
299  }
300 
301  // Reserve enough memory space for the given number of UNICHARS
302  void reserve(int unichars_number);
303 
304  // Opens the file indicated by filename and saves unicharset to that file.
305  // Returns true if the operation is successful.
306  bool save_to_file(const char * const filename) const {
307  FILE* file = fopen(filename, "w+b");
308  if (file == NULL) return false;
309  bool result = save_to_file(file);
310  fclose(file);
311  return result;
312  }
313 
314  // Saves the content of the UNICHARSET to the given file.
315  // Returns true if the operation is successful.
316  bool save_to_file(FILE *file) const {
317  STRING str;
318  if (!save_to_string(&str)) return false;
319  if (fwrite(&str[0], str.length(), 1, file) != 1) return false;
320  return true;
321  }
322  bool save_to_file(tesseract::TFile *file) const {
323  STRING str;
324  if (!save_to_string(&str)) return false;
325  if (file->FWrite(&str[0], str.length(), 1) != 1) return false;
326  return true;
327  }
328 
329  // Saves the content of the UNICHARSET to the given STRING.
330  // Returns true if the operation is successful.
331  bool TESS_API save_to_string(STRING *str) const;
332 
333  // Load a unicharset from a unicharset file that has been loaded into
334  // the given memory buffer.
335  // Returns true if the operation is successful.
336  bool load_from_inmemory_file(const char* const memory, int mem_size,
337  bool skip_fragments);
338  // Returns true if the operation is successful.
339  bool load_from_inmemory_file(const char* const memory, int mem_size) {
340  return load_from_inmemory_file(memory, mem_size, false);
341  }
342 
343  // Opens the file indicated by filename and loads the UNICHARSET
344  // from the given file. The previous data is lost.
345  // Returns true if the operation is successful.
346  bool load_from_file(const char* const filename, bool skip_fragments) {
347  FILE* file = fopen(filename, "rb");
348  if (file == NULL) return false;
349  bool result = load_from_file(file, skip_fragments);
350  fclose(file);
351  return result;
352  }
353  // Returns true if the operation is successful.
354  bool load_from_file(const char* const filename) {
355  return load_from_file(filename, false);
356  }
357 
358  // Loads the UNICHARSET from the given file. The previous data is lost.
359  // Returns true if the operation is successful.
360  bool load_from_file(FILE *file, bool skip_fragments);
361  bool load_from_file(FILE *file) { return load_from_file(file, false); }
362  bool load_from_file(tesseract::TFile *file, bool skip_fragments);
363 
364 
365  // Sets up internal data after loading the file, based on the char
366  // properties. Called from load_from_file, but also needs to be run
367  // during set_unicharset_properties.
368  void post_load_setup();
369 
370  // Returns true if right_to_left scripts are significant in the unicharset,
371  // but without being so sensitive that "universal" unicharsets containing
372  // characters from many scripts, like orientation and script detection,
373  // look like they are right_to_left.
374  bool major_right_to_left() const;
375 
376  // Set a whitelist and/or blacklist of characters to recognize.
377  // An empty or NULL whitelist enables everything (minus any blacklist).
378  // An empty or NULL blacklist disables nothing.
379  // An empty or NULL unblacklist has no effect.
380  // The blacklist overrides the whitelist.
381  // The unblacklist overrides the blacklist.
382  // Each list is a string of utf8 character strings. Boundaries between
383  // unicharset units are worked out automatically, and characters not in
384  // the unicharset are silently ignored.
385  void set_black_and_whitelist(const char* blacklist, const char* whitelist,
386  const char* unblacklist);
387 
388  // Set the isalpha property of the given unichar to the given value.
389  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
390  unichars[unichar_id].properties.isalpha = value;
391  }
392 
393  // Set the islower property of the given unichar to the given value.
394  void set_islower(UNICHAR_ID unichar_id, bool value) {
395  unichars[unichar_id].properties.islower = value;
396  }
397 
398  // Set the isupper property of the given unichar to the given value.
399  void set_isupper(UNICHAR_ID unichar_id, bool value) {
400  unichars[unichar_id].properties.isupper = value;
401  }
402 
403  // Set the isdigit property of the given unichar to the given value.
404  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
405  unichars[unichar_id].properties.isdigit = value;
406  }
407 
408  // Set the ispunctuation property of the given unichar to the given value.
409  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
410  unichars[unichar_id].properties.ispunctuation = value;
411  }
412 
413  // Set the isngram property of the given unichar to the given value.
414  void set_isngram(UNICHAR_ID unichar_id, bool value) {
415  unichars[unichar_id].properties.isngram = value;
416  }
417 
418  // Set the script name of the given unichar to the given value.
419  // Value is copied and thus can be a temporary.
420  void set_script(UNICHAR_ID unichar_id, const char* value) {
421  unichars[unichar_id].properties.script_id = add_script(value);
422  }
423 
424  // Set other_case unichar id in the properties for the given unichar id.
425  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
426  unichars[unichar_id].properties.other_case = other_case;
427  }
428 
429  // Set the direction property of the given unichar to the given value.
431  unichars[unichar_id].properties.direction = value;
432  }
433 
434  // Set mirror unichar id in the properties for the given unichar id.
435  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
436  unichars[unichar_id].properties.mirror = mirror;
437  }
438 
439  // Record normalized version of unichar with the given unichar_id.
440  void set_normed(UNICHAR_ID unichar_id, const char* normed) {
441  unichars[unichar_id].properties.normed = normed;
442  unichars[unichar_id].properties.normed_ids.truncate(0);
443  }
444  // Sets the normed_ids vector from the normed string. normed_ids is not
445  // stored in the file, and needs to be set when the UNICHARSET is loaded.
446  void set_normed_ids(UNICHAR_ID unichar_id);
447 
448  // Return the isalpha property of the given unichar.
449  bool get_isalpha(UNICHAR_ID unichar_id) const {
450  if (INVALID_UNICHAR_ID == unichar_id) return false;
451  ASSERT_HOST(contains_unichar_id(unichar_id));
452  return unichars[unichar_id].properties.isalpha;
453  }
454 
455  // Return the islower property of the given unichar.
456  bool get_islower(UNICHAR_ID unichar_id) const {
457  if (INVALID_UNICHAR_ID == unichar_id) return false;
458  ASSERT_HOST(contains_unichar_id(unichar_id));
459  return unichars[unichar_id].properties.islower;
460  }
461 
462  // Return the isupper property of the given unichar.
463  bool get_isupper(UNICHAR_ID unichar_id) const {
464  if (INVALID_UNICHAR_ID == unichar_id) return false;
465  ASSERT_HOST(contains_unichar_id(unichar_id));
466  return unichars[unichar_id].properties.isupper;
467  }
468 
469  // Return the isdigit property of the given unichar.
470  bool get_isdigit(UNICHAR_ID unichar_id) const {
471  if (INVALID_UNICHAR_ID == unichar_id) return false;
472  ASSERT_HOST(contains_unichar_id(unichar_id));
473  return unichars[unichar_id].properties.isdigit;
474  }
475 
476  // Return the ispunctuation property of the given unichar.
477  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
478  if (INVALID_UNICHAR_ID == unichar_id) return false;
479  ASSERT_HOST(contains_unichar_id(unichar_id));
480  return unichars[unichar_id].properties.ispunctuation;
481  }
482 
483  // Return the isngram property of the given unichar.
484  bool get_isngram(UNICHAR_ID unichar_id) const {
485  if (INVALID_UNICHAR_ID == unichar_id) return false;
486  ASSERT_HOST(contains_unichar_id(unichar_id));
487  return unichars[unichar_id].properties.isngram;
488  }
489 
490  // Returns whether the unichar id represents a unicode value in the private
491  // use area.
492  bool get_isprivate(UNICHAR_ID unichar_id) const;
493 
494  // Returns true if the ids have useful min/max top/bottom values.
495  bool top_bottom_useful() const {
496  return top_bottom_set_;
497  }
498  // Sets all ranges to empty, so they can be expanded to set the values.
499  void set_ranges_empty();
500  // Sets all the properties for this unicharset given a src_unicharset with
501  // everything set. The unicharsets don't have to be the same, and graphemes
502  // are correctly accounted for.
505  }
506  // Sets properties from Other, starting only at the given index.
507  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
508  // Expands the tops and bottoms and widths for this unicharset given a
509  // src_unicharset with ranges in it. The unicharsets don't have to be the
510  // same, and graphemes are correctly accounted for.
511  void ExpandRangesFromOther(const UNICHARSET& src);
512  // Makes this a copy of src. Clears this completely first, so the automatic
513  // ids will not be present in this if not in src.
514  void CopyFrom(const UNICHARSET& src);
515  // For each id in src, if it does not occur in this, add it, as in
516  // SetPropertiesFromOther, otherwise expand the ranges, as in
517  // ExpandRangesFromOther.
518  void AppendOtherUnicharset(const UNICHARSET& src);
519  // Returns true if the acceptable ranges of the tops of the characters do
520  // not overlap, making their x-height calculations distinct.
521  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
522  // Returns the min and max bottom and top of the given unichar in
523  // baseline-normalized coordinates, ie, where the baseline is
524  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
525  // (See normalis.h for the definitions).
526  void get_top_bottom(UNICHAR_ID unichar_id,
527  int* min_bottom, int* max_bottom,
528  int* min_top, int* max_top) const {
529  if (INVALID_UNICHAR_ID == unichar_id) {
530  *min_bottom = *min_top = 0;
531  *max_bottom = *max_top = 256; // kBlnCellHeight
532  return;
533  }
534  ASSERT_HOST(contains_unichar_id(unichar_id));
535  *min_bottom = unichars[unichar_id].properties.min_bottom;
536  *max_bottom = unichars[unichar_id].properties.max_bottom;
537  *min_top = unichars[unichar_id].properties.min_top;
538  *max_top = unichars[unichar_id].properties.max_top;
539  }
540  void set_top_bottom(UNICHAR_ID unichar_id,
541  int min_bottom, int max_bottom,
542  int min_top, int max_top) {
543  unichars[unichar_id].properties.min_bottom =
544  static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
545  unichars[unichar_id].properties.max_bottom =
546  static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
547  unichars[unichar_id].properties.min_top =
548  static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
549  unichars[unichar_id].properties.max_top =
550  static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
551  }
552  // Returns the width stats (as mean, sd) of the given unichar relative to the
553  // median advance of all characters in the character set.
554  void get_width_stats(UNICHAR_ID unichar_id,
555  float* width, float* width_sd) const {
556  if (INVALID_UNICHAR_ID == unichar_id) {
557  *width = 0.0f;
558  *width_sd = 0.0f;;
559  return;
560  }
561  ASSERT_HOST(contains_unichar_id(unichar_id));
562  *width = unichars[unichar_id].properties.width;
563  *width_sd = unichars[unichar_id].properties.width_sd;
564  }
565  void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
566  unichars[unichar_id].properties.width = width;
567  unichars[unichar_id].properties.width_sd = width_sd;
568  }
569  // Returns the stats of the x-bearing (as mean, sd) of the given unichar
570  // relative to the median advance of all characters in the character set.
571  void get_bearing_stats(UNICHAR_ID unichar_id,
572  float* bearing, float* bearing_sd) const {
573  if (INVALID_UNICHAR_ID == unichar_id) {
574  *bearing = *bearing_sd = 0.0f;
575  return;
576  }
577  ASSERT_HOST(contains_unichar_id(unichar_id));
578  *bearing = unichars[unichar_id].properties.bearing;
579  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
580  }
581  void set_bearing_stats(UNICHAR_ID unichar_id,
582  float bearing, float bearing_sd) {
583  unichars[unichar_id].properties.bearing = bearing;
584  unichars[unichar_id].properties.bearing_sd = bearing_sd;
585  }
586  // Returns the stats of the x-advance of the given unichar (as mean, sd)
587  // relative to the median advance of all characters in the character set.
588  void get_advance_stats(UNICHAR_ID unichar_id,
589  float* advance, float* advance_sd) const {
590  if (INVALID_UNICHAR_ID == unichar_id) {
591  *advance = *advance_sd = 0;
592  return;
593  }
594  ASSERT_HOST(contains_unichar_id(unichar_id));
595  *advance = unichars[unichar_id].properties.advance;
596  *advance_sd = unichars[unichar_id].properties.advance_sd;
597  }
598  void set_advance_stats(UNICHAR_ID unichar_id,
599  float advance, float advance_sd) {
600  unichars[unichar_id].properties.advance = advance;
601  unichars[unichar_id].properties.advance_sd = advance_sd;
602  }
603  // Returns true if the font metrics properties are empty.
604  bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
605  return unichars[unichar_id].properties.AnyRangeEmpty();
606  }
607 
608  // Return the script id of the given unichar. The script name for an id is
609  // available from get_script_from_script_id; that pointer will always be the
610  // same for the same script, it's managed by unicharset and thus MUST NOT be deleted
611  int get_script(UNICHAR_ID unichar_id) const {
612  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
613  ASSERT_HOST(contains_unichar_id(unichar_id));
614  return unichars[unichar_id].properties.script_id;
615  }
616 
617  // Return the character properties, eg. alpha/upper/lower/digit/punct,
618  // as a bit field of unsigned int.
619  unsigned int get_properties(UNICHAR_ID unichar_id) const;
620 
621  // Return the character property as a single char. If a character has
622  // multiple attributes, the main property is defined by the following order:
623  // upper_case : 'A'
624  // lower_case : 'a'
625  // alpha : 'x'
626  // digit : '0'
627  // punctuation: 'p'
628  char get_chartype(UNICHAR_ID unichar_id) const;
629 
630  // Get other_case unichar id in the properties for the given unichar id.
632  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
633  ASSERT_HOST(contains_unichar_id(unichar_id));
634  return unichars[unichar_id].properties.other_case;
635  }
636 
637  // Returns the direction property of the given unichar.
638  Direction get_direction(UNICHAR_ID unichar_id) const {
639  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
640  ASSERT_HOST(contains_unichar_id(unichar_id));
641  return unichars[unichar_id].properties.direction;
642  }
643 
644  // Get mirror unichar id in the properties for the given unichar id.
645  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
646  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
647  ASSERT_HOST(contains_unichar_id(unichar_id));
648  return unichars[unichar_id].properties.mirror;
649  }
650 
651  // Returns UNICHAR_ID of the corresponding lower-case unichar.
652  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
653  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
654  ASSERT_HOST(contains_unichar_id(unichar_id));
655  if (unichars[unichar_id].properties.islower) return unichar_id;
656  return unichars[unichar_id].properties.other_case;
657  }
658 
659  // Returns UNICHAR_ID of the corresponding upper-case unichar.
660  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
661  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
662  ASSERT_HOST(contains_unichar_id(unichar_id));
663  if (unichars[unichar_id].properties.isupper) return unichar_id;
664  return unichars[unichar_id].properties.other_case;
665  }
666 
667  // Returns true if this UNICHARSET has the special codes in
668  // SpecialUnicharCodes available. If false then there are normal unichars
669  // at these codes and they should not be used.
670  bool has_special_codes() const {
671  return get_fragment(UNICHAR_BROKEN) != NULL &&
674  }
675 
676  // Returns true if there are any repeated unicodes in the normalized
677  // text of any unichar-id in the unicharset.
678  bool AnyRepeatedUnicodes() const;
679 
680  // Return a pointer to the CHAR_FRAGMENT class if the given
681  // unichar id represents a character fragment.
682  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
683  if (INVALID_UNICHAR_ID == unichar_id) return NULL;
684  ASSERT_HOST(contains_unichar_id(unichar_id));
685  return unichars[unichar_id].properties.fragment;
686  }
687 
688  // Return the isalpha property of the given unichar representation.
689  bool get_isalpha(const char* const unichar_repr) const {
690  return get_isalpha(unichar_to_id(unichar_repr));
691  }
692 
693  // Return the islower property of the given unichar representation.
694  bool get_islower(const char* const unichar_repr) const {
695  return get_islower(unichar_to_id(unichar_repr));
696  }
697 
698  // Return the isupper property of the given unichar representation.
699  bool get_isupper(const char* const unichar_repr) const {
700  return get_isupper(unichar_to_id(unichar_repr));
701  }
702 
703  // Return the isdigit property of the given unichar representation.
704  bool get_isdigit(const char* const unichar_repr) const {
705  return get_isdigit(unichar_to_id(unichar_repr));
706  }
707 
708  // Return the ispunctuation property of the given unichar representation.
709  bool get_ispunctuation(const char* const unichar_repr) const {
710  return get_ispunctuation(unichar_to_id(unichar_repr));
711  }
712 
713  // Return the character properties, eg. alpha/upper/lower/digit/punct,
714  // of the given unichar representation
715  unsigned int get_properties(const char* const unichar_repr) const {
716  return get_properties(unichar_to_id(unichar_repr));
717  }
718 
719  char get_chartype(const char* const unichar_repr) const {
720  return get_chartype(unichar_to_id(unichar_repr));
721  }
722 
723  // Return the script id of the given unichar representation. The script name
724  // for an id is available from get_script_from_script_id; that pointer will
725  // always be the same for the same script, it's managed by unicharset and thus MUST NOT be deleted
726  int get_script(const char* const unichar_repr) const {
727  return get_script(unichar_to_id(unichar_repr));
728  }
729 
730  // Return a pointer to the CHAR_FRAGMENT class if the given
731  // unichar representation represents a character fragment.
732  const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
733  if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
734  !ids.contains(unichar_repr)) {
735  return NULL;
736  }
737  return get_fragment(unichar_to_id(unichar_repr));
738  }
739 
740  // Return the isalpha property of the given unichar representation.
741  // Only the first length characters from unichar_repr are used.
742  bool get_isalpha(const char* const unichar_repr,
743  int length) const {
744  return get_isalpha(unichar_to_id(unichar_repr, length));
745  }
746 
747  // Return the islower property of the given unichar representation.
748  // Only the first length characters from unichar_repr are used.
749  bool get_islower(const char* const unichar_repr,
750  int length) const {
751  return get_islower(unichar_to_id(unichar_repr, length));
752  }
753 
754  // Return the isupper property of the given unichar representation.
755  // Only the first length characters from unichar_repr are used.
756  bool get_isupper(const char* const unichar_repr,
757  int length) const {
758  return get_isupper(unichar_to_id(unichar_repr, length));
759  }
760 
761  // Return the isdigit property of the given unichar representation.
762  // Only the first length characters from unichar_repr are used.
763  bool get_isdigit(const char* const unichar_repr,
764  int length) const {
765  return get_isdigit(unichar_to_id(unichar_repr, length));
766  }
767 
768  // Return the ispunctuation property of the given unichar representation.
769  // Only the first length characters from unichar_repr are used.
770  bool get_ispunctuation(const char* const unichar_repr,
771  int length) const {
772  return get_ispunctuation(unichar_to_id(unichar_repr, length));
773  }
774 
775  // Returns normalized version of unichar with the given unichar_id.
776  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
777    // The space code is reported as a literal " " when special codes are in use.
778    const bool special_space = unichar_id == UNICHAR_SPACE && has_special_codes();
779    return special_space ? " " : unichars[unichar_id].properties.normed.string(); }
780  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
781  // version of the given id. There may be more than one UNICHAR_ID in the
782  // vector if unichar_id represents a ligature.
783  const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
784    return unichars[unichar_id].properties.normed_ids;  // no bounds check on unichar_id
785  }
786 
787  // Return the script id of the given unichar representation.
788  // Only the first length characters from unichar_repr are used.
789  // The result is an int, not a pointer; get_script_from_script_id() turns it
790  // into the script name, which is managed by unicharset and MUST NOT be deleted
791  int get_script(const char* const unichar_repr, int length) const {
792    const UNICHAR_ID id = unichar_to_id(unichar_repr, length);  // first `length` chars only
793    return get_script(id);
794  }
795 
796  // Return the (current) number of scripts in the script table
797  int get_script_table_size() const { return script_table_size_used; }
800 
801  // Return the script string from its id
802  const char* get_script_from_script_id(int id) const {
803    // Ids outside [0, script_table_size_used) map to the null script.
804    const bool in_table = id >= 0 && id < script_table_size_used;
805    return in_table ? script_table[id] : null_script;
806  }
807 
808  // Returns the id from the name of the script, or 0 if script is not found.
809  // Note that this is an expensive operation since it involves iteratively
810  // comparing strings in the script table. To avoid dependency on STL, we
811  // won't use a hash. Instead, the calling function can use this to lookup
812  // and save the ID for relevant scripts for fast comparisons later.
813  int get_script_id_from_name(const char* script_name) const;  // declaration only; no inline body in this header
814 
815  // Return true if the given script is the null script
816  bool is_null_script(const char* script) const {
817    return null_script == script;  // compares the pointers themselves, not the text
818  }
819 
820  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
821  // then the returned value (an int, not a pointer) will be the same.
822  // The script parameter is copied and thus can be a temporary.
823  int add_script(const char* script);
824 
825  // Return the enabled property of the given unichar.
826  bool get_enabled(UNICHAR_ID unichar_id) const {
827    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
828    return props.enabled;
829  }
830  // Accessors for the script ids kept in the corresponding *_sid_ members.
831  int null_sid() const { return null_sid_; }
832  int common_sid() const { return common_sid_; }
833  int latin_sid() const { return latin_sid_; }
834  int cyrillic_sid() const { return cyrillic_sid_; }
835  int greek_sid() const { return greek_sid_; }
836  int han_sid() const { return han_sid_; }
837  int hiragana_sid() const { return hiragana_sid_; }
838  int katakana_sid() const { return katakana_sid_; }
839  int default_sid() const { return default_sid_; }  // most frequently occurring script in the charset
840 
841  // Returns true if the unicharset has the concept of upper/lower case.
842  bool script_has_upper_lower() const { return script_has_upper_lower_; }
845  // Returns true if the unicharset has the concept of x-height.
846  // script_has_xheight can be true even if script_has_upper_lower is not,
847  // when the script has a sufficiently predominant top line with ascenders,
848  // such as Devanagari and Thai.
849  bool script_has_xheight() const { return script_has_xheight_; }
852 
853  private:
854 
855  struct UNICHAR_PROPERTIES {
856  UNICHAR_PROPERTIES();
857  // Initializes all properties to sensible default values.
858  void Init();
859  // Sets all ranges wide open. Initialization default in case there are
860  // no useful values available.
861  void SetRangesOpen();
862  // Sets all ranges to empty. Used before expanding with font-based data.
863  void SetRangesEmpty();
864  // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
865  // is emtpy.
866  bool AnyRangeEmpty() const;
867  // Expands the ranges with the ranges from the src properties.
868  void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
869  // Copies the properties from src into this.
870  void CopyFrom(const UNICHAR_PROPERTIES& src);
871 
872  bool isalpha;
873  bool islower;
874  bool isupper;
875  bool isdigit;
876  bool ispunctuation;
877  bool isngram;
878  bool enabled;
879  // Possible limits of the top and bottom of the bounding box in
880  // baseline-normalized coordinates, ie, where the baseline is
881  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
882  // (See normalis.h for the definitions).
883  uinT8 min_bottom;
884  uinT8 max_bottom;
885  uinT8 min_top;
886  uinT8 max_top;
887  // Statstics of the widths of bounding box, relative to the median advance.
888  float width;
889  float width_sd;
890  // Stats of the x-bearing and advance, also relative to the median advance.
891  float bearing;
892  float bearing_sd;
893  float advance;
894  float advance_sd;
895  int script_id;
896  UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
897  Direction direction; // direction of this unichar
898  // Mirror property is useful for reverse DAWG lookup for words in
899  // right-to-left languages (e.g. "(word)" would be in
900  // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
901  // However, what we want in our DAWG is
902  // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
903  // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
904  UNICHAR_ID mirror;
905  // A string of unichar_ids that represent the corresponding normed string.
906  // For awkward characters like em-dash, this gives hyphen.
907  // For ligatures, this gives the string of normal unichars.
909  STRING normed; // normalized version of this unichar
910  // Contains meta information about the fragment if a unichar represents
911  // a fragment of a character, otherwise should be set to NULL.
912  // It is assumed that character fragments are added to the unicharset
913  // after the corresponding 'base' characters.
914  CHAR_FRAGMENT *fragment;
915  };
916 
917  struct UNICHAR_SLOT {  // one entry of the unichars table: the unichar's text plus its properties
918  char representation[UNICHAR_LEN + 1];  // NOTE(review): presumably UNICHAR_LEN bytes of text plus a terminating NUL - confirm
919  UNICHAR_PROPERTIES properties;
920  };
921 
922  // Internal recursive version of encode_string above.
923  // str is the start of the whole string.
924  // str_index is the current position in str.
925  // str_length is the length of str.
926  // encoding is a working encoding of str.
927  // lengths is a working set of lengths of each element of encoding.
928  // best_total_length is the longest length of str that has been successfully
929  // encoded so far.
930  // On return:
931  // best_encoding contains the encoding that used the longest part of str.
932  // best_lengths (may be null) contains the lengths of best_encoding.
933  void encode_string(const char* str, int str_index, int str_length,
934  GenericVector<UNICHAR_ID>* encoding,  // working state while recursing
935  GenericVector<char>* lengths,  // working state, one length per element of encoding
936  int* best_total_length,  // longest successfully encoded length of str so far
937  GenericVector<UNICHAR_ID>* best_encoding,  // output: encoding that used the longest part of str
938  GenericVector<char>* best_lengths) const;  // output: lengths of best_encoding; may be null
939 
940  // Gets the properties for a grapheme string, combining properties for
941  // multiple characters in a meaningful way where possible.
942  // Returns false if no valid match was found in the unicharset.
943  // NOTE that script_id, mirror, and other_case refer to this unicharset on
944  // return and will need redirecting if the target unicharset is different.
945  bool GetStrProperties(const char* utf8_str,
946  UNICHAR_PROPERTIES* props) const;  // props: output, receives the combined properties for utf8_str
947 
948  // Load ourselves from a "file" where our only interface to the file is
949  // an implementation of fgets(). This is the parsing primitive accessed by
950  // the public routines load_from_file() and load_from_inmemory_file().
951  bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,  // fgets()-style line reader
952  bool skip_fragments);  // NOTE(review): presumably omits fragment entries while loading - confirm in unicharset.cpp
953 
954  UNICHAR_SLOT* unichars;  // slot table, indexed by UNICHAR_ID
955  UNICHARMAP ids;  // lookup of unichar representations (queried via ids.contains())
956  int size_used;  // NOTE(review): presumably the number of slots in use in unichars - confirm
957  int size_reserved;  // NOTE(review): presumably the number of slots allocated in unichars - confirm
958  char** script_table;  // script names, indexed by script id
959  int script_table_size_used;  // number of scripts currently in script_table
960  int script_table_size_reserved;  // NOTE(review): presumably the allocated capacity of script_table - confirm
961  const char* null_script;  // returned by get_script_from_script_id() for out-of-range ids
962  // True if the unichars have their tops/bottoms set.
963  bool top_bottom_set_;
964  // True if the unicharset has significant upper/lower case chars.
965  bool script_has_upper_lower_;
966  // True if the unicharset has a significant mean-line with significant
967  // ascenders above that.
968  bool script_has_xheight_;
969 
970  // A few convenient script name-to-id mappings, kept without using a hash.
971  // These are initialized when unicharset file is loaded. Anything
972  // missing from this list can be looked up using get_script_id_from_name.
973  int null_sid_;
974  int common_sid_;
975  int latin_sid_;
976  int cyrillic_sid_;
977  int greek_sid_;
978  int han_sid_;
979  int hiragana_sid_;
980  int katakana_sid_;
981  // The most frequently occurring script in the charset.
982  int default_sid_;
983 };
984 
985 #endif // TESSERACT_CCUTIL_UNICHARSET_H__
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:948
bool is_null_script(const char *script) const
Definition: unicharset.h:816
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:503
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:554
void clear()
Definition: unicharmap.cpp:154
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:660
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
int get_script_id_from_name(const char *script_name) const
short inT16
Definition: host.h:33
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:439
bool top_bottom_useful() const
Definition: unicharset.h:495
int greek_sid() const
Definition: unicharset.h:835
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:709
bool TESS_API contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:394
inT32 length() const
Definition: strngs.cpp:196
bool load_from_file(FILE *file)
Definition: unicharset.h:361
int get_script_table_size() const
Definition: unicharset.h:797
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:131
void reserve(int unichars_number)
Definition: unicharset.cpp:179
int size() const
Definition: unicharset.h:297
int get_total() const
Definition: unicharset.h:66
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
#define TESS_API
Definition: platform.h:81
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:440
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:645
void set_ranges_empty()
Definition: unicharset.cpp:371
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:638
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:776
bool load_from_file(const char *const filename)
Definition: unicharset.h:354
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:802
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:598
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:222
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:689
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:603
unsigned char uinT8
Definition: host.h:32
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:274
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:92
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:425
int han_sid() const
Definition: unicharset.h:836
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:699
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:380
static const int kMaxLen
Definition: unicharset.h:47
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:724
bool has_special_codes() const
Definition: unicharset.h:670
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234
bool AnyRepeatedUnicodes() const
Definition: unicharset.cpp:986
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:404
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:742
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:84
STRING debug_str(const char *unichar_repr) const
Definition: unicharset.h:232
bool is_ending() const
Definition: unicharset.h:102
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:588
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:399
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:694
#define MAX_UINT8
Definition: host.h:54
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:756
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:588
static const int kMinLen
Definition: unicharset.h:45
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:435
void set_total(int t)
Definition: unicharset.h:63
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:770
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:79
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:147
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:704
void set_unichar(const char *uch)
Definition: unicharset.h:58
bool major_right_to_left() const
Definition: unicharset.cpp:931
#define UNICHAR_LEN
Definition: unichar.h:30
int cyrillic_sid() const
Definition: unicharset.h:834
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:565
void set_pos(int p)
Definition: unicharset.h:62
int get_pos() const
Definition: unicharset.h:65
const char * get_unichar() const
Definition: unicharset.h:64
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:763
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
bool TESS_API save_to_string(STRING *str) const
Definition: unicharset.cpp:661
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:726
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:581
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:144
int latin_sid() const
Definition: unicharset.h:833
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
bool save_to_file(FILE *file) const
Definition: unicharset.h:316
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:715
void delete_pointers_in_unichars()
Definition: unicharset.h:255
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:430
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
void clear()
Definition: unicharset.h:265
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:571
void TESS_API unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611
UNICHAR_ID TESS_API unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:389
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:348
int hiragana_sid() const
Definition: unicharset.h:837
bool is_natural() const
Definition: unicharset.h:107
Definition: strngs.h:44
void set_natural(bool value)
Definition: unicharset.h:108
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:294
void post_load_setup()
Definition: unicharset.cpp:867
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:484
STRING to_string() const
Definition: unicharset.h:73
bool is_beginning() const
Definition: unicharset.h:99
int common_sid() const
Definition: unicharset.h:832
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:241
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:52
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:604
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:749
int add_script(const char *script)
bool script_has_xheight() const
Definition: unicharset.h:849
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
int default_sid() const
Definition: unicharset.h:839
int null_sid() const
Definition: unicharset.h:831
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:414
bool load_from_inmemory_file(const char *const memory, int mem_size)
Definition: unicharset.h:339
bool script_has_upper_lower() const
Definition: unicharset.h:842
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:410
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:826
static const int kMaxChunks
Definition: unicharset.h:49
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:719
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:409
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:652
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:791
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
#define ASSERT_HOST(x)
Definition: errcode.h:84
static CHAR_FRAGMENT * parse_from_string(const char *str)
int step(const char *str) const
Definition: unicharset.cpp:211
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:732
int katakana_sid() const
Definition: unicharset.h:838
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:420
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:322
int UNICHAR_ID
Definition: unichar.h:33
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:540
SpecialUnicharCodes
Definition: unicharset.h:34
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:363