tesseract  3.05.02
unichar.cpp
Go to the documentation of this file.
1 // File: unichar.cpp
3 // Description: Unicode character/ligature class.
4 // Author: Ray Smith
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "unichar.h"
21 #include "errcode.h"
22 #include "genericvector.h"
23 #include "tprintf.h"
24 
// Inclusive upper bound on the values UNICHAR(int) will encode; anything
// greater produces an empty UNICHAR.
25 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
26 
27 // Construct from a utf8 string. If len<0 then the string is null terminated.
28 // If the string is too long to fit in the UNICHAR then it takes only what
29 // will fit. Checks for illegal input and stops at an illegal sequence.
30 // The resulting UNICHAR may be empty.
31 UNICHAR::UNICHAR(const char* utf8_str, int len) {
32  int total_len = 0;
33  int step = 0;
34  if (len < 0) {
35  for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
36  }
37  for (total_len = 0; total_len < len; total_len += step) {
38  step = utf8_step(utf8_str + total_len);
39  if (total_len + step > UNICHAR_LEN)
40  break; // Too long.
41  if (step == 0)
42  break; // Illegal first byte.
43  int i;
44  for (i = 1; i < step; ++i)
45  if ((utf8_str[total_len + i] & 0xc0) != 0x80)
46  break;
47  if (i < step)
48  break; // Illegal surrogate
49  }
50  memcpy(chars, utf8_str, total_len);
51  if (total_len < UNICHAR_LEN) {
52  chars[UNICHAR_LEN - 1] = total_len;
53  while (total_len < UNICHAR_LEN - 1)
54  chars[total_len++] = 0;
55  }
56 }
57 
58 // Construct from a single UCS4 character. Illegal values are ignored,
59 // resulting in an empty UNICHAR.
60 UNICHAR::UNICHAR(int unicode) {
61  const int bytemask = 0xBF;
62  const int bytemark = 0x80;
63 
64  if (unicode < 0x80) {
65  chars[UNICHAR_LEN - 1] = 1;
66  chars[2] = 0;
67  chars[1] = 0;
68  chars[0] = static_cast<char>(unicode);
69  } else if (unicode < 0x800) {
70  chars[UNICHAR_LEN - 1] = 2;
71  chars[2] = 0;
72  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
73  unicode >>= 6;
74  chars[0] = static_cast<char>(unicode | 0xc0);
75  } else if (unicode < 0x10000) {
76  chars[UNICHAR_LEN - 1] = 3;
77  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
78  unicode >>= 6;
79  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
80  unicode >>= 6;
81  chars[0] = static_cast<char>(unicode | 0xe0);
82  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
83  chars[UNICHAR_LEN - 1] = 4;
84  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
85  unicode >>= 6;
86  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
87  unicode >>= 6;
88  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
89  unicode >>= 6;
90  chars[0] = static_cast<char>(unicode | 0xf0);
91  } else {
92  memset(chars, 0, UNICHAR_LEN);
93  }
94 }
95 
96 // Get the first character as UCS-4.
97 int UNICHAR::first_uni() const {
98  static const int utf8_offsets[5] = {
99  0, 0, 0x3080, 0xE2080, 0x3C82080
100  };
101  int uni = 0;
102  int len = utf8_step(chars);
103  const char* src = chars;
104 
105  switch (len) {
106  default:
107  break;
108  case 4:
109  uni += static_cast<unsigned char>(*src++);
110  uni <<= 6;
111  case 3:
112  uni += static_cast<unsigned char>(*src++);
113  uni <<= 6;
114  case 2:
115  uni += static_cast<unsigned char>(*src++);
116  uni <<= 6;
117  case 1:
118  uni += static_cast<unsigned char>(*src++);
119  }
120  uni -= utf8_offsets[len];
121  return uni;
122 }
123 
124 // Get a terminated UTF8 string: Must delete[] it after use.
125 char* UNICHAR::utf8_str() const {
126  int len = utf8_len();
127  char* str = new char[len + 1];
128  memcpy(str, chars, len);
129  str[len] = 0;
130  return str;
131 }
132 
133 // Get the number of bytes in the first character of the given utf8 string.
134 int UNICHAR::utf8_step(const char* utf8_str) {
135  static const char utf8_bytes[256] = {
136  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
137  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
138  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
140  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
141  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
142  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
143  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
144  };
145 
146  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
147 }
148 
150  ASSERT_HOST(it_ != NULL);
151  int step = utf8_step(it_);
152  if (step == 0) {
153  tprintf("ERROR: Illegal UTF8 encountered.\n");
154  for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
155  tprintf("Index %d char = 0x%x\n", i, it_[i]);
156  }
157  step = 1;
158  }
159  it_ += step;
160  return *this;
161 }
162 
164  ASSERT_HOST(it_ != NULL);
165  const int len = utf8_step(it_);
166  if (len == 0) {
167  tprintf("WARNING: Illegal UTF8 encountered\n");
168  return ' ';
169  }
170  UNICHAR uch(it_, len);
171  return uch.first_uni();
172 }
173 
174 int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
175  ASSERT_HOST(it_ != NULL);
176  const int len = utf8_step(it_);
177  if (len == 0) {
178  tprintf("WARNING: Illegal UTF8 encountered\n");
179  utf8_output[0] = ' ';
180  return 1;
181  }
182  strncpy(utf8_output, it_, len);
183  return len;
184 }
185 
187  ASSERT_HOST(it_ != NULL);
188  const int len = utf8_step(it_);
189  if (len == 0) {
190  tprintf("WARNING: Illegal UTF8 encountered\n");
191  return 1;
192  }
193  return len;
194 }
195 
197  return utf8_step(it_) > 0;
198 }
199 
200 UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
202 }
203 
204 UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
205  return UNICHAR::const_iterator(utf8_str + len);
206 }
207 
208 // Converts a utf-8 string to a vector of unicodes.
209 // Returns false if the input contains invalid UTF-8, and replaces
210 // the rest of the string with a single space.
212  GenericVector<int>* unicodes) {
213  const int utf8_length = strlen(utf8_str);
214  const_iterator end_it(end(utf8_str, utf8_length));
215  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
216  if (it.is_legal()) {
217  unicodes->push_back(*it);
218  } else {
219  unicodes->push_back(' ');
220  return false;
221  }
222  }
223  return true;
224 }
225 
int first_uni() const
Definition: unichar.cpp:97
bool is_legal() const
Definition: unichar.cpp:196
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
const_iterator & operator++()
Definition: unichar.cpp:149
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:25
int push_back(T object)
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
#define UNICHAR_LEN
Definition: unichar.h:30
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
int utf8_len() const
Definition: unichar.h:72
int get_utf8(char *buf) const
Definition: unichar.cpp:174
#define tprintf(...)
Definition: tprintf.h:31
int utf8_len() const
Definition: unichar.cpp:186
UNICHAR()
Definition: unichar.h:54
char * utf8_str() const
Definition: unichar.cpp:125
int operator*() const
Definition: unichar.cpp:163
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
#define ASSERT_HOST(x)
Definition: errcode.h:84