tesseract  3.05.02
UNICHAR Class Reference

#include <unichar.h>

Classes

class  const_iterator
 

Public Member Functions

 UNICHAR ()
 
 UNICHAR (const char *utf8_str, int len)
 
 UNICHAR (int unicode)
 
int first_uni () const
 
int utf8_len () const
 
const char * utf8 () const
 
char * utf8_str () const
 

Static Public Member Functions

static int utf8_step (const char *utf8_str)
 
static const_iterator begin (const char *utf8_str, const int byte_length)
 
static const_iterator end (const char *utf8_str, const int byte_length)
 
static bool UTF8ToUnicode (const char *utf8_str, GenericVector< int > *unicodes)
 

Detailed Description

Definition at line 52 of file unichar.h.

Constructor & Destructor Documentation

◆ UNICHAR() [1/3]

UNICHAR::UNICHAR ( )
inline

Definition at line 54 of file unichar.h.

54  {
55  memset(chars, 0, UNICHAR_LEN);
56  }
#define UNICHAR_LEN
Definition: unichar.h:30

◆ UNICHAR() [2/3]

UNICHAR::UNICHAR ( const char *  utf8_str,
int  len 
)

Definition at line 31 of file unichar.cpp.

31  {
32  int total_len = 0;
33  int step = 0;
34  if (len < 0) {
35  for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
36  }
37  for (total_len = 0; total_len < len; total_len += step) {
38  step = utf8_step(utf8_str + total_len);
39  if (total_len + step > UNICHAR_LEN)
40  break; // Too long.
41  if (step == 0)
42  break; // Illegal first byte.
43  int i;
44  for (i = 1; i < step; ++i)
45  if ((utf8_str[total_len + i] & 0xc0) != 0x80)
46  break;
47  if (i < step)
48  break; // Illegal surrogate
49  }
50  memcpy(chars, utf8_str, total_len);
51  if (total_len < UNICHAR_LEN) {
52  chars[UNICHAR_LEN - 1] = total_len;
53  while (total_len < UNICHAR_LEN - 1)
54  chars[total_len++] = 0;
55  }
56 }
#define UNICHAR_LEN
Definition: unichar.h:30
char * utf8_str() const
Definition: unichar.cpp:125
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

◆ UNICHAR() [3/3]

UNICHAR::UNICHAR ( int  unicode)
explicit

Definition at line 60 of file unichar.cpp.

60  {
61  const int bytemask = 0xBF;
62  const int bytemark = 0x80;
63 
64  if (unicode < 0x80) {
65  chars[UNICHAR_LEN - 1] = 1;
66  chars[2] = 0;
67  chars[1] = 0;
68  chars[0] = static_cast<char>(unicode);
69  } else if (unicode < 0x800) {
70  chars[UNICHAR_LEN - 1] = 2;
71  chars[2] = 0;
72  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
73  unicode >>= 6;
74  chars[0] = static_cast<char>(unicode | 0xc0);
75  } else if (unicode < 0x10000) {
76  chars[UNICHAR_LEN - 1] = 3;
77  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
78  unicode >>= 6;
79  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
80  unicode >>= 6;
81  chars[0] = static_cast<char>(unicode | 0xe0);
82  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
83  chars[UNICHAR_LEN - 1] = 4;
84  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
85  unicode >>= 6;
86  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
87  unicode >>= 6;
88  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
89  unicode >>= 6;
90  chars[0] = static_cast<char>(unicode | 0xf0);
91  } else {
92  memset(chars, 0, UNICHAR_LEN);
93  }
94 }
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:25
#define UNICHAR_LEN
Definition: unichar.h:30

Member Function Documentation

◆ begin()

UNICHAR::const_iterator UNICHAR::begin ( const char *  utf8_str,
const int  byte_length 
)
static

Definition at line 200 of file unichar.cpp.

200  {
201  return UNICHAR::const_iterator(utf8_str);
202 }
char * utf8_str() const
Definition: unichar.cpp:125

◆ end()

UNICHAR::const_iterator UNICHAR::end ( const char *  utf8_str,
const int  byte_length 
)
static

Definition at line 204 of file unichar.cpp.

204  {
205  return UNICHAR::const_iterator(utf8_str + len);
206 }
char * utf8_str() const
Definition: unichar.cpp:125

◆ first_uni()

int UNICHAR::first_uni ( ) const

Definition at line 97 of file unichar.cpp.

97  {
98  static const int utf8_offsets[5] = {
99  0, 0, 0x3080, 0xE2080, 0x3C82080
100  };
101  int uni = 0;
102  int len = utf8_step(chars);
103  const char* src = chars;
104 
105  switch (len) {
106  default:
107  break;
108  case 4:
109  uni += static_cast<unsigned char>(*src++);
110  uni <<= 6;
111  case 3:
112  uni += static_cast<unsigned char>(*src++);
113  uni <<= 6;
114  case 2:
115  uni += static_cast<unsigned char>(*src++);
116  uni <<= 6;
117  case 1:
118  uni += static_cast<unsigned char>(*src++);
119  }
120  uni -= utf8_offsets[len];
121  return uni;
122 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134

◆ utf8()

const char* UNICHAR::utf8 ( ) const
inline

Definition at line 78 of file unichar.h.

78  {
79  return chars;
80  }

◆ utf8_len()

int UNICHAR::utf8_len ( ) const
inline

Definition at line 72 of file unichar.h.

72  {
73  int len = chars[UNICHAR_LEN - 1];
74  return len >=0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
75  }
#define UNICHAR_LEN
Definition: unichar.h:30

◆ utf8_step()

int UNICHAR::utf8_step ( const char *  utf8_str)
static

Definition at line 134 of file unichar.cpp.

134  {
135  static const char utf8_bytes[256] = {
136  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
137  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
138  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
140  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
141  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
142  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
143  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
144  };
145 
146  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
147 }

◆ utf8_str()

char * UNICHAR::utf8_str ( ) const

Definition at line 125 of file unichar.cpp.

125  {
126  int len = utf8_len();
127  char* str = new char[len + 1];
128  memcpy(str, chars, len);
129  str[len] = 0;
130  return str;
131 }
int utf8_len() const
Definition: unichar.h:72

◆ UTF8ToUnicode()

bool UNICHAR::UTF8ToUnicode ( const char *  utf8_str,
GenericVector< int > *  unicodes 
)
static

Definition at line 211 of file unichar.cpp.

212  {
213  const int utf8_length = strlen(utf8_str);
214  const_iterator end_it(end(utf8_str, utf8_length));
215  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
216  if (it.is_legal()) {
217  unicodes->push_back(*it);
218  } else {
219  unicodes->push_back(' ');
220  return false;
221  }
222  }
223  return true;
224 }
int push_back(T object)
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
char * utf8_str() const
Definition: unichar.cpp:125

The documentation for this class was generated from the following files: