25 #include "unicode/normalizer2.h" 26 #include "unicode/translit.h" 27 #include "unicode/unorm2.h" 33 str32->
reserve(strlen(utf8_str));
34 int len = strlen(utf8_str);
36 for (
int ch = 0; ch < len; ch += step) {
39 UNICHAR uni_ch(utf8_str + ch, step);
48 for (
int i = 0; i < str32.
length(); ++i) {
59 static const int kNumHyphenPuncUnicodes = 13;
60 static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
62 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015,
70 for (
int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
71 if (kHyphenPuncUnicodes[i] == ch)
78 static const int kNumSingleQuoteUnicodes = 8;
79 static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
90 for (
int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
91 if (kSingleQuoteUnicodes[i] == ch)
98 static const int kNumDoubleQuoteUnicodes = 8;
99 static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
109 for (
int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
110 if (kDoubleQuoteUnicodes[i] == ch)
119 for (
int i = 0; i < str32.
length(); ++i) {
122 for (
int j = 0; j < norm_str.
length(); ++j) {
133 const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
134 NULL,
"nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE, error_code);
135 error_code.assertSuccess();
138 icu::UnicodeString uch_str(static_cast<UChar32>(ch));
139 icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
140 error_code.assertSuccess();
143 for (
int i = 0; i < norm_str.length(); ++i) {
145 if (norm_str[i] ==
' ') {
172 return (static_cast<uinT32>(ch) < 0xD800)
173 || (ch >= 0xE000 && ch <= 0x10FFFF);
178 "Invalid Unicode codepoint: 0x%x\n", ch);
179 return u_isUWhiteSpace(static_cast<UChar32>(ch));
192 n_white += it.utf8_len();
203 n_notwhite += it.utf8_len();
210 !(ch >= 0xFDD0 && ch <= 0xFDEF) &&
211 !(ch >= 0xFFFE && ch <= 0xFFFF) &&
212 !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
213 !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
214 !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
215 !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
216 !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
217 !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
218 !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
219 !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
220 !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
221 !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
222 !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
223 !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
224 !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
225 !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
226 !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
227 !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
228 (!u_isISOControl(static_cast<UChar32>(ch)) ||
229 ch ==
'\n' || ch ==
'\f' || ch ==
'\t' || ch ==
'\r');
235 (!u_isISOControl(static_cast<UChar32>(ch)) ||
236 ch ==
'\n' || ch ==
'\f' || ch ==
'\t' || ch ==
'\r');
242 if (ch != 0x3000)
return ch;
245 if (ch == 0xFF5F)
return 0x2985;
246 if (ch == 0xFF60)
return 0x2986;
249 icu::UnicodeString uch_str(static_cast<UChar32>(ch));
250 const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
251 "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
252 error_code.assertSuccess();
255 fulltohalf->transliterate(uch_str);
bool is_double_quote(const char32 ch)
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
void ensure(inT32 min_capacity)
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
void NormalizeChar32(char32 ch, bool decompose, GenericVector< char32 > *str)
bool IsUTF8Whitespace(const char *text)
int SpanUTF8NotWhitespace(const char *text)
STRING NormalizeUTF8String(bool decompose, const char *str8)
char32 OCRNormalize(char32 ch)
bool IsWhitespace(const char32 ch)
static const_iterator end(const char *utf8_str, const int byte_length)
bool IsInterchangeValid7BitAscii(const char32 ch)
bool is_single_quote(const char32 ch)
#define ASSERT_HOST_MSG(x,...)
void assign(const char *cstr, int len)
static const_iterator begin(const char *utf8_str, const int byte_length)
bool IsInterchangeValid(const char32 ch)
int SpanUTF8Whitespace(const char *text)
char32 FullwidthToHalfwidth(const char32 ch)
bool is_hyphen_punc(const char32 ch)
bool IsValidCodepoint(const char32 ch)
bool IsOCREquivalent(char32 ch1, char32 ch2)
static int utf8_step(const char *utf8_str)