proxygen
Unicode.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2011-present Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <folly/Unicode.h>
18 #include <folly/Conv.h>
19 
20 namespace folly {
21 
23 
/**
 * Encodes a single Unicode code point as a UTF-8 byte sequence.
 *
 * @param cp  Code point to encode.
 * @return    A string of one to four bytes holding the UTF-8 encoding of
 *            `cp`. The string is empty when `cp` is greater than 0x10FFFF,
 *            because none of the length branches below applies. Values in
 *            the surrogate range (0xD800-0xDFFF) are not rejected; they are
 *            emitted as ordinary three-byte sequences.
 */
std::string codePointToUtf8(char32_t cp) {
  std::string result;

  // Based on description from http://en.wikipedia.org/wiki/UTF-8.

  if (cp <= 0x7f) {
    // 0xxxxxxx
    result.resize(1);
    result[0] = static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    result.resize(2);
    result[1] = static_cast<char>(0x80 | (0x3f & cp));
    result[0] = static_cast<char>(0xC0 | (cp >> 6));
  } else if (cp <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    // The marker bits are OR-ed in before the cast, as in the other branches,
    // so the narrowing to char is explicit rather than implicit.
    result.resize(3);
    result[2] = static_cast<char>(0x80 | (0x3f & cp));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[0] = static_cast<char>(0xE0 | (cp >> 12));
  } else if (cp <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result.resize(4);
    result[3] = static_cast<char>(0x80 | (0x3f & cp));
    result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
    result[0] = static_cast<char>(0xF0 | (cp >> 18));
  }

  return result;
}
51 
52 char32_t utf8ToCodePoint(
53  const unsigned char*& p,
54  const unsigned char* const e,
55  bool skipOnError) {
56  /* The following encodings are valid, except for the 5 and 6 byte
57  * combinations:
58  * 0xxxxxxx
59  * 110xxxxx 10xxxxxx
60  * 1110xxxx 10xxxxxx 10xxxxxx
61  * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
62  * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
63  * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
64  */
65 
66  const auto skip = [&] {
67  ++p;
68  return U'\ufffd';
69  };
70 
71  if (p >= e) {
72  if (skipOnError) {
73  return skip();
74  }
75  throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
76  }
77 
78  unsigned char fst = *p;
79  if (!(fst & 0x80)) {
80  // trivial case
81  return *p++;
82  }
83 
84  static const uint32_t bitMask[] = {
85  (1 << 7) - 1,
86  (1 << 11) - 1,
87  (1 << 16) - 1,
88  (1 << 21) - 1,
89  };
90 
91  // upper control bits are masked out later
92  uint32_t d = fst;
93 
94  if ((fst & 0xC0) != 0xC0) {
95  if (skipOnError) {
96  return skip();
97  }
98  throw std::runtime_error(
99  to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
100  }
101 
102  fst <<= 1;
103 
104  for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
105  const unsigned char tmp = p[i];
106 
107  if ((tmp & 0xC0) != 0x80) {
108  if (skipOnError) {
109  return skip();
110  }
111  throw std::runtime_error(to<std::string>(
112  "folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
113  }
114 
115  d = (d << 6) | (tmp & 0x3F);
116  fst <<= 1;
117 
118  if (!(fst & 0x80)) {
119  d &= bitMask[i];
120 
121  // overlong, could have been encoded with i bytes
122  if ((d & ~bitMask[i - 1]) == 0) {
123  if (skipOnError) {
124  return skip();
125  }
126  throw std::runtime_error(
127  to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
128  }
129 
130  // check for surrogates only needed for 3 bytes
131  if (i == 2) {
132  if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
133  if (skipOnError) {
134  return skip();
135  }
136  throw std::runtime_error(
137  to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
138  }
139  }
140 
141  p += i + 1;
142  return d;
143  }
144  }
145 
146  if (skipOnError) {
147  return skip();
148  }
149  throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
150 }
151 
153 
154 } // namespace folly
char32_t utf8ToCodePoint(const unsigned char *&p, const unsigned char *const e, bool skipOnError)
Definition: Unicode.cpp:52
—— Concurrent Priority Queue Implementation ——
Definition: AtomicBitSet.h:29
detail::Skip skip(size_t count)
Definition: Base-inl.h:2598
std::string codePointToUtf8(char32_t cp)
Definition: Unicode.cpp:24
const char * string
Definition: Conv.cpp:212