proxygen
UnicodeTest.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2018-present Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
#include <folly/Unicode.h>

#include <initializer_list>
#include <stdexcept>

#include <folly/Range.h>
#include <folly/portability/GTest.h>
23 
25 
26 void testValid(std::initializer_list<unsigned char> data, char32_t expected) {
27  {
28  const unsigned char* p = data.begin();
29  const unsigned char* e = data.end();
30  EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ false), expected)
32  (const char*)data.begin(), (const char*)data.end());
33  }
34  {
35  const unsigned char* p = data.begin();
36  const unsigned char* e = data.end();
37  EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), expected)
39  (const char*)data.begin(), (const char*)data.end());
40  }
41 }
42 
43 void testInvalid(std::initializer_list<unsigned char> data) {
44  {
45  const unsigned char* p = data.begin();
46  const unsigned char* e = data.end();
48  utf8ToCodePoint(p, e, /* skipOnError */ false), std::runtime_error)
50  (const char*)data.begin(), (const char*)data.end());
51  }
52  {
53  const unsigned char* p = data.begin();
54  const unsigned char* e = data.end();
55  EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), 0xfffd)
57  (const char*)data.begin(), (const char*)data.end());
58  }
59 }
60 
61 TEST(InvalidUtf8ToCodePoint, rfc3629Overlong) {
62  // https://tools.ietf.org/html/rfc3629
63  // Implementations of the decoding algorithm above MUST protect against
64  // decoding invalid sequences. For instance, a naive implementation may
65  // decode the overlong UTF-8 sequence C0 80 into the character U+0000 [...]
66  // Decoding invalid sequences may have security consequences or cause other
67  // problems.
68  testInvalid({0xC0, 0x80});
69 }
70 
71 TEST(InvalidUtf8ToCodePoint, rfc3629SurrogatePair) {
72  // https://tools.ietf.org/html/rfc3629
73  // Implementations of the decoding algorithm above MUST protect against
74  // decoding invalid sequences. For instance, a naive implementation may
75  // decode [...] the surrogate pair ED A1 8C ED BE B4 into U+233B4.
76  // Decoding invalid sequences may have security consequences or cause other
77  // problems.
78  testInvalid({0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4});
79 }
80 
81 TEST(InvalidUtf8ToCodePoint, MarkusKuhnSingleUTF16Surrogates) {
82  // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
83  // 5.1.1 U+D800 = ed a0 80
84  // 5.1.2 U+DB7F = ed ad bf
85  // 5.1.3 U+DB80 = ed ae 80
86  // 5.1.4 U+DBFF = ed af bf
87  // 5.1.5 U+DC00 = ed b0 80
88  // 5.1.6 U+DF80 = ed be 80
89  // 5.1.7 U+DFFF = ed bf bf
90  testInvalid({0xed, 0xa0, 0x80});
91  testInvalid({0xed, 0xad, 0xbf});
92  testInvalid({0xed, 0xae, 0x80});
93  testInvalid({0xed, 0xaf, 0xbf});
94  testInvalid({0xed, 0xb0, 0x80});
95  testInvalid({0xed, 0xbe, 0x80});
96  testInvalid({0xed, 0xbf, 0xbf});
97 }
98 
99 TEST(InvalidUtf8ToCodePoint, MarkusKuhnPairedUTF16Surrogates) {
100  // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
101  // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
102  // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
103  // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
104  // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
105  // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
106  // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
107  // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
108  // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
109  testInvalid({0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80});
110  testInvalid({0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf});
111  testInvalid({0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80});
112  testInvalid({0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf});
113  testInvalid({0xed, 0xae, 0x80, 0xed, 0xb0, 0x80});
114  testInvalid({0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf});
115  testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80});
116  testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf});
117 }
118 
119 TEST(ValidUtf8ToCodePoint, FourCloverLeaf) {
120  testValid({0xF0, 0x9F, 0x8D, 0x80}, 0x1F340); // u8"\U0001F340";
121 }
122 
123 TEST(InvalidUtf8ToCodePoint, FourCloverLeafAsSurrogates) {
124  testInvalid({0xd8, 0x3c, 0xdf, 0x40}); // u8"\U0001F340";
125 }
126 
127 TEST(ValidUtf8ToCodePoint, LastCodePoint) {
128  testValid({0xF4, 0x8F, 0xBF, 0xBF}, 0x10FFFF); // u8"\U0010FFFF";
129 }
#define EXPECT_THROW(statement, expected_exception)
Definition: gtest.h:1843
void testValid(std::initializer_list< unsigned char > data, char32_t expected)
Definition: UnicodeTest.cpp:26
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:1922
char32_t utf8ToCodePoint(const unsigned char *&p, const unsigned char *const e, bool skipOnError)
Definition: Unicode.cpp:52
constexpr Iter begin() const
Definition: Range.h:452
TEST(InvalidUtf8ToCodePoint, rfc3629Overlong)
Definition: UnicodeTest.cpp:61
Range< const char * > StringPiece
void testInvalid(std::initializer_list< unsigned char > data)
Definition: UnicodeTest.cpp:43
static constexpr uint64_t data[1]
Definition: Fingerprint.cpp:43