proxygen
UnicodeTest.cpp File Reference
#include <folly/Unicode.h>
#include <initializer_list>
#include <stdexcept>
#include <folly/Range.h>
#include <folly/portability/GTest.h>

Go to the source code of this file.

Functions

void testValid (std::initializer_list< unsigned char > data, char32_t expected)
 
void testInvalid (std::initializer_list< unsigned char > data)
 
 TEST (InvalidUtf8ToCodePoint, rfc3629Overlong)
 
 TEST (InvalidUtf8ToCodePoint, rfc3629SurrogatePair)
 
 TEST (InvalidUtf8ToCodePoint, MarkusKuhnSingleUTF16Surrogates)
 
 TEST (InvalidUtf8ToCodePoint, MarkusKuhnPairedUTF16Surrogates)
 
 TEST (ValidUtf8ToCodePoint, FourCloverLeaf)
 
 TEST (InvalidUtf8ToCodePoint, FourCloverLeafAsSurrogates)
 
 TEST (ValidUtf8ToCodePoint, LastCodePoint)
 

Function Documentation

TEST ( InvalidUtf8ToCodePoint  ,
rfc3629Overlong   
)

Definition at line 61 of file UnicodeTest.cpp.

References testInvalid().

61  {
62  // https://tools.ietf.org/html/rfc3629
63  // Implementations of the decoding algorithm above MUST protect against
64  // decoding invalid sequences. For instance, a naive implementation may
65  // decode the overlong UTF-8 sequence C0 80 into the character U+0000 [...]
66  // Decoding invalid sequences may have security consequences or cause other
67  // problems.
68  testInvalid({0xC0, 0x80});
69 }
void testInvalid(std::initializer_list< unsigned char > data)
Definition: UnicodeTest.cpp:43
TEST ( InvalidUtf8ToCodePoint  ,
rfc3629SurrogatePair   
)

Definition at line 71 of file UnicodeTest.cpp.

References testInvalid().

71  {
72  // https://tools.ietf.org/html/rfc3629
73  // Implementations of the decoding algorithm above MUST protect against
74  // decoding invalid sequences. For instance, a naive implementation may
75  // decode [...] the surrogate pair ED A1 8C ED BE B4 into U+233B4.
76  // Decoding invalid sequences may have security consequences or cause other
77  // problems.
78  testInvalid({0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4});
79 }
void testInvalid(std::initializer_list< unsigned char > data)
Definition: UnicodeTest.cpp:43
TEST ( InvalidUtf8ToCodePoint  ,
MarkusKuhnSingleUTF16Surrogates   
)

Definition at line 81 of file UnicodeTest.cpp.

References testInvalid().

81  {
82  // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
83  // 5.1.1 U+D800 = ed a0 80
84  // 5.1.2 U+DB7F = ed ad bf
85  // 5.1.3 U+DB80 = ed ae 80
86  // 5.1.4 U+DBFF = ed af bf
87  // 5.1.5 U+DC00 = ed b0 80
88  // 5.1.6 U+DF80 = ed be 80
89  // 5.1.7 U+DFFF = ed bf bf
90  testInvalid({0xed, 0xa0, 0x80});
91  testInvalid({0xed, 0xad, 0xbf});
92  testInvalid({0xed, 0xae, 0x80});
93  testInvalid({0xed, 0xaf, 0xbf});
94  testInvalid({0xed, 0xb0, 0x80});
95  testInvalid({0xed, 0xbe, 0x80});
96  testInvalid({0xed, 0xbf, 0xbf});
97 }
void testInvalid(std::initializer_list< unsigned char > data)
Definition: UnicodeTest.cpp:43
TEST ( InvalidUtf8ToCodePoint  ,
MarkusKuhnPairedUTF16Surrogates   
)

Definition at line 99 of file UnicodeTest.cpp.

References testInvalid().

99  {
100  // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
101  // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
102  // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
103  // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
104  // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
105  // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
106  // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
107  // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
108  // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
109  testInvalid({0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80});
110  testInvalid({0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf});
111  testInvalid({0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80});
112  testInvalid({0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf});
113  testInvalid({0xed, 0xae, 0x80, 0xed, 0xb0, 0x80});
114  testInvalid({0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf});
115  testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80});
116  testInvalid({0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf});
117 }
void testInvalid(std::initializer_list< unsigned char > data)
Definition: UnicodeTest.cpp:43
TEST ( ValidUtf8ToCodePoint  ,
FourCloverLeaf   
)

Definition at line 119 of file UnicodeTest.cpp.

References testValid().

119  {
120  testValid({0xF0, 0x9F, 0x8D, 0x80}, 0x1F340); // u8"\U0001F340";
121 }
void testValid(std::initializer_list< unsigned char > data, char32_t expected)
Definition: UnicodeTest.cpp:26
TEST ( InvalidUtf8ToCodePoint  ,
FourCloverLeafAsSurrogates   
)

Definition at line 123 of file UnicodeTest.cpp.

References testInvalid().

123  {
124  testInvalid({0xd8, 0x3c, 0xdf, 0x40}); // u8"\U0001F340";
125 }
void testInvalid(std::initializer_list< unsigned char > data)
Definition: UnicodeTest.cpp:43
TEST ( ValidUtf8ToCodePoint  ,
LastCodePoint   
)

Definition at line 127 of file UnicodeTest.cpp.

References testValid().

127  {
128  testValid({0xF4, 0x8F, 0xBF, 0xBF}, 0x10FFFF); // u8"\U0010FFFF";
129 }
void testValid(std::initializer_list< unsigned char > data, char32_t expected)
Definition: UnicodeTest.cpp:26
void testInvalid ( std::initializer_list< unsigned char >  data)

Definition at line 43 of file UnicodeTest.cpp.

References folly::Range< Iter >::begin(), EXPECT_EQ, EXPECT_THROW, and folly::utf8ToCodePoint().

Referenced by TEST().

43  {
44  {
45  const unsigned char* p = data.begin();
46  const unsigned char* e = data.end();
47  EXPECT_THROW(
48  utf8ToCodePoint(p, e, /* skipOnError */ false), std::runtime_error)
49  << folly::StringPiece(
50  (const char*)data.begin(), (const char*)data.end());
51  }
52  {
53  const unsigned char* p = data.begin();
54  const unsigned char* e = data.end();
55  EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), 0xfffd)
56  << folly::StringPiece(
57  (const char*)data.begin(), (const char*)data.end());
58  }
59 }
#define EXPECT_THROW(statement, expected_exception)
Definition: gtest.h:1843
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:1922
char32_t utf8ToCodePoint(const unsigned char *&p, const unsigned char *const e, bool skipOnError)
Definition: Unicode.cpp:52
Range< const char * > StringPiece
static constexpr uint64_t data[1]
Definition: Fingerprint.cpp:43
void testValid ( std::initializer_list< unsigned char >  data,
char32_t  expected 
)

Definition at line 26 of file UnicodeTest.cpp.

References folly::Range< Iter >::begin(), EXPECT_EQ, and folly::utf8ToCodePoint().

Referenced by TEST().

26  {
27  {
28  const unsigned char* p = data.begin();
29  const unsigned char* e = data.end();
30  EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ false), expected)
31  << folly::StringPiece(
32  (const char*)data.begin(), (const char*)data.end());
33  }
34  {
35  const unsigned char* p = data.begin();
36  const unsigned char* e = data.end();
37  EXPECT_EQ(utf8ToCodePoint(p, e, /* skipOnError */ true), expected)
38  << folly::StringPiece(
39  (const char*)data.begin(), (const char*)data.end());
40  }
41 }
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:1922
char32_t utf8ToCodePoint(const unsigned char *&p, const unsigned char *const e, bool skipOnError)
Definition: Unicode.cpp:52
Range< const char * > StringPiece
static constexpr uint64_t data[1]
Definition: Fingerprint.cpp:43