1 | #include "data_lang/utf8.h"
|
2 |
|
3 | #include <inttypes.h>
|
4 |
|
5 | //#include "mycpp/runtime.h"
|
6 | #include "vendor/greatest.h"
|
7 |
|
// Encode the value `uc` as UTF-8 into `dst`.
//
// Adapted from utf8proc:
// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
//
// `dst` must have room for every byte written (at most 4); no terminator is
// appended.  Returns the number of bytes written (1-4), or 0 when
// uc >= 0x110000, in which case `dst` is left untouched.
//
// Note: values in 0xD800-0xDFFF are encoded as ordinary 3-byte sequences so
// as not to change the API, even though they are invalid in UTF-8.
int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
  if (uc >= 0x110000) {
    return 0;
  }
  if (uc < 0x80) {
    // Single byte: the value is stored as-is.
    dst[0] = (uint8_t)uc;
    return 1;
  }

  // Pick the sequence length and the marker bits of the leading byte.
  int len;
  uint32_t lead;
  if (uc < 0x800) {
    len = 2;
    lead = 0xC0;
  } else if (uc < 0x10000) {
    len = 3;
    lead = 0xE0;
  } else {
    len = 4;
    lead = 0xF0;
  }

  // Continuation bytes carry 6 payload bits each; fill them from the last
  // byte backwards, consuming the low bits of the value first.
  uint32_t rest = uc;
  for (int i = len - 1; i > 0; --i) {
    dst[i] = (uint8_t)(0x80 | (rest & 0x3F));
    rest >>= 6;
  }

  // Whatever remains fits in the payload bits of the leading byte, so OR-ing
  // it in is the same as adding it.
  dst[0] = (uint8_t)(lead | rest);
  return len;
}
|
34 |
|
35 | TEST identity_test() {
|
36 | // check that decode(encode(x)) = x for all code-points (and surrogates)
|
37 | uint8_t buf[5] = {0};
|
38 | for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
|
39 | int len = utf8proc_encode_char(cp, buf);
|
40 | Utf8Result result;
|
41 | utf8_decode(buf, &result);
|
42 |
|
43 | if (cp < 0xD800 || cp > 0xDFFF) {
|
44 | ASSERT_EQ(result.error, UTF8_OK);
|
45 | } else {
|
46 | ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
|
47 | }
|
48 | ASSERT_EQ(result.codepoint, cp);
|
49 | ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
|
50 | }
|
51 |
|
52 | PASS();
|
53 | }
|
54 |
|
55 | TEST overlong_test() {
|
56 | // All encode U+41 ('A')
|
57 | Utf8Result ok, overlong2, overlong3, overlong4;
|
58 | utf8_decode((unsigned char*)"\x41", &ok);
|
59 | utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
|
60 | utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
|
61 | utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
|
62 |
|
63 | ASSERT_EQ(ok.error, UTF8_OK);
|
64 | ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
|
65 | ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
|
66 | ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
|
67 |
|
68 | ASSERT_EQ(ok.codepoint, 0x41);
|
69 | ASSERT_EQ(overlong2.codepoint, 0x41);
|
70 | ASSERT_EQ(overlong3.codepoint, 0x41);
|
71 | ASSERT_EQ(overlong4.codepoint, 0x41);
|
72 |
|
73 | ASSERT_EQ(ok.bytes_read, 1);
|
74 | ASSERT_EQ(overlong2.bytes_read, 2);
|
75 | ASSERT_EQ(overlong3.bytes_read, 3);
|
76 | ASSERT_EQ(overlong4.bytes_read, 4);
|
77 |
|
78 | PASS();
|
79 | }
|
80 |
|
81 | TEST too_large_test() {
|
82 | // Encoding of 0x111111 (via Table 3-6)
|
83 | // = 00010001 00010001 00010001
|
84 | // uuuuu -> 10001
|
85 | // zzzz -> 00001
|
86 | // yyyyyy -> 000100
|
87 | // xxxxxx -> 010001
|
88 | //
|
89 | // -> 11110100 10010001 10000100 10010001
|
90 | // = F4 91 84 91
|
91 | Utf8Result result;
|
92 | utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
|
93 |
|
94 | ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
|
95 | ASSERT_EQ(result.codepoint, 0x111111);
|
96 | ASSERT_EQ(result.bytes_read, 4);
|
97 |
|
98 | PASS();
|
99 | }
|
100 |
|
101 | TEST truncated_test() {
|
102 | Utf8Result result;
|
103 |
|
104 | constexpr const int NUM_INPUTS = 6;
|
105 | const char* inputs[NUM_INPUTS] = {
|
106 | "\xC5", "\xED", "\xED\x9F", "\xF4", "\xF4\x80", "\xF4\x80\x80",
|
107 | };
|
108 |
|
109 | for (int i = 0; i < NUM_INPUTS; i++) {
|
110 | utf8_decode((unsigned char*)inputs[i], &result);
|
111 | ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
|
112 | ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
|
113 | }
|
114 |
|
115 | PASS();
|
116 | }
|
117 |
|
// Definitions required by the greatest test framework; see vendor/greatest.h
// for what this macro expands to.
GREATEST_MAIN_DEFS();

// Test-runner entry point: runs the four UTF-8 decoder tests in the order
// listed below.
int main(int argc, char** argv) {
  // NOTE(review): presumably consumes argc/argv for the framework's
  // command-line options -- confirm in vendor/greatest.h.
  GREATEST_MAIN_BEGIN();

  RUN_TEST(identity_test);
  RUN_TEST(overlong_test);
  RUN_TEST(too_large_test);
  RUN_TEST(truncated_test);

  // NOTE(review): GREATEST_MAIN_END() may itself return from main, in which
  // case the `return 0` below is never reached -- confirm in
  // vendor/greatest.h.
  GREATEST_MAIN_END();
  return 0;
}
|