OILS / data_lang / utf8_test.cc View on Github | oilshell.org

130 lines, 83 significant
1#include "data_lang/utf8.h"
2
3#include <inttypes.h>
4
5//#include "mycpp/runtime.h"
6#include "vendor/greatest.h"
7
// Encoder borrowed from utf8proc, used here only to generate test inputs:
// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
//
// Writes the UTF-8 encoding of `uc` to `dst` and returns the number of bytes
// written (1-4).  Returns 0 and leaves `dst` untouched when `uc` is 0x110000
// or larger.  `dst` must have room for 4 bytes; no terminator is appended.
int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
  // Builds a continuation byte (10xxxxxx) from the low six bits of `bits`.
  const auto continuation = [](uint32_t bits) -> uint8_t {
    return static_cast<uint8_t>(0x80 | (bits & 0x3F));
  };

  if (uc >= 0x110000) {
    return 0;
  }

  if (uc < 0x80) {
    dst[0] = static_cast<uint8_t>(uc);
    return 1;
  }

  if (uc < 0x800) {
    dst[0] = static_cast<uint8_t>(0xC0 | (uc >> 6));
    dst[1] = continuation(uc);
    return 2;
  }

  if (uc < 0x10000) {
    // Note: 0xD800-0xDFFF (surrogates) are encoded here as well, so as not to
    // change the API, even though they are invalid in UTF-8.
    dst[0] = static_cast<uint8_t>(0xE0 | (uc >> 12));
    dst[1] = continuation(uc >> 6);
    dst[2] = continuation(uc);
    return 3;
  }

  dst[0] = static_cast<uint8_t>(0xF0 | (uc >> 18));
  dst[1] = continuation(uc >> 12);
  dst[2] = continuation(uc >> 6);
  dst[3] = continuation(uc);
  return 4;
}
34
35TEST identity_test() {
36 // check that decode(encode(x)) = x for all code-points (and surrogates)
37 uint8_t buf[5] = {0};
38 for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
39 int len = utf8proc_encode_char(cp, buf);
40 Utf8Result result;
41 utf8_decode(buf, &result);
42
43 if (cp < 0xD800 || cp > 0xDFFF) {
44 ASSERT_EQ(result.error, UTF8_OK);
45 } else {
46 ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
47 }
48 ASSERT_EQ(result.codepoint, cp);
49 ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
50 }
51
52 PASS();
53}
54
55TEST overlong_test() {
56 // All encode U+41 ('A')
57 Utf8Result ok, overlong2, overlong3, overlong4;
58 utf8_decode((unsigned char*)"\x41", &ok);
59 utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
60 utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
61 utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
62
63 ASSERT_EQ(ok.error, UTF8_OK);
64 ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
65 ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
66 ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
67
68 ASSERT_EQ(ok.codepoint, 0x41);
69 ASSERT_EQ(overlong2.codepoint, 0x41);
70 ASSERT_EQ(overlong3.codepoint, 0x41);
71 ASSERT_EQ(overlong4.codepoint, 0x41);
72
73 ASSERT_EQ(ok.bytes_read, 1);
74 ASSERT_EQ(overlong2.bytes_read, 2);
75 ASSERT_EQ(overlong3.bytes_read, 3);
76 ASSERT_EQ(overlong4.bytes_read, 4);
77
78 PASS();
79}
80
81TEST too_large_test() {
82 // Encoding of 0x111111 (via Table 3-6)
83 // = 00010001 00010001 00010001
84 // uuuuu -> 10001
85 // zzzz -> 00001
86 // yyyyyy -> 000100
87 // xxxxxx -> 010001
88 //
89 // -> 11110100 10010001 10000100 10010001
90 // = F4 91 84 91
91 Utf8Result result;
92 utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
93
94 ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
95 ASSERT_EQ(result.codepoint, 0x111111);
96 ASSERT_EQ(result.bytes_read, 4);
97
98 PASS();
99}
100
101TEST truncated_test() {
102 Utf8Result result;
103
104 constexpr const int NUM_INPUTS = 6;
105 const char* inputs[NUM_INPUTS] = {
106 "\xC5", "\xED", "\xED\x9F", "\xF4", "\xF4\x80", "\xF4\x80\x80",
107 };
108
109 for (int i = 0; i < NUM_INPUTS; i++) {
110 utf8_decode((unsigned char*)inputs[i], &result);
111 ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
112 ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
113 }
114
115 PASS();
116}
117
118GREATEST_MAIN_DEFS();
119
120int main(int argc, char** argv) {
121 GREATEST_MAIN_BEGIN();
122
123 RUN_TEST(identity_test);
124 RUN_TEST(overlong_test);
125 RUN_TEST(too_large_test);
126 RUN_TEST(truncated_test);
127
128 GREATEST_MAIN_END();
129 return 0;
130}