data_lang/utf8

OILS / data_lang / utf8_test.cc View on Github | oilshell.org

130 lines, 83 significant

1	#include "data_lang/utf8.h"
2
3	#include <inttypes.h>
4
5	//#include "mycpp/runtime.h"
6	#include "vendor/greatest.h"
7
// Encode the code point `uc` as UTF-8 into `dst`.
//
// Returns the number of bytes written (1 to 4), or 0 when uc >= 0x110000, in
// which case dst is left untouched.  No terminator is written, so dst needs
// room for up to 4 bytes.
//
// Note: values in 0xD800-0xDFFF (surrogates) are still encoded as 3-byte
// sequences, mirroring the upstream API, even though they are invalid in
// UTF-8.
//
// Adapted from utf8proc:
// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
  if (uc >= 0x110000) {
    return 0;
  }

  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }

  // Every multi-byte form ends with the low 6 bits in a 10xxxxxx byte.
  const uint8_t low6 = (uint8_t)(0x80 | (uc & 0x3F));
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = low6;
    return 2;
  }

  // 3- and 4-byte forms share the continuation byte holding bits 6..11.
  const uint8_t mid6 = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = mid6;
    dst[2] = low6;
    return 3;
  }

  dst[0] = (uint8_t)(0xF0 | (uc >> 18));
  dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
  dst[2] = mid6;
  dst[3] = low6;
  return 4;
}
34
35	TEST identity_test() {
36	// check that decode(encode(x)) = x for all code-points (and surrogates)
37	uint8_t buf[5] = {0};
38	for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
39	int len = utf8proc_encode_char(cp, buf);
40	Utf8Result result;
41	utf8_decode(buf, &result);
42
43	if (cp < 0xD800 \|\| cp > 0xDFFF) {
44	ASSERT_EQ(result.error, UTF8_OK);
45	} else {
46	ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
47	}
48	ASSERT_EQ(result.codepoint, cp);
49	ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
50	}
51
52	PASS();
53	}
54
55	TEST overlong_test() {
56	// All encode U+41 ('A')
57	Utf8Result ok, overlong2, overlong3, overlong4;
58	utf8_decode((unsigned char*)"\x41", &ok);
59	utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
60	utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
61	utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
62
63	ASSERT_EQ(ok.error, UTF8_OK);
64	ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
65	ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
66	ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
67
68	ASSERT_EQ(ok.codepoint, 0x41);
69	ASSERT_EQ(overlong2.codepoint, 0x41);
70	ASSERT_EQ(overlong3.codepoint, 0x41);
71	ASSERT_EQ(overlong4.codepoint, 0x41);
72
73	ASSERT_EQ(ok.bytes_read, 1);
74	ASSERT_EQ(overlong2.bytes_read, 2);
75	ASSERT_EQ(overlong3.bytes_read, 3);
76	ASSERT_EQ(overlong4.bytes_read, 4);
77
78	PASS();
79	}
80
81	TEST too_large_test() {
82	// Encoding of 0x111111 (via Table 3-6)
83	// = 00010001 00010001 00010001
84	// uuuuu -> 10001
85	// zzzz -> 00001
86	// yyyyyy -> 000100
87	// xxxxxx -> 010001
88	//
89	// -> 11110100 10010001 10000100 10010001
90	// = F4 91 84 91
91	Utf8Result result;
92	utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
93
94	ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
95	ASSERT_EQ(result.codepoint, 0x111111);
96	ASSERT_EQ(result.bytes_read, 4);
97
98	PASS();
99	}
100
101	TEST truncated_test() {
102	Utf8Result result;
103
104	constexpr const int NUM_INPUTS = 6;
105	const char* inputs[NUM_INPUTS] = {
106	"\xC5", "\xED", "\xED\x9F", "\xF4", "\xF4\x80", "\xF4\x80\x80",
107	};
108
109	for (int i = 0; i < NUM_INPUTS; i++) {
110	utf8_decode((unsigned char*)inputs[i], &result);
111	ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
112	ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
113	}
114
115	PASS();
116	}
117
118	GREATEST_MAIN_DEFS();
119
120	int main(int argc, char** argv) {
121	GREATEST_MAIN_BEGIN();
122
123	RUN_TEST(identity_test);
124	RUN_TEST(overlong_test);
125	RUN_TEST(too_large_test);
126	RUN_TEST(truncated_test);
127
128	GREATEST_MAIN_END();
129	return 0;
130	}