data_lang/j8

OILS / data_lang / j8_test.cc View on Github | oilshell.org

220 lines, 131 significant

1	#include "data_lang/j8.h"
2
3	#include <string>
4
5	#include "data_lang/j8_test_lib.h"
6	#include "vendor/greatest.h"
7
// Naive output buffer handed to EncodeNaive().  It stays a plain aggregate so
// that `Buf buf = {0};` keeps working.
struct Buf {
  unsigned char* data;  // start of the output bytes
  int capacity;         // number of bytes available at `data`
  int len;              // number of bytes written; set by EncodeNaive()
};
14
15	void EncodeNaive(char* s, int n, Buf* buf, int j8_fallback) {
16	char* orig_s = s; // save for rewinding
17
18	unsigned char* in = (unsigned char*)s;
19	unsigned char* input_end = (unsigned char*)s + n;
20
21	unsigned char* out = buf->data; // mutated
22	unsigned char* orig_out = buf->data; // not mutated
23
24	unsigned char** p_out = &out; // for J8_OUT()
25
26	J8_OUT('"');
27	// printf("in %p out %p\n", in, out);
28
29	int invalid_utf8 = 0;
30	while (in < input_end) {
31	// printf("1 in %p out %p\n", in, out);
32
33	// TODO: check *out vs. capacity and maybe grow buffer
34	invalid_utf8 = J8EncodeOne(&in, &out, 0); // JSON escaping
35
36	// Try again with J8 escaping
37	if (invalid_utf8 && j8_fallback) {
38	in = (unsigned char*)orig_s;
39	out = orig_out;
40
41	J8_OUT('b');
42	J8_OUT('\'');
43
44	// TODO: check *out vs. capacity and maybe grow buffer
45
46	while (in < input_end) {
47	// printf("2 in %p out %p\n", in, out);
48	J8EncodeOne(&in, &out, 1); // Now with J8 escaping
49	}
50
51	J8_OUT('\'');
52	buf->len = out - orig_out;
53	return;
54	}
55	}
56
57	J8_OUT('"');
58	buf->len = out - orig_out;
59	}
60
61	void EncodeBString(char* s, int n, std::string* result) {
62	uint8_t* in = reinterpret_cast<uint8_t*>(s);
63	uint8_t* in_end = in + n;
64
65	result->append("b'");
66
67	while (in < in_end) {
68	int chunk_pos = result->size(); // current position
69
70	// Same logic as EncodeBString()
71	int chunk_size = in_end - in + 3; // 3 for the quotes
72	// clamp it to account for tiny gaps and huge strings
73	if (chunk_size < J8_MIN_CAPACITY) {
74	chunk_size = J8_MIN_CAPACITY;
75	} else if (chunk_size > 4096) {
76	chunk_size = 4096;
77	}
78	printf("\t[2] in %p chunk %d\n", in, chunk_size);
79	result->append(chunk_size, '\0'); // "pre-allocated" bytes to overwrite
80
81	// Need C-style pointers to call the helper function
82	uint8_t* raw_data = (uint8_t*)result->data();
83
84	uint8_t* out = raw_data + chunk_pos;
85	uint8_t* orig_out = out;
86
87	uint8_t* out_end = raw_data + result->size();
88
89	// printf("\tEncodeChunk JSON\n");
90	J8EncodeChunk(&in, in_end, &out, out_end, true);
91
92	int bytes_this_chunk = out - orig_out;
93	int end_index = chunk_pos + bytes_this_chunk;
94	printf("\t bytes_this_chunk %d\n", bytes_this_chunk);
95	printf("\t end_index %d\n", end_index);
96
97	result->erase(end_index, std::string::npos);
98	}
99	result->append("'");
100	}
101
102	void EncodeString(char* s, int n, std::string* result, int j8_fallback) {
103	uint8_t* in = reinterpret_cast<uint8_t*>(s);
104	uint8_t* in_end = in + n;
105
106	int begin_index = result->size(); // position before writing opening quote
107
108	result->append("\"");
109
110	printf("\t***str len %d\n", n);
111
112	while (in < in_end) {
113	int chunk_pos = result->size(); // current position
114
115	// Compute chunk size assuming that we'll output about 5 bytes "foo" for
116	// the string foo. Cases like \u{1f}\u{1e} blow it up by a factor of 6, in
117	// which case we'll make more trips through the loop.
118	int chunk_size = in_end - in + 3; // 3 for the quotes
119	// clamp it to account for tiny gaps and huge strings
120	if (chunk_size < J8_MIN_CAPACITY) {
121	chunk_size = J8_MIN_CAPACITY;
122	} else if (chunk_size > 4096) {
123	chunk_size = 4096;
124	}
125	printf("\t[1] in %p chunk %d\n", in, chunk_size);
126
127	result->append(chunk_size, '\0'); // "pre-allocated" bytes to overwrite
128
129	// Need C-style pointers to call the helper function
130	uint8_t* raw_data = (uint8_t*)result->data();
131
132	uint8_t* out = raw_data + chunk_pos;
133	uint8_t* orig_out = out;
134
135	uint8_t* out_end = raw_data + result->size();
136
137	// printf("\tEncodeChunk JSON\n");
138	int invalid_utf8 = J8EncodeChunk(&in, in_end, &out, out_end, false);
139	if (invalid_utf8 && j8_fallback) {
140	// printf("RETRY\n");
141	result->erase(begin_index, std::string::npos);
142	EncodeBString(s, n, result); // fall back to b''
143	printf("\t[1] result len %d\n", static_cast<int>(result->size()));
144	return;
145	}
146
147	int bytes_this_chunk = out - orig_out;
148	int end_index = chunk_pos + bytes_this_chunk;
149	printf("\t bytes_this_chunk %d\n", bytes_this_chunk);
150	printf("\t end_index %d\n", end_index);
151
152	result->erase(end_index, std::string::npos);
153	}
154	result->append("\"");
155	printf("\t[1] result len %d\n", static_cast<int>(result->size()));
156	}
157
158	void EncodeAndPrint(char* s, int n, int j8_fallback) {
159	#if 0
160	Buf buf = {0};
161	buf.data = (unsigned char*)malloc(64);
162	buf.capacity = 64;
163
164	EncodeNaive(s, n, &buf, j8_fallback);
165	buf.data[buf.len] = '\0'; // NUL terminate
166
167	printf("out = %s\n", buf.data);
168	free(buf.data);
169	#else
170
171	std::string result;
172	EncodeString(s, n, &result, j8_fallback);
173	printf("out = %s\n", result.c_str());
174
175	#endif
176	}
177
178	TEST encode_test() {
179	#if 1
180	const char* mixed = "hi \x01 \u4000\xfe\u4001\xff\xfd ' \" new \n \\ \u03bc";
181	EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 0);
182	EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 1);
183	#endif
184
185	const char* a = "ab";
186	EncodeAndPrint(const_cast<char*>(a), strlen(a), 0);
187	EncodeAndPrint(const_cast<char*>(a), strlen(a), 1);
188
189	const char* b = "0123456789";
190	EncodeAndPrint(const_cast<char*>(b), strlen(b), 0);
191	EncodeAndPrint(const_cast<char*>(b), strlen(b), 1);
192
193	const char* u = "hi \u4000 \u03bc";
194	EncodeAndPrint(const_cast<char*>(b), strlen(u), 0);
195	EncodeAndPrint(const_cast<char*>(b), strlen(u), 1);
196
197	// Internal NUL
198	const char* bin = "\x00\x01\xff";
199	EncodeAndPrint(const_cast<char*>(bin), 3, 0);
200	EncodeAndPrint(const_cast<char*>(bin), 3, 1);
201
202	// Blow up size
203	const char* blowup =
204	"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe";
205	EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 0);
206	EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 1);
207
208	PASS();
209	}
210
211	GREATEST_MAIN_DEFS();
212
213	int main(int argc, char** argv) {
214	GREATEST_MAIN_BEGIN();
215
216	RUN_TEST(encode_test);
217
218	GREATEST_MAIN_END();
219	return 0;
220	}