OILS / data_lang / j8_test.cc View on Github | oilshell.org

220 lines, 131 significant
1#include "data_lang/j8.h"
2
3#include <string>
4
5#include "data_lang/j8_test_lib.h"
6#include "vendor/greatest.h"
7
// Naive output buffer handed to EncodeNaive(): a raw byte array plus
// bookkeeping. The struct does not manage the memory behind `data`.
struct Buf {
  // Start of the output bytes.
  unsigned char* data;
  // Size of the allocation behind `data`, in bytes. EncodeNaive() does not
  // consult it yet (see the TODOs there).
  int capacity;
  // Number of bytes written to `data`; set by EncodeNaive().
  int len;
};
14
15void EncodeNaive(char* s, int n, Buf* buf, int j8_fallback) {
16 char* orig_s = s; // save for rewinding
17
18 unsigned char* in = (unsigned char*)s;
19 unsigned char* input_end = (unsigned char*)s + n;
20
21 unsigned char* out = buf->data; // mutated
22 unsigned char* orig_out = buf->data; // not mutated
23
24 unsigned char** p_out = &out; // for J8_OUT()
25
26 J8_OUT('"');
27 // printf("*in %p *out %p\n", *in, *out);
28
29 int invalid_utf8 = 0;
30 while (in < input_end) {
31 // printf("1 in %p *out %p\n", in, *out);
32
33 // TODO: check *out vs. capacity and maybe grow buffer
34 invalid_utf8 = J8EncodeOne(&in, &out, 0); // JSON escaping
35
36 // Try again with J8 escaping
37 if (invalid_utf8 && j8_fallback) {
38 in = (unsigned char*)orig_s;
39 out = orig_out;
40
41 J8_OUT('b');
42 J8_OUT('\'');
43
44 // TODO: check *out vs. capacity and maybe grow buffer
45
46 while (in < input_end) {
47 // printf("2 in %p *out %p\n", in, *out);
48 J8EncodeOne(&in, &out, 1); // Now with J8 escaping
49 }
50
51 J8_OUT('\'');
52 buf->len = out - orig_out;
53 return;
54 }
55 }
56
57 J8_OUT('"');
58 buf->len = out - orig_out;
59}
60
61void EncodeBString(char* s, int n, std::string* result) {
62 uint8_t* in = reinterpret_cast<uint8_t*>(s);
63 uint8_t* in_end = in + n;
64
65 result->append("b'");
66
67 while (in < in_end) {
68 int chunk_pos = result->size(); // current position
69
70 // Same logic as EncodeBString()
71 int chunk_size = in_end - in + 3; // 3 for the quotes
72 // clamp it to account for tiny gaps and huge strings
73 if (chunk_size < J8_MIN_CAPACITY) {
74 chunk_size = J8_MIN_CAPACITY;
75 } else if (chunk_size > 4096) {
76 chunk_size = 4096;
77 }
78 printf("\t[2] in %p chunk %d\n", in, chunk_size);
79 result->append(chunk_size, '\0'); // "pre-allocated" bytes to overwrite
80
81 // Need C-style pointers to call the helper function
82 uint8_t* raw_data = (uint8_t*)result->data();
83
84 uint8_t* out = raw_data + chunk_pos;
85 uint8_t* orig_out = out;
86
87 uint8_t* out_end = raw_data + result->size();
88
89 // printf("\tEncodeChunk JSON\n");
90 J8EncodeChunk(&in, in_end, &out, out_end, true);
91
92 int bytes_this_chunk = out - orig_out;
93 int end_index = chunk_pos + bytes_this_chunk;
94 printf("\t bytes_this_chunk %d\n", bytes_this_chunk);
95 printf("\t end_index %d\n", end_index);
96
97 result->erase(end_index, std::string::npos);
98 }
99 result->append("'");
100}
101
102void EncodeString(char* s, int n, std::string* result, int j8_fallback) {
103 uint8_t* in = reinterpret_cast<uint8_t*>(s);
104 uint8_t* in_end = in + n;
105
106 int begin_index = result->size(); // position before writing opening quote
107
108 result->append("\"");
109
110 printf("\t***str len %d\n", n);
111
112 while (in < in_end) {
113 int chunk_pos = result->size(); // current position
114
115 // Compute chunk size assuming that we'll output about 5 bytes "foo" for
116 // the string foo. Cases like \u{1f}\u{1e} blow it up by a factor of 6, in
117 // which case we'll make more trips through the loop.
118 int chunk_size = in_end - in + 3; // 3 for the quotes
119 // clamp it to account for tiny gaps and huge strings
120 if (chunk_size < J8_MIN_CAPACITY) {
121 chunk_size = J8_MIN_CAPACITY;
122 } else if (chunk_size > 4096) {
123 chunk_size = 4096;
124 }
125 printf("\t[1] in %p chunk %d\n", in, chunk_size);
126
127 result->append(chunk_size, '\0'); // "pre-allocated" bytes to overwrite
128
129 // Need C-style pointers to call the helper function
130 uint8_t* raw_data = (uint8_t*)result->data();
131
132 uint8_t* out = raw_data + chunk_pos;
133 uint8_t* orig_out = out;
134
135 uint8_t* out_end = raw_data + result->size();
136
137 // printf("\tEncodeChunk JSON\n");
138 int invalid_utf8 = J8EncodeChunk(&in, in_end, &out, out_end, false);
139 if (invalid_utf8 && j8_fallback) {
140 // printf("RETRY\n");
141 result->erase(begin_index, std::string::npos);
142 EncodeBString(s, n, result); // fall back to b''
143 printf("\t[1] result len %d\n", static_cast<int>(result->size()));
144 return;
145 }
146
147 int bytes_this_chunk = out - orig_out;
148 int end_index = chunk_pos + bytes_this_chunk;
149 printf("\t bytes_this_chunk %d\n", bytes_this_chunk);
150 printf("\t end_index %d\n", end_index);
151
152 result->erase(end_index, std::string::npos);
153 }
154 result->append("\"");
155 printf("\t[1] result len %d\n", static_cast<int>(result->size()));
156}
157
158void EncodeAndPrint(char* s, int n, int j8_fallback) {
159#if 0
160 Buf buf = {0};
161 buf.data = (unsigned char*)malloc(64);
162 buf.capacity = 64;
163
164 EncodeNaive(s, n, &buf, j8_fallback);
165 buf.data[buf.len] = '\0'; // NUL terminate
166
167 printf("out = %s\n", buf.data);
168 free(buf.data);
169#else
170
171 std::string result;
172 EncodeString(s, n, &result, j8_fallback);
173 printf("out = %s\n", result.c_str());
174
175#endif
176}
177
178TEST encode_test() {
179#if 1
180 const char* mixed = "hi \x01 \u4000\xfe\u4001\xff\xfd ' \" new \n \\ \u03bc";
181 EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 0);
182 EncodeAndPrint(const_cast<char*>(mixed), strlen(mixed), 1);
183#endif
184
185 const char* a = "ab";
186 EncodeAndPrint(const_cast<char*>(a), strlen(a), 0);
187 EncodeAndPrint(const_cast<char*>(a), strlen(a), 1);
188
189 const char* b = "0123456789";
190 EncodeAndPrint(const_cast<char*>(b), strlen(b), 0);
191 EncodeAndPrint(const_cast<char*>(b), strlen(b), 1);
192
193 const char* u = "hi \u4000 \u03bc";
194 EncodeAndPrint(const_cast<char*>(b), strlen(u), 0);
195 EncodeAndPrint(const_cast<char*>(b), strlen(u), 1);
196
197 // Internal NUL
198 const char* bin = "\x00\x01\xff";
199 EncodeAndPrint(const_cast<char*>(bin), 3, 0);
200 EncodeAndPrint(const_cast<char*>(bin), 3, 1);
201
202 // Blow up size
203 const char* blowup =
204 "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0e\x0f\x10\xfe";
205 EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 0);
206 EncodeAndPrint(const_cast<char*>(blowup), strlen(blowup), 1);
207
208 PASS();
209}
210
211GREATEST_MAIN_DEFS();
212
213int main(int argc, char** argv) {
214 GREATEST_MAIN_BEGIN();
215
216 RUN_TEST(encode_test);
217
218 GREATEST_MAIN_END();
219 return 0;
220}