OILS / data_lang / j8.h View on Github | oilshell.org

314 lines, 151 significant
1#ifndef DATA_LANG_J8_H
2#define DATA_LANG_J8_H
3
4#include <stdio.h> // sprintf
5#include <string.h> // memcmp, memcpy, strlen
6
7#include "data_lang/utf8.h"
8
// Append one byte to the output and advance the output cursor.
//
// Expects a variable `unsigned char** p_out` to be in scope at the point of
// use.  The body is wrapped in do { } while (0) so that the macro expands to
// a single statement: `if (cond) J8_OUT(x);` and `if ... else ...` then behave
// as written, instead of only guarding the store and always advancing.
#define J8_OUT(ch)  \
  do {              \
    **p_out = (ch); \
    (*p_out)++;     \
  } while (0)
12
13static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
14 int j8_escape) {
15 // We use a slightly weird double pointer style because
16 // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
17 // *p_out may be advanced by 1 to 6 bytes (depending on escaping)
18
19 // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() all
20 // call utf8_decode() which require that p_in MUST have a NUL terminator. This
21 // is so INCOMPLETE UTF-8 sequences are terminated with an INVALID byte, and
22 // 0x00 can only be ITSELF, never part of a sequence. An alternative would be
23 // to do more bounds checks in these functions.
24
25 // CALLER MUST CHECK that we are able to write up to 6 bytes!
26 // Because the longest output is \u001f or \u{1f} for control chars, since
27 // we don't emit escapes like \u{1f926} right now
28 //
29 // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
30 // \yff instead of Unicode replacement char
31 // \u{1} instead of \u0001 for unprintable low chars
32
33 // Returns:
34 // 0 wrote valid UTF-8 (encoded or not)
35 // 1 wrote byte that's invalid UTF-8
36
37 unsigned char ch = **p_in;
38
39 //
40 // Handle \\ \b \f \n \r \t
41 //
42
43 // clang-format off
44 switch (ch) {
45 case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
46 case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
47 case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
48 case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
49 case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
50 case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
51 }
52 // clang-format on
53
54 //
55 // Conditionally handle \' and \"
56 //
57 if (ch == '\'' && j8_escape) { // J8-style strings \'
58 J8_OUT('\\');
59 J8_OUT('\'');
60 (*p_in)++;
61 return 0;
62 }
63 if (ch == '"' && !j8_escape) { // JSON-style strings \"
64 J8_OUT('\\');
65 J8_OUT('"');
66 (*p_in)++;
67 return 0;
68 }
69
70 //
71 // Unprintable ASCII control codes
72 //
73 if (ch < 0x20) {
74 if (j8_escape) {
75 // printf("Writing for %04x %p\n", ch, *p_out);
76 int n = sprintf((char*)*p_out, "\\u{%x}", ch);
77 // printf("! Wrote %d bytes for %04x\n", n, ch);
78 *p_out += n;
79 } else {
80 // printf("Writing for %04x %p\n", ch, *p_out);
81 int n = sprintf((char*)*p_out, "\\u%04x", ch);
82 *p_out += n;
83 // printf("Wrote %d bytes for %04x\n", n, ch);
84 }
85 (*p_in)++;
86 return 0;
87 }
88
89 //
90 // UTF-8 encoded runes and invalid bytes
91 //
92 Utf8Result_t result;
93 utf8_decode(*p_in, &result);
94
95 if (result.error == UTF8_OK) {
96 memcpy(*p_out, *p_in, result.bytes_read);
97 *p_in += result.bytes_read;
98 *p_out += result.bytes_read;
99 return 0;
100 }
101
102 // We have a UTF-8 decoding error. This is handled one of three ways:
103 // 1. Losslessly encode as J8 byte literals (only applicable in J8)
104 // 2. Try to encode a lone surrogate
105 // 3. Insert a Unicode replacement char
106
107 if (j8_escape) {
108 int n = sprintf((char*)*p_out, "\\y%02x", ch);
109 *p_in += 1;
110 *p_out += n;
111 } else if (result.error == UTF8_ERR_SURROGATE) {
112 int n = sprintf((char*)*p_out, "\\u%04x", result.codepoint);
113 *p_in += result.bytes_read;
114 *p_out += n;
115 return 1;
116 } else {
117 // Unicode replacement char is U+FFFD, so write encoded form
118 // >>> '\ufffd'.encode('utf-8')
119 // b'\xef\xbf\xbd'
120 J8_OUT('\xef');
121 J8_OUT('\xbf');
122 J8_OUT('\xbd');
123 *p_in += 1; // Advance past the byte we wrote
124 }
125
126 return 1;
127}
128
129// Like the above, but
130//
131// \xff instead of \yff
132// \u001f always, never \u{1f}
133// No JSON vs. J8
134// No \" escape ever
135// No errors -- it can encode everything
136
137static inline void BashDollarEncodeOne(unsigned char** p_in,
138 unsigned char** p_out) {
139 unsigned char ch = **p_in;
140
141 //
142 // Handle \\ \b \f \n \r \t \'
143 //
144
145 // clang-format off
146 switch (ch) {
147 case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return;
148 case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return;
149 case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return;
150 case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return;
151 case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return;
152 case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return;
153 case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return;
154 }
155 // clang-format on
156
157 //
158 // Unprintable ASCII control codes
159 //
160 if (ch < 0x20) {
161 // printf("Writing for %04x %p\n", ch, *p_out);
162 int n = sprintf((char*)*p_out, "\\u%04x", ch);
163 *p_out += n;
164 // printf("Wrote %d bytes for %04x\n", n, ch);
165 (*p_in)++;
166 return;
167 }
168
169 //
170 // UTF-8 encoded runes and invalid bytes
171 //
172 Utf8Result_t result;
173 utf8_decode(*p_in, &result);
174 if (result.error == UTF8_OK) {
175 memcpy(*p_out, *p_in, result.bytes_read);
176 *p_in += result.bytes_read;
177 *p_out += result.bytes_read;
178 } else {
179 // If not a valid UTF-8 byte sequence, losslessly encode the bad bytes
180 int n = sprintf((char*)*p_out, "\\x%02x", **p_in);
181 *p_out += n;
182 *p_in += 1; // Advance past the byte we wrote
183 }
184}
185
186// BourneShellEncodeOne rules:
187//
188// must be valid UTF-8
189// no control chars
190// no ' is required
191// no \ -- not required, but avoids ambiguous '\n'
192//
193// For example we write $'\\' or b'\\' not '\'
194// The latter should be written r'\', but we're not outputing
195
196static inline int BourneShellEncodeOne(unsigned char** p_in,
197 unsigned char** p_out) {
198 unsigned char ch = **p_in;
199
200 if (ch == '\'' || ch == '\\') { // can't encode these in Bourne shell ''
201 return 1;
202 }
203 if (ch < 0x20) { // Unprintable ASCII control codes
204 return 1;
205 }
206
207 // UTF-8 encoded runes and invalid bytes
208 Utf8Result_t result;
209 utf8_decode(*p_in, &result);
210 if (result.error == UTF8_OK) {
211 memcpy(*p_out, *p_in, result.bytes_read);
212 *p_in += result.bytes_read;
213 *p_out += result.bytes_read;
214 return 0;
215 } else {
216 return 1;
217 }
218}
219
// Output space that must be available before each *EncodeOne() call.
//
// \u001f and \u{1f} are the longest output sequences for a byte: 6 bytes.
// sprintf() also writes a NUL terminator after them, so 6 + 1 bytes may be
// touched, even though the NUL is not part of the output.
//
// An invalid 4-byte sequence can expand to 16 bytes (\yaa\yaa\yaa\yaa), but
// the encoders escape invalid input one byte per call, so the per-call bound
// still holds.  If this value were too small, we would enter an infinite
// loop.
#define J8_MAX_BYTES_PER_INPUT_BYTE 7

// The minimum capacity must be more than the number above.
// TODO: Tune this for our allocator?  We call buf->EnsureMoreSpace(capacity);
#define J8_MIN_CAPACITY 16
233
234static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
235 unsigned char** p_out, unsigned char* out_end,
236 int j8_escape) {
237 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
238 // printf("iter %d %p < %p \n", i++, *p_out, out_end);
239 int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
240 if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data?
241 return invalid_utf8; // early return
242 }
243 }
244 return 0;
245}
246
247static inline int BashDollarEncodeChunk(unsigned char** p_in,
248 unsigned char* in_end,
249 unsigned char** p_out,
250 unsigned char* out_end) {
251 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
252 BashDollarEncodeOne(p_in, p_out);
253 }
254 return 0;
255}
256
257static inline int BourneShellEncodeChunk(unsigned char** p_in,
258 unsigned char* in_end,
259 unsigned char** p_out,
260 unsigned char* out_end) {
261 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
262 int cannot_encode = BourneShellEncodeOne(p_in, p_out);
263 if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
264 return cannot_encode; // early return
265 }
266 }
267 return 0;
268}
269
// Decide whether a string can be written without quotes.
//
// s:   bytes to inspect; does not need to be NUL-terminated and is not
//      modified
// len: number of bytes at s
//
// Returns 1 if every byte matches [a-zA-Z0-9./_-] and the string is not one
// of the keywords null, true, false.  Returns 0 otherwise, including for the
// empty string and for a non-positive len.
static inline int CanOmitQuotes(const unsigned char* s, int len) {
  // The empty string has to be quoted.  A negative length is treated the same
  // way rather than falling through every check and reporting "OK".
  if (len <= 0) {
    return 0;
  }

  // 3 special case keywords would be read back as non-string values
  if (len == 4 &&
      (memcmp(s, "null", 4) == 0 || memcmp(s, "true", 4) == 0)) {
    return 0;
  }
  if (len == 5 && memcmp(s, "false", 5) == 0) {
    return 0;
  }

  for (int i = 0; i < len; ++i) {
    unsigned char ch = s[i];

    // Corresponds to regex [a-zA-Z0-9./_-]
    int is_lower = ('a' <= ch && ch <= 'z');
    int is_upper = ('A' <= ch && ch <= 'Z');
    int is_digit = ('0' <= ch && ch <= '9');
    int is_punct = (ch == '.' || ch == '/' || ch == '_' || ch == '-');

    if (!(is_lower || is_upper || is_digit || is_punct)) {
      // Some byte requires quotes.  UTF-8 is not allowed here because it can
      // have chars that look like space or quotes.
      return 0;
    }
  }
  return 1;  // everything OK
}
313
314#endif // DATA_LANG_J8_H