data_lang/j8.h

OILS / data_lang / j8.h View on Github | oilshell.org

314 lines, 151 significant

1	#ifndef DATA_LANG_J8_H
2	#define DATA_LANG_J8_H
3
4	#include <stdio.h> // sprintf
5	#include <string.h> // memcmp, memcpy, strlen
6
7	#include "data_lang/utf8.h"
8
// Append one byte to the output and advance the output cursor.
//
// Expects a variable named `p_out` (pointer to the output cursor, i.e.
// unsigned char**) to be in scope at the point of use.
//
// The body is wrapped in do { } while (0) so that `J8_OUT(x);` is a single
// statement: it stays correct inside an unbraced if/else, where a bare
// two-statement expansion would either fail to compile or silently run the
// cursor increment unconditionally.
#define J8_OUT(ch)  \
  do {              \
    **p_out = (ch); \
    (*p_out)++;     \
  } while (0)
12
13	static inline int J8EncodeOne(unsigned char p_in, unsigned char p_out,
14	int j8_escape) {
15	// We use a slightly weird double pointer style because
16	// *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
17	// *p_out may be advanced by 1 to 6 bytes (depending on escaping)
18
19	// IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() all
20	// call utf8_decode() which require that p_in MUST have a NUL terminator. This
21	// is so INCOMPLETE UTF-8 sequences are terminated with an INVALID byte, and
22	// 0x00 can only be ITSELF, never part of a sequence. An alternative would be
23	// to do more bounds checks in these functions.
24
25	// CALLER MUST CHECK that we are able to write up to 6 bytes!
26	// Because the longest output is \u001f or \u{1f} for control chars, since
27	// we don't emit escapes like \u{1f926} right now
28	//
29	// j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
30	// \yff instead of Unicode replacement char
31	// \u{1} instead of \u0001 for unprintable low chars
32
33	// Returns:
34	// 0 wrote valid UTF-8 (encoded or not)
35	// 1 wrote byte that's invalid UTF-8
36
37	unsigned char ch = **p_in;
38
39	//
40	// Handle \\ \b \f \n \r \t
41	//
42
43	// clang-format off
44	switch (ch) {
45	case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
46	case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
47	case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
48	case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
49	case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
50	case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
51	}
52	// clang-format on
53
54	//
55	// Conditionally handle \' and \"
56	//
57	if (ch == '\'' && j8_escape) { // J8-style strings \'
58	J8_OUT('\\');
59	J8_OUT('\'');
60	(*p_in)++;
61	return 0;
62	}
63	if (ch == '"' && !j8_escape) { // JSON-style strings \"
64	J8_OUT('\\');
65	J8_OUT('"');
66	(*p_in)++;
67	return 0;
68	}
69
70	//
71	// Unprintable ASCII control codes
72	//
73	if (ch < 0x20) {
74	if (j8_escape) {
75	// printf("Writing for %04x %p\n", ch, *p_out);
76	int n = sprintf((char)p_out, "\\u{%x}", ch);
77	// printf("! Wrote %d bytes for %04x\n", n, ch);
78	*p_out += n;
79	} else {
80	// printf("Writing for %04x %p\n", ch, *p_out);
81	int n = sprintf((char)p_out, "\\u%04x", ch);
82	*p_out += n;
83	// printf("Wrote %d bytes for %04x\n", n, ch);
84	}
85	(*p_in)++;
86	return 0;
87	}
88
89	//
90	// UTF-8 encoded runes and invalid bytes
91	//
92	Utf8Result_t result;
93	utf8_decode(*p_in, &result);
94
95	if (result.error == UTF8_OK) {
96	memcpy(p_out, p_in, result.bytes_read);
97	*p_in += result.bytes_read;
98	*p_out += result.bytes_read;
99	return 0;
100	}
101
102	// We have a UTF-8 decoding error. This is handled one of three ways:
103	// 1. Losslessly encode as J8 byte literals (only applicable in J8)
104	// 2. Try to encode a lone surrogate
105	// 3. Insert a Unicode replacement char
106
107	if (j8_escape) {
108	int n = sprintf((char)p_out, "\\y%02x", ch);
109	*p_in += 1;
110	*p_out += n;
111	} else if (result.error == UTF8_ERR_SURROGATE) {
112	int n = sprintf((char)p_out, "\\u%04x", result.codepoint);
113	*p_in += result.bytes_read;
114	*p_out += n;
115	return 1;
116	} else {
117	// Unicode replacement char is U+FFFD, so write encoded form
118	// >>> '\ufffd'.encode('utf-8')
119	// b'\xef\xbf\xbd'
120	J8_OUT('\xef');
121	J8_OUT('\xbf');
122	J8_OUT('\xbd');
123	*p_in += 1; // Advance past the byte we wrote
124	}
125
126	return 1;
127	}
128
129	// Like the above, but
130	//
131	// \xff instead of \yff
132	// \u001f always, never \u{1f}
133	// No JSON vs. J8
134	// No \" escape ever
135	// No errors -- it can encode everything
136
137	static inline void BashDollarEncodeOne(unsigned char** p_in,
138	unsigned char** p_out) {
139	unsigned char ch = **p_in;
140
141	//
142	// Handle \\ \b \f \n \r \t \'
143	//
144
145	// clang-format off
146	switch (ch) {
147	case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return;
148	case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return;
149	case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return;
150	case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return;
151	case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return;
152	case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return;
153	case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return;
154	}
155	// clang-format on
156
157	//
158	// Unprintable ASCII control codes
159	//
160	if (ch < 0x20) {
161	// printf("Writing for %04x %p\n", ch, *p_out);
162	int n = sprintf((char)p_out, "\\u%04x", ch);
163	*p_out += n;
164	// printf("Wrote %d bytes for %04x\n", n, ch);
165	(*p_in)++;
166	return;
167	}
168
169	//
170	// UTF-8 encoded runes and invalid bytes
171	//
172	Utf8Result_t result;
173	utf8_decode(*p_in, &result);
174	if (result.error == UTF8_OK) {
175	memcpy(p_out, p_in, result.bytes_read);
176	*p_in += result.bytes_read;
177	*p_out += result.bytes_read;
178	} else {
179	// If not a valid UTF-8 byte sequence, losslessly encode the bad bytes
180	int n = sprintf((char)p_out, "\\x%02x", **p_in);
181	*p_out += n;
182	*p_in += 1; // Advance past the byte we wrote
183	}
184	}
185
186	// BourneShellEncodeOne rules:
187	//
188	// must be valid UTF-8
189	// no control chars
190	// no ' is required
191	// no \ -- not required, but avoids ambiguous '\n'
192	//
193	// For example we write $'\\' or b'\\' not '\'
194	// The latter should be written r'\', but we don't output that form
195
196	static inline int BourneShellEncodeOne(unsigned char** p_in,
197	unsigned char** p_out) {
198	unsigned char ch = **p_in;
199
200	if (ch == '\'' \|\| ch == '\\') { // can't encode these in Bourne shell ''
201	return 1;
202	}
203	if (ch < 0x20) { // Unprintable ASCII control codes
204	return 1;
205	}
206
207	// UTF-8 encoded runes and invalid bytes
208	Utf8Result_t result;
209	utf8_decode(*p_in, &result);
210	if (result.error == UTF8_OK) {
211	memcpy(p_out, p_in, result.bytes_read);
212	*p_in += result.bytes_read;
213	*p_out += result.bytes_read;
214	return 0;
215	} else {
216	return 1;
217	}
218	}
219
220	// Right now \u001f and \u{1f} are the longest output sequences for a byte.
221	// Bug fix: we need 6 + 1 for the NUL terminator that sprintf() writes! (Even
222	// though we don't technically need it)
223
224	// Bug: we may need up to 16 bytes: \yaa\yaa\yaa\yaa
225	// If this is too small, we would enter an infinite loop
226	// +1 for NUL terminator
227
// Worst-case number of output bytes one encoding step may write: 6 for
// \u001f or \u{1f}, +1 for the NUL terminator that sprintf() appends.
#define J8_MAX_BYTES_PER_INPUT_BYTE 7

// The minimum capacity must be more than the number above.
// TODO: Tune this for our allocator?  We call buf->EnsureMoreSpace(capacity);
#define J8_MIN_CAPACITY 16

// Enforce the relationship at compile time: the *EncodeChunk() loops only make
// progress while at least J8_MAX_BYTES_PER_INPUT_BYTE bytes of output remain,
// so a smaller capacity would never let them advance.
#if J8_MIN_CAPACITY <= J8_MAX_BYTES_PER_INPUT_BYTE
  #error "J8_MIN_CAPACITY must be greater than J8_MAX_BYTES_PER_INPUT_BYTE"
#endif
233
234	static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
235	unsigned char** p_out, unsigned char* out_end,
236	int j8_escape) {
237	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
238	// printf("iter %d %p < %p \n", i++, *p_out, out_end);
239	int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
240	if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data?
241	return invalid_utf8; // early return
242	}
243	}
244	return 0;
245	}
246
247	static inline int BashDollarEncodeChunk(unsigned char** p_in,
248	unsigned char* in_end,
249	unsigned char** p_out,
250	unsigned char* out_end) {
251	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
252	BashDollarEncodeOne(p_in, p_out);
253	}
254	return 0;
255	}
256
257	static inline int BourneShellEncodeChunk(unsigned char** p_in,
258	unsigned char* in_end,
259	unsigned char** p_out,
260	unsigned char* out_end) {
261	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
262	int cannot_encode = BourneShellEncodeOne(p_in, p_out);
263	if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
264	return cannot_encode; // early return
265	}
266	}
267	return 0;
268	}
269
// Decide whether the `len` bytes at `s` may be written without quotes.
//
// Returns 1 only if the string is non-empty, is not one of the keywords
// null / true / false, and consists solely of bytes matching
// [a-zA-Z0-9./_-].  Returns 0 otherwise (quotes are required).
static inline int CanOmitQuotes(unsigned char* s, int len) {
  if (len == 0) {  // empty string has to be quoted
    return 0;
  }

  // 3 special case keywords
  if (len == 4 && (memcmp(s, "null", 4) == 0 || memcmp(s, "true", 4) == 0)) {
    return 0;
  }
  if (len == 5 && memcmp(s, "false", 5) == 0) {
    return 0;
  }

  for (int i = 0; i < len; ++i) {
    unsigned char ch = s[i];

    // Corresponds to regex [a-zA-Z0-9./_-]
    if ('a' <= ch && ch <= 'z') {
      continue;
    }
    if ('A' <= ch && ch <= 'Z') {
      continue;
    }
    if ('0' <= ch && ch <= '9') {
      continue;
    }
    if (ch == '.' || ch == '/' || ch == '_' || ch == '-') {
      continue;
    }
    // some byte requires quotes
    // Not including UTF-8 here because it can have chars that look like space
    // or quotes
    return 0;
  }
  return 1;  // everything OK
}
313
314	#endif // DATA_LANG_J8_H