data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1356 lines, 679 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the heap object `val`.

        This is Python's id(), i.e. the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for them.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # ValueId() returned -1: no identity to show
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode

    Returns:
      A string of 1 to 4 chars, each holding one byte of the encoding.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    # Number of continuation bytes that follow the lead byte
    num_cont_bytes = 3
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    # else:
    #   What about the check code <= 0x10FFFF ?
    #   - it happens in statically parsed $'' u''
    #   - but not dynamically parsed echo -e / printf, following bash/zsh

    parts = []  # type: List[str]

    # Lead byte: a length marker, plus the bits above all continuation bytes.
    # Mask with 0xFF because Python ints don't wrap around!
    high_bits = code >> (6 * num_cont_bytes)
    lead = (0x1E << (6 - num_cont_bytes)) | (high_bits & (0x3F >> num_cont_bytes))
    parts.append(chr(lead & 0xFF))

    # Continuation bytes, most significant 6-bit group first
    shift = 6 * (num_cont_bytes - 1)
    while shift >= 0:
        cont = 0x80 | ((code >> shift) & 0x3F)
        parts.append(chr(cont & 0xFF))
        shift -= 6

    return ''.join(parts)
140
141
# Bit flags for the `options` argument of _Print() / InstancePrinter.
# Combine them with bitwise OR.

# show as [...] or {...} I think, with object ID
SHOW_CYCLES = 1 << 1
# non-data objects like Eggex can be <Eggex 0xff>
SHOW_NON_DATA = 1 << 2
# JSON is lossy
LOSSY_JSON = 1 << 3
# for JSON
INF_NAN_ARE_NULL = 1 << 4
146
147	# Hack until we fully translate
148	assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write `val` to `buf` with a new InstancePrinter.

    Args:
      val: the value to print
      buf: the writer that receives the output
      indent: number of spaces to indent, or -1 for everything on one line
      options: flags passed through to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print `val` to `buf` with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print `val` to `buf` with the LOSSY_JSON and INF_NAN_ARE_NULL flags.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write `val` on a single line to `f`, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, line_buf, -1, options=line_options)

    f.write(line_buf.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write string `s` to `buf`, on one line.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: the writer that receives the output
      unquoted_ok: if true and fastfunc.CanOmitQuotes(s) is true, write `s`
        as is, instead of printing it as a value.Str
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return `s` printed as a value.Str on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return `s` printed as a value.Str on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
226
227
# DFS traversal state of a List or Dict, as recorded by
# InstancePrinter.Print() for cycle detection

# default for a container that has no entry yet
UNSEEN = 0
# set just before the container's items are printed
EXPLORING = 1
# set after the container has been printed
FINISHED = 2
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to `buf`.  Lists and Dicts are checked for cycles while
    printing; the Bash array types are printed as a {"type":, "data":}
    wrapper.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: the writer that receives all output
          indent: number of spaces to indent, or -1 for everything on one line
          options: bit flags: SHOW_CYCLES, SHOW_NON_DATA, INF_NAN_ARE_NULL are
            checked here, and the whole value is passed to pyj8.WriteString()
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state, EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at `level` (one step deeper)."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items of a List between [ ], separated by commas."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the key-value pairs of a Dict between { }."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the opening of the wrapper: {"type": <type_str> "data":

        type_str is written as is, so the caller passes it already quoted and
        with the trailing comma.
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing } of the wrapper opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray; keys are converted with mops.ToStr()."""

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as index -> string pairs, skipping None entries."""

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't test the index for the comma, because entries may be
            # skipped
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as string -> string pairs."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print `val` to self.buf, recursing into containers.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict in an object cycle when SHOW_CYCLES
            is off, or a value of another type when SHOW_NON_DATA is off
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
590
591
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing here reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        return

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output on stdout.

        Note: the `buf` argument is not used.
        """

        # Or print to stderr?
        console = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console)

        # Then print those with ASDL
644
645
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the whole input
          is_j8: if false, Next() rejects tokens that aren't pure JSON
          lang_str: name of the language, interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Return the next token, and update self.pos.

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless the token is a string.

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Pure JSON rejects J8-only tokens
        if not self.is_j8:
            if (tok_id == Id.Left_BSingleQuote or
                    tok_id == Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # Any left quote that got this far starts a string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines."""
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Return a string token, and update self.pos.

        Args:
          left_id: the left quote token, which selects the string lexer
          str_pos: position just after the left quote

        Returns:
          (Id.J8_String, position after the right quote, decoded string)

        Raises:
          error.Decode
        """
        # "" and j"" are lexed as JSON strings; the rest as J8 strings
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Done: hand back what was accumulated
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these cases.
                # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # the char after the backslash
                part = consts.LookupCharC(self.s[str_pos + 1])

            elif tok_id == Id.Char_UBraced:  # J8 only
                # the hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # the hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                part = chr(int(h, 16))

            elif tok_id == Id.Char_SurrogatePair:
                # the 4 hex digits of each of the two escapes
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                part = Utf8Encode(int(h, 16))

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
840
841
class _Parser(object):
    """State and token helpers shared by the parsers in this file.

    It holds the current token: its ID, its span [start_pos, end_pos) in the
    input, and the decoded value if it's a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline, or comment."""

        while True:
            # The end of the previous token is the start of this one
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

            #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')
            return

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must be `tok_id`.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, no tokens are skipped.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
892
893
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse:  pair = string ':' value

        Returns:
          the decoded key and the value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v  # a repeated key overwrites the earlier value
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, starting at the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The token starts with 't' only when it's true
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() reads self.start_pos
            # and self.end_pos, so the error must be created while they still
            # describe the integer, not the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when input remains after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token should now start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1033
1034
1035	class Nil8Parser(_Parser):
1036	"""
1037	Tokens not in JSON8:
1038	LParen RParen Symbol
1039
1040	Tokens not in JSON, but in JSON8 and NIL8:
1041	Identifier (unquoted keys)
1042	Ignored_Comment
1043	"""
1044
1045	def __init__(self, s, is_j8):
1046	# type: (str, bool) -> None
1047	_Parser.__init__(self, s, is_j8)
1048
1049	if 0:
1050
1051	def _LookAhead(self):
1052	# type: () -> Id_t
1053	"""
1054	Don't need this right now
1055	"""
1056	end_pos = self.end_pos # look ahead from last token
1057	while True:
1058	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1059	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1060	Id.Ignored_Comment):
1061	break
1062	return tok_id
1063
1064	def _ParseRecord(self):
1065	# type: () -> nvalue_t
1066	"""
1067	Yaks
1068	(self->Next) => (-> self Next)
1069	(self->Next obj.field) => ((-> self Next) (. obj field))
1070
1071	Similar to
1072	((identity identity) 42) => 42 in Clojure
1073
1074	ASDL
1075	(Node left:(. x4beef2))
1076	(Node left !x4beef2)
1077
1078	# Ambiguous because value can be identifier.
1079	# We have to look ahead to and see if there's a colon :
1080	field =
1081	Identifier ':' value
1082	\| value
1083
1084	record = '(' head field* ')'
1085
1086	- Identifier \| Symbol are treated the same, it's a side effect of
1087	the lexing style
1088	- do positional args come before named args
1089	- () is invalid? Use [] for empty list
1090	"""
1091	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1092
1093	items = [] # type: List[nvalue_t]
1094
1095	self._Next()
1096	if self.tok_id == Id.J8_RParen:
1097	self._Next()
1098	return nvalue.List(items)
1099
1100	#log('TOK %s', Id_str(self.tok_id))
1101	while self.tok_id != Id.J8_RParen:
1102	items.append(self._ParseNil8())
1103	#log('TOK 2 %s', Id_str(self.tok_id))
1104
1105	self._Eat(Id.J8_RParen)
1106
1107	return nvalue.List(items)
1108
1109	def _ParseList8(self):
1110	# type: () -> nvalue_t
1111	"""
1112	List8 = '[' value* ']'
1113
1114	No commas, not even optional ones for now.
1115	"""
1116	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1117
1118	items = [] # type: List[nvalue_t]
1119
1120	self._Next()
1121	if self.tok_id == Id.J8_RBracket:
1122	self._Next()
1123	return nvalue.List(items)
1124
1125	#log('TOK %s', Id_str(self.tok_id))
1126	while self.tok_id != Id.J8_RBracket:
1127	items.append(self._ParseNil8())
1128	#log('TOK 2 %s', Id_str(self.tok_id))
1129
1130	self._Eat(Id.J8_RBracket)
1131
1132	return nvalue.List(items)
1133
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value beginning at the current token.

    Accepts a record, a list, one of the primitives shared with J8 (null,
    bool, int, float, string), or an unquoted symbol.

    If the token right after that value is an operator, colon or comma, the
    value is taken as the left operand of an infix expression, and the
    result is a 3-item list in prefix order: [op, left, right].

    Raises:
      The error built by self._ParseError(), on EOF or an invalid token.
    """
    # Snapshot for dispatch only; self.tok_id changes on every _Next().
    tok_id = self.tok_id

    if tok_id == Id.J8_LParen:
        left = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        left = self._ParseList8()

    # The primitive cases mirror the J8 parser.
    # TODO: hex literals should be accepted here too.
    elif tok_id == Id.J8_Null:
        self._Next()
        left = nvalue.Null

    elif tok_id == Id.J8_Bool:
        # Only the first character is inspected: 't' means true.
        left = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif tok_id == Id.J8_Int:
        int_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Int(int(int_text))

    elif tok_id == Id.J8_Float:
        float_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Float(float(float_text))

    elif tok_id == Id.J8_String:
        # self.decoded belongs to the current token, so read it before
        # advancing.
        left = nvalue.Str(self.decoded)
        self._Next()

    elif tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                    Id.J8_Comma):
        # An unquoted "word" (including things like <-) becomes a symbol
        # holding its source text.
        left = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
        self._Next()

    elif tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # everything not handled above, e.g. a closing delimiter
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands on its own.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return left

    # Infix form, rewritten to prefix order:
    #   key: "value"  ->  (: key "value")
    op_sym = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
    self._Next()
    right = self._ParseNil8()

    prefix_form = nvalue.List([op_sym, left, right])  # type: nvalue_t
    return prefix_form
1206
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as exactly one NIL8 value.

    Raises error.Decode.
    """
    self._Next()  # advance the lexer before parsing begins
    result = self._ParseNil8()

    # The value must be followed by the end of input and nothing else.
    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1217
1218
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """
        Args:
          s: The complete text to split into lines.
        """
        # NOTE(review): the True flag is presumably the "is J8" switch of
        # _Parser -- confirm against _Parser.__init__.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log label 's' with the current token and its span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse a single line; may append one string to 'out'.

        - An empty line (optional whitespace, then the end token) appends
          nothing.
        - A line starting with a J8 string appends the decoded string.
        - A line starting with Lit_Chars appends the raw source text of the
          line, minus leading and trailing WS_Space.

        In each of these cases the end token is consumed before returning.

        Raises:
          The error built by self._ParseError(), when anything other than
          optional whitespace follows a J8 string on the same line.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            # Only one string is allowed per line
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the end token
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end token, for stripping
                # trailing whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # start of the end token

            # The line is a slice of the input; nothing is decoded
            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # No other token kind is expected at the start of a line
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the list of decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        # The loop can only exit when the current token is Id.Eol_Tok, so no
        # separate "trailing input" check is needed afterward.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1338
1339
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode, for any of these 3 errors:
      - J8 string syntax error inside quotes
      - Extra input on line
      - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1354
1355
1356	# vim: sw=4