data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1330 lines, 656 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the heap object behind `val`.

        This Python definition uses id(), i.e. the object's address, which
        may need up to 64 bits.  In C++ we can use the GC ID instead, which
        fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles.

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is treated conservatively: once there is a small string
            # optimization, some strings will be plain values, so we assume
            # that all of them are.
            result = -1
        else:
            result = HeapValueId(val)
    return result
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID for display; used by pp value (42) and = 42.

    Returns ' 0x<hex>' (with a leading space) for values that have an ID, and
    the empty string for values whose ValueId() is -1.
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one Unicode code point as a UTF-8 byte string.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode

    Returns:
      A string of 1 to 4 bytes.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits up, i.e. in reverse
    # order.  Each one carries 6 bits of payload under the 0b10 prefix.
    encoded = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # The leading byte has a length marker in its high bits, followed by
    # whatever payload bits are left over.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in encoded]
    return ''.join(chars)
140
141
# Printer option flags.  They are bits that can be combined with |

# show as [...] or {...} I think, with object ID
SHOW_CYCLES = 1 << 1
# non-data objects like Eggex can be <Eggex 0xff>
SHOW_NON_DATA = 1 << 2
# JSON is lossy
LOSSY_JSON = 1 << 3
# for JSON
INF_NAN_ARE_NULL = 1 << 4

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize `val` into `buf` using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like LOSSY_JSON, SHOW_CYCLES, etc.
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value as JSON8, with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value as plain JSON.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to `f` on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    out = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, out, -1, options=line_options)

    f.write(out.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to `buf`, quoted unless that can be skipped.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output is written
      unquoted_ok: if True, write `s` as-is when fastfunc.CanOmitQuotes()
                   says that's allowed
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the one-line J8 encoding of a string.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the one-line JSON encoding of a string (LOSSY_JSON is set).

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
226
227
# States of a container node during the printer's depth-first traversal.
# They are stored per heap ID, to detect cycles.
UNSEEN = 0  # not visited yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # printed completely; it may be printed again
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to a BufWriter.  With indent == -1, everything is printed on
    one line; otherwise containers are spread over multiple lines, and nested
    items are indented by `indent` spaces per level.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: number of spaces to indent, or -1 for everything on one line
          options: bit flags like SHOW_CYCLES, LOSSY_JSON, etc.
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state of that List or Dict: EXPLORING or FINISHED
        # (a missing key means UNSEEN)
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container that's at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container that's at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless we're printing everything on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless we're printing everything on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [ ... ], recursing into each item."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as { "key": value ... }, recursing into each value."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object for a bash array, up to the "data" key.

        Writes {"type": <type_str> "data":   The caller prints the data
        object, and then calls _PrintBashSuffix() with the same level.

        Args:
          type_str: already quoted, with a trailing comma
          level: nesting level of the wrapper object
        """
        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix().

        Args:
          level: the same level that was passed to _PrintBashPrefix().  It
                 doesn't depend on whether the data object was empty, so the
                 closing brace always lines up with the wrapper.
        """
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as {"type": "BashArray", "data": {...}}.

        The data object maps each index (as a string) to its string.  Entries
        that are None are omitted.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # The data object is nested one level inside the wrapper
            data_level = level + 1
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(data_level)

                pyj8.WriteString(str(i), self.options, self.buf)
                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(data_level)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as {"type": "BashAssoc", "data": {...}}.

        The data object maps each string key to its string value.
        """
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # The data object is nested one level inside the wrapper
            data_level = level + 1
            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(data_level)

                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(data_level)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print any value at the given nesting level.

        Args:
          val: the value to print
          level: nesting depth, which determines indentation

        Raises:
          error.Encode: if a List or Dict is part of a cycle and SHOW_CYCLES
            isn't set, or if the value isn't data and SHOW_NON_DATA isn't set
        """
        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # TODO: New format, which should be consistent with pretty
            # printing.  pp line (x) supports BashArray and BashAssoc, e.g.
            # for spec tests.

            # - BashAssoc is Dict[str, str]
            #   (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #   (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
564
565
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set, or a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # For the first pass: object ID -> number of times it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet: this is a no-op placeholder."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output on stdout.

        Note that `buf` is currently unused.
        """
        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
618
619
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string.  That is, Next() and NextForLines() return a tuple

        (token ID, end position, decoded string or None)

    where the last item is set only for string tokens.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: whether J8 extensions are accepted, as opposed to pure JSON
          lang_str: name of the language, for error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax in JSON mode, or for a malformed
            string
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Reject the J8 extensions when lexing pure JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote means we decode the whole string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or ASCII control chars on an
            unquoted line, or for a malformed string
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which determines the string lexer
          str_pos: position right after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: for EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an illegal code
            point
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            #
            # Tokens that are errors
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # The closing quote: we're done
            #

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token, each of which yields a 'part'
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these cases.
                # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                escaped = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip the leading \u{ and the trailing }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip the leading \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes in a row, each with 4 hex digits
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
814
815
816	class _Parser(object):
817
818	def __init__(self, s, is_j8):
819	# type: (str, bool) -> None
820	self.s = s
821	self.is_j8 = is_j8
822	self.lang_str = "J8" if is_j8 else "JSON"
823
824	self.lexer = LexerDecoder(s, is_j8, self.lang_str)
825	self.tok_id = Id.Undefined_Tok
826	self.start_pos = 0
827	self.end_pos = 0
828	self.decoded = '' # decoded J8 string
829
830	def _Next(self):
831	# type: () -> None
832
833	# This isn't the start of a J8_Bool token, it's the END of the token before it
834	while True:
835	self.start_pos = self.end_pos
836	self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
837	if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
838	Id.Ignored_Comment):
839	break
840	# TODO: add Ignored_Newline to count lines, and show line numbers
841	# in errors messages. The position of the last newline and a token
842	# can be used to calculate a column number.
843
844	#log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')
845
846	def _Eat(self, tok_id):
847	# type: (Id_t) -> None
848
849	if self.tok_id != tok_id:
850	#log('position %r %d-%d %r', self.s, self.start_pos,
851	# self.end_pos, self.s[self.start_pos:self.end_pos])
852	raise self._ParseError("Expected %s, got %s" %
853	(Id_str(tok_id), Id_str(self.tok_id)))
854	self._Next()
855
856	def _NextForLines(self):
857	# type: () -> None
858	"""Like _Next, but use the J8 Lines lexer."""
859	self.start_pos = self.end_pos
860	self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()
861
862	def _ParseError(self, msg):
863	# type: (str) -> error.Decode
864	return error.Decode(msg, self.s, self.start_pos, self.end_pos,
865	self.lexer.cur_line_num)
866
867
class Parser(_Parser):
    """JSON and JSON8 Parser.

    This is a recursive descent parser.  The entry point is ParseValue().
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: whether J8 extensions are accepted, as opposed to pure JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse a key-value pair in a Dict.

        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        On return, the current token is the one after the value.

        Raises:
          error.Decode: for EOF, an invalid token, or an integer that's too
            big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The token is either 'true' or 'false', so look at the first char
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that an error is reported at the
            # integer itself.  _ParseError() uses the current token's
            # position, and after _Next() that would be the following token.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: if the input is malformed, or if there's anything
            other than ignored tokens after the value
        """
        self._Next()
        obj = self._ParseValue()

        # The current token must start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1007
1008
1009	class Nil8Parser(_Parser):
1010	"""
1011	Tokens not in JSON8:
1012	LParen RParen Symbol
1013
1014	Tokens not in JSON, but in JSON8 and NIL8:
1015	Identifier (unquoted keys)
1016	Ignored_Comment
1017	"""
1018
1019	def __init__(self, s, is_j8):
1020	# type: (str, bool) -> None
1021	_Parser.__init__(self, s, is_j8)
1022
1023	if 0:
1024
1025	def _LookAhead(self):
1026	# type: () -> Id_t
1027	"""
1028	Don't need this right now
1029	"""
1030	end_pos = self.end_pos # look ahead from last token
1031	while True:
1032	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1033	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1034	Id.Ignored_Comment):
1035	break
1036	return tok_id
1037
1038	def _ParseRecord(self):
1039	# type: () -> nvalue_t
1040	"""
1041	Yaks
1042	(self->Next) => (-> self Next)
1043	(self->Next obj.field) => ((-> self Next) (. obj field))
1044
1045	Similar to
1046	((identity identity) 42) => 42 in Clojure
1047
1048	ASDL
1049	(Node left:(. x4beef2))
1050	(Node left !x4beef2)
1051
1052	# Ambiguous because value can be identifier.
1053	# We have to look ahead to and see if there's a colon :
1054	field =
1055	Identifier ':' value
1056	\| value
1057
1058	record = '(' head field* ')'
1059
1060	- Identifier \| Symbol are treated the same, it's a side effect of
1061	the lexing style
1062	- do positional args come before named args
1063	- () is invalid? Use [] for empty list
1064	"""
1065	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1066
1067	items = [] # type: List[nvalue_t]
1068
1069	self._Next()
1070	if self.tok_id == Id.J8_RParen:
1071	self._Next()
1072	return nvalue.List(items)
1073
1074	#log('TOK %s', Id_str(self.tok_id))
1075	while self.tok_id != Id.J8_RParen:
1076	items.append(self._ParseNil8())
1077	#log('TOK 2 %s', Id_str(self.tok_id))
1078
1079	self._Eat(Id.J8_RParen)
1080
1081	return nvalue.List(items)
1082
1083	def _ParseList8(self):
1084	# type: () -> nvalue_t
1085	"""
1086	List8 = '[' value* ']'
1087
1088	No commas, not even optional ones for now.
1089	"""
1090	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1091
1092	items = [] # type: List[nvalue_t]
1093
1094	self._Next()
1095	if self.tok_id == Id.J8_RBracket:
1096	self._Next()
1097	return nvalue.List(items)
1098
1099	#log('TOK %s', Id_str(self.tok_id))
1100	while self.tok_id != Id.J8_RBracket:
1101	items.append(self._ParseNil8())
1102	#log('TOK 2 %s', Id_str(self.tok_id))
1103
1104	self._Eat(Id.J8_RBracket)
1105
1106	return nvalue.List(items)
1107
1108	def _ParseNil8(self):
1109	# type: () -> nvalue_t
1110	if self.tok_id == Id.J8_LParen:
1111	obj = self._ParseRecord() # type: nvalue_t
1112	#return obj
1113
1114	elif self.tok_id == Id.J8_LBracket:
1115	obj = self._ParseList8()
1116	#return obj
1117
1118	# Primitives are copied from J8 above.
1119	# TODO: We also want hex literals.
1120	elif self.tok_id == Id.J8_Null:
1121	self._Next()
1122	obj = nvalue.Null
1123
1124	elif self.tok_id == Id.J8_Bool:
1125	b = nvalue.Bool(self.s[self.start_pos] == 't')
1126	self._Next()
1127	obj = b
1128
1129	elif self.tok_id == Id.J8_Int:
1130	part = self.s[self.start_pos:self.end_pos]
1131	self._Next()
1132	obj = nvalue.Int(int(part))
1133
1134	elif self.tok_id == Id.J8_Float:
1135	part = self.s[self.start_pos:self.end_pos]
1136	self._Next()
1137	obj = nvalue.Float(float(part))
1138
1139	elif self.tok_id == Id.J8_String:
1140	str_val = nvalue.Str(self.decoded)
1141	self._Next()
1142	obj = str_val
1143
1144	# <- etc.
1145	elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
1146	Id.J8_Comma):
1147	# unquoted "word" treated like a string
1148	part = self.s[self.start_pos:self.end_pos]
1149	self._Next()
1150	obj = nvalue.Symbol(part)
1151
1152	elif self.tok_id == Id.Eol_Tok:
1153	raise self._ParseError('Unexpected EOF while parsing %s' %
1154	self.lang_str)
1155
1156	else: # Id.Unknown_Tok, Id.J8_{LParen,RParen}
1157	raise self._ParseError('Invalid token while parsing %s: %s' %
1158	(self.lang_str, Id_str(self.tok_id)))
1159
1160	#log('YO %s', Id_str(self.tok_id))
1161	if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
1162	#log('AT %s', Id_str(self.tok_id))
1163
1164	# key: "value" -> (: key "value")
1165	part = self.s[self.start_pos:self.end_pos]
1166	op = nvalue.Symbol(part)
1167
1168	self._Next()
1169	operand2 = self._ParseNil8()
1170	infix = nvalue.List([op, obj, operand2]) # type: nvalue_t
1171	#print("--> INFIX %d %s" % (id(infix), infix))
1172	return infix
1173
1174	#next_id = self._LookAhead()
1175	#print('NEXT %s' % Id_str(next_id))
1176
1177	#raise AssertionError()
1178	#print("--> OBJ %d %s" % (id(obj), obj))
1179	return obj
1180
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as one NIL8 value.

    Advances to the first token, parses a single value with _ParseNil8(),
    and then requires that the current token is Id.Eol_Tok.

    Returns:
      The parsed nvalue_t.

    Raises:
      error.Decode
    """
    self._Next()  # move onto the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    # Anything left over after one complete value is an error
    raise self._ParseError('Unexpected trailing input')
1191
1192
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """
        Args:
          s: the text to split into lines
        """
        # NOTE(review): the meaning of the boolean flag is defined by
        # _Parser.__init__, which isn't visible here -- confirm there.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token ID and span, labeled with s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line of input.  May append a line to 'out'.

        Exactly one of the three grammar productions is matched:

        - empty_line: nothing is appended
        - j8_line: self.decoded is appended
        - unquoted_line: the raw slice of self.s is appended, without
          leading or trailing WS_Space

        On return, the token that ended the line has been consumed too.

        Raises:
          error.Decode (from self._ParseError) if a quoted string is followed
            by anything other than optional whitespace and an end token.
          AssertionError if the line begins with a token that none of the
            productions accepts.
        """
        # Optional leading whitespace, shared by all three productions
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # empty_line - return without appending anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # j8_line - quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            # Only one string is allowed per line
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the end token
            return

        # unquoted_line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the one we're about to read, for
                # stripping whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            # Here the current token is the end token, and prev_* describes
            # the last token on the line.
            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the list of decoded lines.

        Empty lines contribute nothing to the result.

        Raises:
          error.Decode
        """
        self._NextForLines()

        lines = []  # type: List[str]

        # The loop has no 'break', so it only ends once the current token is
        # Id.Eol_Tok (or an exception propagates).  That's why no separate
        # "trailing input" check is needed afterward: it could never fire.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1312
1313
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into lines by running J8LinesParser over it.

    Used by @(echo split command sub)

    Args:
      s: the text to split

    Returns:
      The list produced by J8LinesParser.Parse()

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1328
1329
1330	# vim: sw=4