data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1326 lines, 653 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is produced by value_str() from the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identity for a value object.

        This Python definition uses id(), which returns the object's address
        and can be up to 64 bits wide.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for them.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID for display; used by pp value (42) and = 42.

    Returns:
      ' 0x<hex>' (with a leading space) when ValueId() gives an ID, or the
      empty string when it gives -1.
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode.  Values above 0xFFFF always produce a
        4-byte sequence; there is no check against 0x10FFFF here.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII: a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits upward, so the list
    # is built in reverse order and flipped at the end.
    encoded = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Leading byte: length marker bits, then the remaining high bits
    payload_mask = 0x3F >> num_cont_bytes
    lead = (0x1E << (6 - num_cont_bytes)) | (code & payload_mask)
    encoded.append(lead)
    encoded.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
140
141
142	SHOW_CYCLES = 1 << 1 # show a cycle as [ -->ID ] or { -->ID } with the object ID, instead of raising error.Encode
143	SHOW_NON_DATA = 1 << 2 # show non-data objects as <Type ID>, e.g. <Eggex 0xff>, instead of raising error.Encode
144	LOSSY_JSON = 1 << 3 # JSON is lossy; set by the JSON entry points and passed on to pyj8.WriteString()
145	INF_NAN_ARE_NULL = 1 << 4 # for JSON: infinity and NaN floats are printed as null
146
147	# Hack until we fully translate: the flag must have the same value in pyj8
148	assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: receives the output
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val as J8, with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val as JSON, with LOSSY_JSON and INF_NAN_ARE_NULL set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    display_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, line_buf, -1, options=display_options)

    f.write(line_buf.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as a J8 string, for pp proc, etc.

    Args:
      s: the string to encode
      buf: receives the output
      unquoted_ok: if true, s is written verbatim when
        fastfunc.CanOmitQuotes(s) says quotes aren't needed
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a J8 string on one line.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a JSON string on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
226
227
228	# DFS traversal states, stored in InstancePrinter.visited for cycle detection
229	UNSEEN = 0 # default for a container with no entry in 'visited'
230	EXPLORING = 1 # container is being printed now; reaching it again means a cycle
231	FINISHED = 2 # container was printed completely; it can be printed again
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON text into a buffer."""

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: receives the output
          indent: number of spaces per nesting level, or -1 to put everything
            on one line
          options: bit flags like SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Cycle detection state for List and Dict.
        # Key is HeapValueId(val); value is EXPLORING or FINISHED.  A missing
        # key means UNSEEN.
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...], recursing into each item."""
        items = val.items
        if len(items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        first = True
        for item in items:
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as {"key": value, ...}, recursing into each value."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        first = True
        for key, item in iteritems(val.d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object: the "type" entry, then the "data" key.

        Args:
          type_str: written verbatim after "type":, so the caller passes the
            quotes and trailing comma, e.g. '"BashArray",'
        """
        out = self.buf

        out.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        out.write('"type":')
        self._MaybeSpace()
        out.write(type_str)  # "BashArray", or "BashAssoc",
        self._MaybeNewline()

        self._ItemIndent(level)
        out.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as {"type": "BashArray", "data": {"0": ...}}.

        The data object is keyed by the stringified index; entries that are
        None are skipped.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            num_printed = 0
            for index, s in enumerate(val.strs):
                if s is None:
                    continue

                if num_printed != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(index), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                num_printed += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as {"type": "BashAssoc", "data": {"k": "v"}}."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for key, s in iteritems(val.d):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(key, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print one value, recursing into containers.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict in an object cycle when SHOW_CYCLES
            is unset, and for a non-data value when SHOW_NON_DATA is unset
        """
        # self.indent == -1 is a special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.
                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                # JavaScript JSON lib behavior: Inf and NaN are null
                # Python has a bug in the encoder by default, and then
                # allow_nan=False raises an error
                use_null = (self.options & INF_NAN_ARE_NULL) != 0

                if math.isinf(fl):
                    if use_null:
                        float_str = 'null'  # negative infinity is null too
                    elif fl < 0:
                        float_str = '-INFINITY'
                    else:
                        float_str = 'INFINITY'
                elif math.isnan(fl):
                    float_str = 'null' if use_null else 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    float_str = str(fl)

                self.buf.write(float_str)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                elif node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)

                else:
                    self.visited[heap_id] = EXPLORING
                    self._PrintList(val, level)
                    self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                elif node_state == FINISHED:
                    # Print it AGAIN, same as for List
                    self._PrintDict(val, level)

                else:
                    self.visited[heap_id] = EXPLORING
                    self._PrintDict(val, level)
                    self.visited[heap_id] = FINISHED

            # TODO: New format, which should be consistent with pretty
            # printing.  pp line (x) supports BashArray and BashAssoc, e.g.
            # for spec tests.

            # - BashAssoc is Dict[str, str]
            #   (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #   (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    type_name = ValType(val)
                    id_suffix = ValueIdString(val)
                    self.buf.write('<%s%s>' % (type_name, id_suffix))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
560
561
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set - a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # For a first pass: object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""
        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output on stdout; buf is unused."""
        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
614
615
class LexerDecoder(object):
    """J8 lexer and string decoder.

    The interface is similar to SimpleLexer, except that each token comes
    with an optional decoded string.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: whether J8 extensions are accepted, as opposed to pure JSON
          lang_str: language name used in error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) an error spanning self.pos to end_pos."""
        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless the token is a string.

        Raises:
          error.Decode: for J8-only syntax in JSON mode, and for errors inside
            string literals
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Reject everything that's J8-only when lexing pure JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # Any left quote that got this far starts a string to decode
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or ASCII control chars on an
            unquoted line, and for errors inside string literals
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        elif tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        elif tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Returns a string token and updates self.pos

        Args:
          left_id: the left quote token, which selects the string lexer
          str_pos: position just after the left quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an illegal code
            point
        """
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # Tokens that end decoding with an error
            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # The closing quote: hand back everything accumulated so far
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                escape_ch = self.s[str_pos + 1]
                part = consts.LookupCharC(escape_ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two \uXXXX escapes in a row: 4 hex digits each
                high_hex = self.s[str_pos + 2:str_pos + 6]
                low_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                high = int(high_hex, 16) - 0xD800  # high surrogate
                low = int(low_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (high << 10) + low

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
810
811
class _Parser(object):
    """Base class for the parsers: holds the lexer and current token."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: True to parse J8, False to parse pure JSON
        """
        lang_str = "JSON"
        if is_j8:
            lang_str = "J8"

        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.lexer = LexerDecoder(s, is_j8, lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline or comment."""
        while True:
            # This isn't the start of a J8_Bool token, it's the END of the
            # token before it
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                   Id.Ignored_Comment):
                #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')
                return

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance past it.

        Raises:
          error.Decode: if the current token is a different one
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer, and skip no tokens."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
862
863
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: True to parse JSON8, False to parse pure JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one key-value pair:  string ':' value"""
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        A later pair with the same key overwrites an earlier one.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that's too big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The token is 'true' or 'false', so the first char decides
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() reads the current
            # token's start_pos/end_pos and the lexer's line number, so after
            # _Next() the error would point at the token following the
            # integer.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the entire input as a single value.

        Raises:
          error.Decode: on a lexing or parsing error, or when input remains
            after the value
        """
        self._Next()
        obj = self._ParseValue()

        # The value must be the last thing in the input.  start_pos is where
        # the current token begins, which is len(s) once all input is consumed.
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1003
1004
class Nil8Parser(_Parser):
    """Parser for NIL8.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """Return the ID of the next significant token, without consuming.

            Don't need this right now
            """
            pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, pos = match.MatchJ8Token(self.s, pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse a parenthesized record into an nvalue.List of its items.

        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        fields = []  # type: List[nvalue_t]

        self._Next()
        # For '()' the loop body never runs, and _Eat() consumes the ')'
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            fields.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(fields)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elems = []  # type: List[nvalue_t]

        self._Next()
        # For '[]' the loop body never runs, and _Eat() consumes the ']'
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            elems.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elems)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operator that may follow it.

        When the token after the value is an operator, colon or comma, the
        result is a prefix-style list:

            key: "value"  ->  (: key "value")

        Raises:
          error.Decode: on EOF or an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The token is 'true' or 'false', so the first char decides
            is_true = self.s[self.start_pos] == 't'
            obj = nvalue.Bool(is_true)
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_text))

        elif self.tok_id == Id.J8_Float:
            float_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_text))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_text = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_text)

        self._Next()
        right = self._ParseNil8()
        infix = nvalue.List([op, obj, right])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the entire input as a single NIL8 value.

        Raises:
          error.Decode: on a lexing or parsing error, or when input remains
            after the value
        """
        self._Next()
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1187
1188
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Create a parser over the string 's'.

        NOTE(review): the meaning of the literal True passed to
        _Parser.__init__ is defined by the base class, which is not shown
        here -- confirm against _Parser.
        """
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the label 's' with the current token's ID and span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line of input, possibly appending a string to 'out'.

        Empty lines append nothing.  A quoted line appends self.decoded; an
        unquoted line appends the raw slice of the input with surrounding
        whitespace tokens excluded.

        Raises:
          error.Decode (via self._ParseError) when a quoted string is
          followed by more text on the same line.
        """
        #self._Show('1')

        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            # Only one quoted string is allowed per line.
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the end token
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # '  unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token just consumed, for stripping trailing
                # whitespace once the end token is reached.
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                # The token before the end token was whitespace, so cut the
                # line where that whitespace began.
                string_end = prev_start  # remove trailing whitespace
            else:
                # Otherwise the line runs up to the start of the end token.
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # No other token kind is handled here; reaching this point is treated
        # as a programming error rather than a decode error.
        # NOTE(review): presumably _NextForLines rejects invalid input before
        # this point -- confirm against the lexer.
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the list of decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        # The loop only exits once the current token is Id.Eol_Tok, so no
        # separate "trailing input" check is needed after it.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1308
1309
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into J8 Lines; used by @(echo split command sub).

    Returns:
      The list of strings produced by J8LinesParser.Parse().

    Raises:
      error.Decode, in 3 situations:
      - J8 string syntax error inside quotes
      - Extra input on line
      - unquoted line isn't utf-8
    """
    line_parser = J8LinesParser(s)
    result = line_parser.Parse()
    return result
1324
1325
1326	# vim: sw=4