data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1368 lines, 684 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the name of val's type, for type errors shown in the UI.

    The name is derived from the value's tag and rendered without a dot.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the heap object `val`.

        In Python this is id(), i.e. the object's address, which can take up
        to 64 bits.  In C++ the GC ID can be used instead; it fits within 32
        bits.
        """
        heap_addr = id(val)
        return heap_addr
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an identity for val, or -1 if val has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    heap_id = -1  # the answer for primitives
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the identity of val for display; used by pp value (42) and = 42.

    Returns '' for values without an identity, otherwise a leading space
    followed by the ID in lower-case hex, e.g. ' 0x7f3a'.
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''  # primitives have no ID
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode.  Values above 0xFFFF are always encoded
            as 4 bytes; the upper bound 0x10FFFF is NOT checked here.

    Returns:
      A string of 1 to 4 bytes.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    elif code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits upward, 6 bits each,
    # so the list is built backwards and reversed below.
    bytes_ = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6

    # Leading byte: the length marker (110xxxxx / 1110xxxx / 11110xxx after
    # masking to 8 bits) combined with the remaining high bits of the code.
    b = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(b)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    tmp = [chr(b & 0xFF) for b in bytes_]
    return ''.join(tmp)
140
141
# Bit flags for the `options` argument of _Print() / InstancePrinter.

# On an object cycle, print a marker like [ -->0x... ] or { -->0x... } with
# the object ID, instead of raising error.Encode.
SHOW_CYCLES = 1 << 1  # show as [...] or {...} I think, with object ID
# Print values that aren't List/Dict/primitives/bash arrays as <Type 0x...>,
# instead of raising error.Encode.
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
# Passed through to pyj8.WriteString() as part of the options.
# NOTE(review): exact effect is defined in pyj8, not visible here.
LOSSY_JSON = 1 << 3  # JSON is lossy
# Floats that are infinite or NaN are printed as null.
INF_NAN_ARE_NULL = 1 << 4  # for JSON

# Hack until we fully translate
# The flag is defined in both modules; make sure the two values agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write val to buf, using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output goes
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, LOSSY_JSON
    """
    InstancePrinter(buf, indent, options).Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val as J8; for json8 write (x) and toJson8().

    No option flags are set.  Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val as JSON; for json write (x) and toJson().

    Sets LOSSY_JSON and INF_NAN_ARE_NULL, so infinite and NaN floats are
    written as null.

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    _Print(val, buf, indent, options=LOSSY_JSON | INF_NAN_ARE_NULL)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val on a single line, followed by a newline; for pp line (x).

    Args:
      val: the value to print
      f: writer that receives the line
    """
    # error.Encode should be impossible - we show cycles and non-data
    buf = mylib.BufWriter()

    # indent of -1 means everything goes on one line
    _Print(val, buf, -1, options=SHOW_CYCLES | SHOW_NON_DATA)

    f.write(buf.getvalue())
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return val printed on one line, like Python's repr.

        Unused; the enclosing 'if 0' keeps it from being defined.
        """
        # error.Encode should be impossible - we show cycles and non-data
        buf = mylib.BufWriter()
        _Print(val, buf, -1, options=SHOW_CYCLES | SHOW_NON_DATA)
        return buf.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as a string; for pp proc, etc.

    Args:
      s: the string to encode
      buf: where the output goes
      unquoted_ok: if True and fastfunc.CanOmitQuotes(s) holds, s is written
                   as is, without quotes
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        # General case: quoted and escaped, on one line
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a one-line string with no option flags set.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a one-line string with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
238
239
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default when a container has no entry yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # fully printed once; it may be printed again
244
245
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is appended to a buffer.  Lists and Dicts are tracked while they
    are being printed, so that object cycles are detected.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output goes
          indent: number of spaces per nesting level, or -1 to put everything
                  on one line
          options: bit flags like SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items of a List, separated by commas."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the key-value pairs of a Dict, separated by commas."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object for bash data: {"type": ..., "data":

        Args:
          type_str: already quoted, and includes the trailing comma
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a wrapped object of index -> string."""

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                # The integer index is written as a quoted string key
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapped object of index -> string.

        Entries that are None are skipped.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't use the index to decide on commas, because None entries
            # are skipped
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapped object of string -> string."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print val at the given nesting level.

        Raises:
          error.Encode: on an object cycle when SHOW_CYCLES isn't set, or on
                        a value that isn't data when SHOW_NON_DATA isn't set
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
602
603
class PrettyPrinter(object):
    """Placeholder for a richer printer; unused right now.

    It could enhance the = operator, with output going to a polymorphic
    ColorOutput.

    Planned features, like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing reads it yet
        """
        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Detect the console output on stdout and call PrettyTree() with it.

        The buf argument is not used yet.
        """

        # Or print to stderr?
        # Then print those with ASDL
        self.PrettyTree(val, fmt.DetectConsoleOutput(mylib.Stdout()))
656
657
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Each call to Next() or NextForLines() returns (token id, end position,
    decoded string or None) and advances self.pos.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, tokens that aren't part of JSON are rejected
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) an error.Decode spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        The third item is the decoded value for string tokens, and None for
        everything else.

        Raises error.Decode on tokens that aren't allowed in JSON when is_j8
        is False, and on errors within string literals.
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: lex and decode the rest of the string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        All four kinds of opening quote are accepted here, regardless of
        is_j8.
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: token ID of the opening quote; it selects the string lexer
                   and decides whether byte escapes are allowed
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF, bad escapes, unescaped control chars, invalid
                        UTF-8, or code points that are out of range
        """

        # Each iteration lexes one part of the string, and appends its decoded
        # form to self.decoded
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: hand back everything accumulated so far
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

                # TODO: would be nice to avoid allocation in all these cases.
                #       But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                # A single raw byte
                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2-char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Encoded as is; no surrogate range check is done here
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
852
853
class _Parser(object):
    """State and token helpers shared by the parsers in this module.

    Holds the current token: its ID, its start and end position in the input,
    and its decoded value when it's a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # The new token starts where the previous one ended.  Note that
            # this isn't the start of e.g. a J8_Bool token until the ignored
            # tokens before it have been consumed.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises error.Decode if the current token is something else.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Reads exactly one token; nothing is skipped.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) an error.Decode located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
904
905
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser.  Each _Parse* method expects the current
    token to be the first token of the construct, and leaves the current
    token just past it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one dict entry: string ':' value"""

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v

        self._Eat(Id.J8_RBrace)

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse a Dict, List or primitive, starting at the current token.

        Raises:
          error.Decode: on EOF, on a token that can't start a value, or on an
                        integer that can't be converted
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char tells 'true' and 'false' apart
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that a failure is reported at the
            # integer itself, rather than at the token that follows it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when anything other than ignored
        tokens follows the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token starts at the end of the input only if nothing
        # else follows the value
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1045
1046
1047	class Nil8Parser(_Parser):
1048	"""
1049	Tokens not in JSON8:
1050	LParen RParen Symbol
1051
1052	Tokens not in JSON, but in JSON8 and NIL8:
1053	Identifier (unquoted keys)
1054	Ignored_Comment
1055	"""
1056
1057	def __init__(self, s, is_j8):
1058	# type: (str, bool) -> None
1059	_Parser.__init__(self, s, is_j8)
1060
1061	if 0:
1062
1063	def _LookAhead(self):
1064	# type: () -> Id_t
1065	"""
1066	Don't need this right now
1067	"""
1068	end_pos = self.end_pos # look ahead from last token
1069	while True:
1070	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1071	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1072	Id.Ignored_Comment):
1073	break
1074	return tok_id
1075
1076	def _ParseRecord(self):
1077	# type: () -> nvalue_t
1078	"""
1079	Yaks
1080	(self->Next) => (-> self Next)
1081	(self->Next obj.field) => ((-> self Next) (. obj field))
1082
1083	Similar to
1084	((identity identity) 42) => 42 in Clojure
1085
1086	ASDL
1087	(Node left:(. x4beef2))
1088	(Node left !x4beef2)
1089
1090	# Ambiguous because value can be identifier.
1091	# We have to look ahead to and see if there's a colon :
1092	field =
1093	Identifier ':' value
1094	\| value
1095
1096	record = '(' head field* ')'
1097
1098	- Identifier \| Symbol are treated the same, it's a side effect of
1099	the lexing style
1100	- do positional args come before named args
1101	- () is invalid? Use [] for empty list
1102	"""
1103	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1104
1105	items = [] # type: List[nvalue_t]
1106
1107	self._Next()
1108	if self.tok_id == Id.J8_RParen:
1109	self._Next()
1110	return nvalue.List(items)
1111
1112	#log('TOK %s', Id_str(self.tok_id))
1113	while self.tok_id != Id.J8_RParen:
1114	items.append(self._ParseNil8())
1115	#log('TOK 2 %s', Id_str(self.tok_id))
1116
1117	self._Eat(Id.J8_RParen)
1118
1119	return nvalue.List(items)
1120
1121	def _ParseList8(self):
1122	# type: () -> nvalue_t
1123	"""
1124	List8 = '[' value* ']'
1125
1126	No commas, not even optional ones for now.
1127	"""
1128	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1129
1130	items = [] # type: List[nvalue_t]
1131
1132	self._Next()
1133	if self.tok_id == Id.J8_RBracket:
1134	self._Next()
1135	return nvalue.List(items)
1136
1137	#log('TOK %s', Id_str(self.tok_id))
1138	while self.tok_id != Id.J8_RBracket:
1139	items.append(self._ParseNil8())
1140	#log('TOK 2 %s', Id_str(self.tok_id))
1141
1142	self._Eat(Id.J8_RBracket)
1143
1144	return nvalue.List(items)
1145
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, followed by an optional infix tail.

    The value is a record '(...)', a list '[...]', a primitive (null, bool,
    int, float, string), or an unquoted word, which becomes nvalue.Symbol.

    If the token right after the value is an operator, colon, or comma, the
    pair is rewritten to prefix form:

      key: "value"  ->  (: key "value")

    The right operand is parsed recursively, so chains nest to the right.

    Raises the error built by self._ParseError() on EOF or on a token that
    can't start a value.
    """
    # Snapshot of the token that starts this value.  self.tok_id changes
    # whenever _Next() is called, so token text is always read first.
    tok = self.tok_id

    if tok == Id.J8_LParen:
        left = self._ParseRecord()  # type: nvalue_t

    elif tok == Id.J8_LBracket:
        left = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok == Id.J8_Null:
        self._Next()
        left = nvalue.Null

    elif tok == Id.J8_Bool:
        # The first character tells 'true' apart from 'false'
        left = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif tok == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Int(int(text))

    elif tok == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Float(float(text))

    elif tok == Id.J8_String:
        # self.decoded belongs to the current token; grab it before moving on
        left = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif tok in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                 Id.J8_Comma):
        # unquoted "word" treated like a string
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Symbol(text)

    elif tok == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # anything else, e.g. Id.Unknown_Tok or a stray closing delimiter
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands alone
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return left

    # key: "value" -> (: key "value")
    op_text = self.s[self.start_pos:self.end_pos]
    op = nvalue.Symbol(op_text)

    self._Next()
    right = self._ParseNil8()
    infix = nvalue.List([op, left, right])  # type: nvalue_t
    return infix
1218
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as exactly one NIL8 value.

    Raises:
      error.Decode
    """
    self._Next()  # load the first token
    result = self._ParseNil8()

    # The value must be the last thing in the input
    if self.tok_id == Id.Eol_Tok:
        return result
    raise self._ParseError('Unexpected trailing input')
1229
1230
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """Set up a parser over the string 's'."""
        # NOTE(review): the True flag is forwarded to _Parser.__init__, which
        # is not shown here; presumably it selects J8 mode -- confirm.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token and its span, labeled 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, through its end token.  May append to 'out'.

        Empty lines append nothing.  A quoted line appends the decoded J8
        string.  An unquoted line appends the raw text, with leading and
        trailing whitespace removed.

        Raises the error built by self._ParseError() when anything other
        than whitespace follows a quoted string on the same line.
        """
        # Leading whitespace is never part of the line
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the end token
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end token, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # text runs up to the end token

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # None of the three line forms in the grammar matched
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return its lines.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # No trailing-input check is needed: the loop above has no 'break',
        # so it only finishes once the current token is Id.Eol_Tok.
        return lines
1350
1351
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into J8 Lines.  Used by @(echo split command sub)

    This is a convenience wrapper around J8LinesParser.

    Raises:
      error.Decode

    There are 3 kinds of errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    return lines_parser.Parse()
1366
1367
1368	# vim: sw=4