data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1244 lines, 600 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	TODO:
6
7	- Many more tests
8	- Run JSONTestSuite
9
10	Later:
11
12	- PrettyPrinter uses hnode.asdl?
13	- color
14	- line wrapping -- do this later
15	- would like CONTRIBUTORS here
16
17	- Unify with ASDL pretty printing - NIL8
18	- {} [] are identical
19	- () is for statically typed ASDL data
20	(command.Simple blame_tok:(...) words:[ ])
21	although we are also using [] for typed ASDL arrays, not just JSON
22	- object IDs
23	- @ x123 can create an ID
24	- ! x123 can reference an ID
25	- <> can be for non-J8 data types? For the = operator
26	- 'hi \(name)' interpolation is useful for code
27
28	- Common between JSON8 and NIL8 - for writing by hand
29	- comments - # line or // line (JSON5 uses // line, following JS)
30	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
31	- commas
32	- JSON8 could have trailing commas rule
33	- NIL8 at least has no commas for [1 2 "hi"]
34	"""
35
36	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40	from asdl import format as fmt
41	from core import error
42	from data_lang import pyj8
43	# dependency issue: consts.py pulls in frontend/option_def.py
44	from frontend import consts
45	from frontend import match
46	from mycpp import mops
47	from mycpp import mylib
48	from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50	import fastfunc
51
52	_ = log
53
54	from typing import cast, Dict, List, Tuple, Optional
55
56
57	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name is produced by value_str() from the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the heap object behind 'val'.

        This Python version uses id(), i.e. the object's address, which is
        up to 64 bits.  In C++ we can use the GC ID, which fits within 32
        bits.
        """
        address = id(val)
        return address
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for an object, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for them.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            result = HeapValueId(val)
    return result
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the object ID of 'val' for display; used by pp value (42) and = 42

    Returns:
      ' 0x<hex>' (with a leading space) for values that have an ID, or the
      empty string for values where ValueId() gives -1.
    """
    obj_id = ValueId(val)
    if obj_id != -1:
        return ' 0x%s' % mylib.hex_lower(obj_id)
    return ''
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Values above 0xFFFF always use the 4-byte form;
            there is no upper bound check here (see the comment below).

    Returns:
      A string of 1 to 4 bytes.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII: a single byte

    # Number of continuation bytes that follow the leading byte
    n_cont = 0
    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        n_cont = 3

    # Built in reverse: the lowest 6 bits come first, the leading byte last.
    encoded = []  # type: List[int]
    for _ in xrange(n_cont):
        low_six = code & 0x3F
        encoded.append(0x80 | low_six)
        code >>= 6

    # Leading byte: a length marker in the high bits, plus whatever bits of
    # the code point remain.
    marker = 0x1E << (6 - n_cont)
    payload_mask = 0x3F >> n_cont
    encoded.append(marker | (code & payload_mask))
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
143
144
# Option flags for the printer, combined with bitwise OR.

# Print a cycle marker containing the object ID, instead of raising
# error.Encode
SHOW_CYCLES = 1 << 1
# Print non-data objects like Eggex as <Eggex 0xff>, instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
# JSON is lossy
LOSSY_JSON = 1 << 3

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value as J8; for json8 write (x) and toJson8()

    No options are set, so cycles and non-data objects raise.
    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value as JSON; for json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON
    _Print(val, buf, indent, options=json_options)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Print a value on a single line, followed by a newline; for pp line (x)"""

    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=line_options)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to 'buf' in J8 form; for pp proc, etc.

    Args:
      unquoted_ok: if true, and fastfunc.CanOmitQuotes(s) says so, the string
                   is written as-is, without quotes.
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return 's' encoded as a one-line J8 string.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return 's' encoded as a one-line JSON string (LOSSY_JSON option).

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
# DFS traversal state of a container, used for cycle detection while printing
UNSEEN = 0  # not visited yet
EXPLORING = 1  # currently being printed
FINISHED = 2  # completely printed at least once
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    List and Dict values are tracked by heap ID during the depth-first
    traversal, so that object cycles are either reported with error.Encode,
    or shown as a marker when SHOW_CYCLES is set.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: number of spaces to indent, or -1 for everything on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent an item inside a container at the given nesting level."""
        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent the closing bracket of a container at the given level."""
        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List, recursing into its items via Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, item in enumerate(val.items):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict, recursing into its values via Print()."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k, v in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray like a list of strings; unset entries are null."""

        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc like a dict with string keys and values."""

        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k2, v2 in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k2, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v2, self.options, self.buf)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write 'val' to self.buf, recursively.

        Args:
          level: nesting depth of 'val', used for indentation

        Raises:
          error.Encode: for an object cycle when SHOW_CYCLES isn't set, or
                        for a non-data object when SHOW_NON_DATA isn't set
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                # UNSEEN or FINISHED.  We print a JSON tree, which means we
                # can visit and print nodes MANY TIMES, as long as they're not
                # in a cycle.
                #
                # A FINISHED node must be marked EXPLORING again while it's
                # re-printed.  With SHOW_CYCLES, a node that contains a cycle
                # does become FINISHED, and printing it again without the
                # mark would recurse forever.
                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                # Same reasoning as for List: always mark the node as
                # EXPLORING while its contents are being printed.
                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,'b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: doesn't print anything yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Pass 'val' to PrettyTree() with console output; 'buf' is unused."""

        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
        pass
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: whether J8 extensions are accepted, as opposed to pure JSON
          lang_str: language name used in error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in 's' where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The last item is None
          unless the token is a string.

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Constructs that J8 has, but pure JSON rejects
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which determines the string lexer
          str_pos: position just past the opening quote

        Raises:
          error.Decode
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            #
            # Tokens that are errors
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # Closing quote: hand back what was accumulated
            #

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The character after the backslash
                escaped = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip 3 chars at the start and 1 at the end, as in \u{3bc}
                hex_digits = self.s[str_pos + 3:str_end - 1]
                code = int(hex_digits, 16)

                # Same checks in osh/word_compile.py
                if code > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= code and code < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_digits, str_end)

                part = Utf8Encode(code)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip 2 chars at the start, as in \yff
                hex_digits = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" %
                        hex_digits, str_end)

                code = int(hex_digits, 16)
                part = chr(code)

            elif tok_id == Id.Char_SurrogatePair:
                # Two 4-digit hex numbers, each after a 2-char prefix
                hi_hex = self.s[str_pos + 2:str_pos + 6]
                lo_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                hi = int(hi_hex, 16) - 0xD800  # high surrogate
                lo = int(lo_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (hi << 10) + lo

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                hex_digits = self.s[str_pos + 2:str_end]
                code = int(hex_digits, 16)
                part = Utf8Encode(code)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
733
734
class _Parser(object):
    """Token state and helpers shared by the parsers in this file."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: whether to accept J8, as opposed to pure JSON
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token: its ID, its span in 's', and the decoded value
        # if it's a string
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token, skipping space, newlines and comments."""

        while True:
            # This isn't the start of a J8_Bool token, it's the END of the
            # token before it
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            return
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a
            # token can be used to calculate a column number.

            #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is 'tok_id', then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
785
786
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the tokens from LexerDecoder.  Use
    ParseValue() as the entry point.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: whether to accept J8, as opposed to pure JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse a dict entry: string ':' value"""

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key appears more than once, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode: on EOF, on a token that can't start a value, or on an
                        integer that's too big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The token is 'true' or 'false', so the first char is enough
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that the error below is located at
            # the integer itself, not at the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on a syntax error, or when input follows the value
        """
        self._Next()
        obj = self._ParseValue()
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
921
922
class Nil8Parser(_Parser):
    """
    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
           ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
            Identifier ':' value
          | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        fields = []  # type: List[nvalue_t]

        self._Next()  # past the (

        # An empty record falls straight through to the closing paren
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            fields.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(fields)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elements = []  # type: List[nvalue_t]

        self._Next()  # past the [

        # An empty list falls straight through to the closing bracket
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            elements.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elements)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, and then an optional infix expression.

        If the value is followed by an operator, colon, or comma token, the
        result is the list (op value operand2), where operand2 is parsed
        recursively.

        Raises:
          error.Decode: on EOF, or a token that can't start a value
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The token is 'true' or 'false', so the first char is enough
            is_true = self.s[self.start_pos] == 't'
            self._Next()
            obj = nvalue.Bool(is_true)

        elif self.tok_id == Id.J8_Int:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(text))

        elif self.tok_id == Id.J8_Float:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(text))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(text)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            # No infix operator follows
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_text = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_text)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises:
          error.Decode: on a syntax error, or when input follows the value
        """
        self._Next()
        result = self._ParseNil8()
        #print("==> %d %s" % (id(result), result))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return result
1105
1106
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

    "json" "json2"
    "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # J8 Lines always accepts J8 strings, so is_j8 is True
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token with a label."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line; may append a line to 'out'

        Raises:
          error.Decode: e.g. when a quoted string is followed by more text
        """
        #self._Show('1')

        # Leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # Unquoted line
        # ' unquoted "" text on line '  # read every token until end
        line_start = self.start_pos

        # Remember the token just before the end token, for stripping
        # whitespace
        last_id = self.tok_id
        last_start = self.start_pos
        self._NextForLines()

        # It would be nicer if "middle" Id.WS_Space tokens didn't have
        # \r, but we're sticking with the JSON spec definition of
        # whitespace.  (As another data point, CPython on Unix allows
        # \r in the middle of expressions, treating it as whitespace.)
        while self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

        line_end = self.start_pos
        if last_id == Id.WS_Space:
            line_end = last_start  # remove trailing whitespace

        out.append(self.s[line_start:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input, and return the decoded lines.

        Raises:
          error.Decode
        """
        self._NextForLines()

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive check; the loop above only exits at Eol_Tok
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1226
1227
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines; used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    lines = lines_parser.Parse()
    return lines
1242
1243
1244	# vim: sw=4