data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1240 lines, 596 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	TODO:
6
7	- Many more tests
8	- Run JSONTestSuite
9
10	Later:
11
12	- PrettyPrinter uses hnode.asdl?
13	- color
14	- line wrapping -- do this later
15	- would like CONTRIBUTORS here
16
17	- Unify with ASDL pretty printing - NIL8
18	- {} [] are identical
19	- () is for statically typed ASDL data
20	(command.Simple blame_tok:(...) words:[ ])
21	although we are also using [] for typed ASDL arrays, not just JSON
22	- object IDs
23	- @ x123 can create an ID
24	- ! x123 can reference an ID
25	- <> can be for non-J8 data types? For the = operator
26	- 'hi \(name)' interpolation is useful for code
27
28	- Common between JSON8 and NIL8 - for writing by hand
29	- comments - # line or // line (JSON5 uses // line, following JS)
30	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
31	- commas
32	- JSON8 could have trailing commas rule
33	- NIL8 at least has no commas for [1 2 "hi"]
34	"""
35
36	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40	from asdl import format as fmt
41	from core import error
42	from data_lang import pyj8
43	# dependency issue: consts.py pulls in frontend/option_def.py
44	from frontend import consts
45	from frontend import match
46	from mycpp import mops
47	from mycpp import mylib
48	from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50	import fastfunc
51
52	_ = log
53
54	from typing import cast, Dict, List, Tuple, Optional
55
56
57	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is value_str() applied to the value's tag, with dot=False.
    """
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identity for a heap-allocated value.

        This definition only exists when mylib.PYTHON is true.  It uses
        Python's id(), which returns the address and is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Null, Bool, Int, Float and Str get -1: they're immutable values that are
    copied and compared by value.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            pass  # keep -1
        else:
            heap_id = HeapValueId(val)

    return heap_id
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID for display; used by pp value (42) and = 42.

    Returns:
      '' when ValueId() is -1, otherwise ' 0x' followed by the ID in lower
      case hex (note the leading space).
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    return ''  # value has no identity
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one Unicode code point as a UTF-8 byte string.

    Based on https://stackoverflow.com/a/23502707

    Code points up to 0x7F produce one byte, up to 0x7FF two bytes, up to
    0xFFFF three bytes, and everything else four bytes.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are generated from the low 6 bits upward, so the list
    # is built backward and reversed at the end.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Leading byte: length marker bits plus the remaining high bits
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = []  # type: List[str]
    for byte in bytes_:
        chars.append(chr(byte & 0xFF))
    return ''.join(chars)
143
144
# Bit flags for the 'options' argument of _Print() and InstancePrinter

# On an object cycle, write a '[ -->ID ]' or '{ -->ID }' marker instead of
# raising error.Encode
SHOW_CYCLES = 1 << 1
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy

# Hack until we fully translate: the flag is defined in both modules, so make
# sure the two definitions agree
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags passed through to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val as J8; for json8 write (x) and toJson8().

    No option flags are set.  Caller must handle error.Encode.
    """
    _Print(val, buf, indent, options=0)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val as JSON; for json write (x) and toJson().

    Caller must handle error.Encode.
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON
    _Print(val, buf, indent, options=json_options)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val on a single line, followed by a newline; for pp line (x)."""

    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=line_options)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as a J8 string; for pp proc, etc.

    Args:
      unquoted_ok: if true, and fastfunc.CanOmitQuotes(s) agrees, write the
                   string verbatim without quotes
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a J8 string; for write --json8 $s and compexport."""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a JSON string; for write --json."""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default when a container has no entry yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # fully printed; it can be printed again without being a cycle
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to 'buf'.  'indent' is the number of spaces per nesting
    level, or -1 to put everything on one line.  'options' is a set of bit
    flags: this class tests SHOW_CYCLES and SHOW_NON_DATA, and passes the whole
    set to pyj8.WriteString().
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state, EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at the given level."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at the given level."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List, recursing into Print() for each item."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, item in enumerate(val.items):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict, recursing into Print() for each value."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k, v in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a list of strings; None entries become null."""

        # Same special case as _PrintList, so that an empty array isn't
        # printed as brackets separated by blank lines when indenting.
        if len(val.strs) == 0:
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as an object with string keys and values."""

        # Same special case as _PrintDict
        if len(val.d) == 0:
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k, v in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v, self.options, self.buf)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val, and everything reachable from it, to self.buf.

        Args:
          level: nesting depth of val, used for indentation

        Raises:
          error.Encode: on an object cycle when SHOW_CYCLES isn't set, or on a
                        value of another type when SHOW_NON_DATA isn't set
        """

        # self.indent == -1 is a special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                #       self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                #       self.buf.WriteFloat(val.f)
                # NOTE(review): str() of NaN or an infinity is presumably not
                # valid JSON - confirm what should be written for those.
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # First pass: object ID -> number of times it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output on stdout; buf isn't used."""

        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
537
538
class LexerDecoder(object):
    """Lexer for J8 / JSON text that also decodes string literals.

    Similar interface as SimpleLexer, except each result is a triple

        (token ID, end position, decoded string or None)
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # interpolated into error messages

        self.pos = 0  # where the next token starts

        # Line currently being lexed -- for error messages
        self.cur_line_num = 1

        # A single buffer shared by every string literal, to save GC objects.
        # JSON objects could have thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) a decode error ending at end_pos."""

        # The error span starts at the current position
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Return the next token and advance self.pos past it.

        Raises error.Decode on J8-only syntax when lexing pure JSON, and on
        malformed string literals.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            # Pure JSON: reject what only J8 accepts
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # Any opening quote that got this far starts a string literal
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but uses the J8 Lines token matcher."""

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        elif tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        elif tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodePart(self, tok_id, left_id, str_pos, str_end):
        # type: (Id_t, Id_t, int, int) -> str
        """Decode one token from inside a string literal.

        Args:
          tok_id: kind of token spanning self.s[str_pos:str_end]
          left_id: the opening quote token of the enclosing string

        Raises error.Decode on invalid UTF-8 or an illegal escape.
        """
        if tok_id == Id.Lit_Chars:  # JSON and J8
            if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    str_end)
            return self.s[str_pos:str_end]

        # TODO: would be nice to avoid allocation in all these cases.
        # But LookupCharC() would have to change.

        if tok_id == Id.Char_OneChar:  # JSON and J8
            escaped_ch = self.s[str_pos + 1]
            return consts.LookupCharC(escaped_ch)

        if tok_id == Id.Char_UBraced:  # J8 only
            # Hex digits between \u{ and }
            hex_digits = self.s[str_pos + 3:str_end - 1]
            code_point = int(hex_digits, 16)

            # Same checks in osh/word_compile.py
            if code_point > 0x10ffff:
                raise self._Error(
                    "Code point can't be greater than U+10ffff", str_end)
            if 0xD800 <= code_point and code_point < 0xE000:
                raise self._Error(
                    r"\u{%s} escape is illegal because it's in the surrogate range"
                    % hex_digits, str_end)

            return Utf8Encode(code_point)

        if tok_id == Id.Char_YHex:  # J8 only
            hex_digits = self.s[str_pos + 2:str_end]

            # Same check in osh/word_parse.py
            if left_id != Id.Left_BSingleQuote:
                assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                raise self._Error(
                    r"\y%s escapes not allowed in u'' strings" % hex_digits,
                    str_end)

            byte_val = int(hex_digits, 16)
            return chr(byte_val)

        if tok_id == Id.Char_SurrogatePair:
            high_hex = self.s[str_pos + 2:str_pos + 6]
            low_hex = self.s[str_pos + 8:str_pos + 12]

            # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
            high = int(high_hex, 16) - 0xD800  # high surrogate
            low = int(low_hex, 16) - 0xDC00  # low surrogate
            code_point = 0x10000 + (high << 10) + low

            return Utf8Encode(code_point)

        if tok_id == Id.Char_Unicode4:  # JSON only, unpaired
            hex_digits = self.s[str_pos + 2:str_end]
            code_point = int(hex_digits, 16)
            return Utf8Encode(code_point)

        # Should never happen
        raise AssertionError(Id_str(tok_id))

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Decode a string literal whose body starts at str_pos.

        Returns an Id.J8_String token and advances self.pos past the closing
        quote.  left_id selects the matcher: JSON string tokens for "" and
        j"", J8 string tokens otherwise.
        """
        use_json_matcher = left_id in (Id.Left_DoubleQuote,
                                       Id.Left_JDoubleQuote)

        while True:
            if use_json_matcher:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # Tokens that can't appear inside a string
            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: hand back what was accumulated
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                return Id.J8_String, str_end, result

            # Otherwise it's a literal run or an escape
            part = self._DecodePart(tok_id, left_id, str_pos, str_end)

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
733
734
class _Parser(object):
    """Token state and helpers shared by the parsers in this module.

    Holds the current token: its ID, its span in the input, and the decoded
    string if the token is a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # Current token
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token, skipping spaces, newlines, and comments."""

        while True:
            # A token starts where the one before it ended
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            return
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises error.Decode if it's a different token.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.  No tokens are skipped."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) a decode error spanning the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
785
786
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        # Grab the decoded string first, because _Eat() advances past it
        key = self.decoded
        self._Eat(Id.J8_String)  # Check that it's a string
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:  # empty Dict
            self._Next()
            return value.Dict(d)

        # One pair, then as many ',' pair as there are
        while True:
            k, v = self._ParsePair()
            d[k] = v
            #log('  k %s v %s Id %s', k, v, Id_str(self.tok_id))

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:  # empty List
            self._Next()
            return value.List(items)

        # One value, then as many ',' value as there are
        while True:
            items.append(self._ParseValue())

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token.

        Raises error.Decode on EOF or a token that can't start a value.
        """
        tok_id = self.tok_id

        if tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if tok_id == Id.J8_LBracket:
            return self._ParseList()

        if tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char tells true from false
            is_true = self.s[self.start_pos] == 't'
            self._Next()
            return value.Bool(is_true)

        if tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Int(mops.FromStr(int_str))

        if tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(float_str))

        # UString, BString too
        if tok_id == Id.J8_String:
            decoded = self.decoded
            #log('d %r', self.decoded)
            self._Next()
            return value.Str(decoded)

        if tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when input remains after the value.
        """
        self._Next()
        result = self._ParseValue()
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
917
918
class Nil8Parser(_Parser):
    """
    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            look_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, look_pos = match.MatchJ8Token(self.s, look_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        fields = []  # type: List[nvalue_t]

        self._Next()

        # An immediate ')' means the loop body never runs
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            fields.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(fields)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elements = []  # type: List[nvalue_t]

        self._Next()

        # An immediate ']' means the loop body never runs
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            elements.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elements)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operation if one follows it.

        When the value is followed by an operator, colon, or comma, the result
        is a 3 item list in prefix form:

            key: "value"  ->  (: key "value")

        Raises error.Decode on EOF or a token that can't start a value.
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char tells true from false
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_str))

        elif self.tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_str))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            # No infix operator follows
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_str = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_str)

        self._Next()
        right = self._ParseNil8()
        infix = nvalue.List([op, obj, right])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises error.Decode, including when input remains after the value.
        """
        self._Next()
        result = self._ParseNil8()
        #print("==> %d %s" % (id(result), result))
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
1101
1102
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # J8 Lines are always lexed as J8, never as pure JSON
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token with a label."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, through its end token.

        May append a line to 'out': empty lines append nothing, a quoted line
        appends its decoded string, and an unquoted line appends its text
        without leading and trailing whitespace.

        Raises error.Decode if anything but whitespace follows a quoted string.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the last token before the end, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # the end token starts here

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # The loop only exits at Eol_Tok, so there's no trailing input to
        # check for.
        return lines
1222
1223
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines; used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1238
1239
1240	# vim: sw=4