data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1249 lines, 603 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	TODO:
6
7	- Many more tests
8	- Run JSONTestSuite
9
10	Later:
11
12	- PrettyPrinter uses hnode.asdl?
13	- color
14	- line wrapping -- do this later
15	- would like CONTRIBUTORS here
16
17	- Unify with ASDL pretty printing - NIL8
18	- {} [] are identical
19	- () is for statically typed ASDL data
20	(command.Simple blame_tok:(...) words:[ ])
21	although we are also using [] for typed ASDL arrays, not just JSON
22	- object IDs
23	- @ x123 can create an ID
24	- ! x123 can reference an ID
25	- <> can be for non-J8 data types? For the = operator
26	- 'hi \(name)' interpolation is useful for code
27
28	- Common between JSON8 and NIL8 - for writing by hand
29	- comments - # line or // line (JSON5 uses // line, following JS)
30	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
31	- commas
32	- JSON8 could have trailing commas rule
33	- NIL8 at least has no commas for [1 2 "hi"]
34	"""
35
36	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40	from asdl import format as fmt
41	from core import error
42	from data_lang import pyj8
43	# dependency issue: consts.py pulls in frontend/option_def.py
44	from frontend import consts
45	from frontend import match
46	from mycpp import mops
47	from mycpp import mylib
48	from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50	import fastfunc
51
52	_ = log
53
54	from typing import cast, Dict, List, Tuple, Optional
55
56
57	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is produced by value_str() from the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identity for a heap-allocated value.

        This Python definition uses id(), which returns the address and can be
        up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
75
76
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str; otherwise HeapValueId(val).
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            return -1
        else:
            # Everything else is identified by its heap identity
            return HeapValueId(val)
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value for display; used by pp value (42) and = 42

    Returns:
      '' when ValueId() gives -1, otherwise ' 0x' followed by the ID in
      lower case hex.  Note the leading space.
    """
    obj_id = ValueId(val)
    if obj_id == -1:  # no ID to show for this kind of value
        return ''

    hex_digits = mylib.hex_lower(obj_id)
    return ' 0x%s' % hex_digits
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Code points up to 0x7F become one byte; larger ones become a lead byte
    followed by 1, 2 or 3 continuation bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    n_cont = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        n_cont = 3

    # Continuation bytes are produced low-order first, then reversed below
    out = []  # type: List[int]
    remaining = n_cont
    while remaining > 0:
        out.append(0x80 | (code & 0x3F))  # 10xxxxxx
        code >>= 6
        remaining -= 1

    # Lead byte: length marker in the high bits, leftover payload in the low
    lead = (0x1E << (6 - n_cont)) | (code & (0x3F >> n_cont))
    out.append(lead)
    out.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in out]
    return ''.join(chars)
143
144
# Bit flags for the 'options' argument of _Print() / InstancePrinter.

# On a List/Dict that is part of a cycle, InstancePrinter writes '[ -->ID ]'
# or '{ -->ID }' instead of raising error.Encode
SHOW_CYCLES = 1 << 1
# non-data objects like Eggex can be <Eggex 0xff>, instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
# JSON is lossy.  InstancePrinter passes its options to pyj8.WriteString().
LOSSY_JSON = 1 << 3

# Hack until we fully translate: the flag is defined in both modules, and the
# two definitions must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: the writer that receives the output
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags handed to InstancePrinter; 0 means none
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val to buf with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val to buf with the LOSSY_JSON option.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON
    _Print(val, buf, indent, options=json_options)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=line_options)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf, either encoded as a string or as-is.

    For pp proc, etc.

    Args:
      s: the string to write
      buf: the writer that receives the output
      unquoted_ok: if true, s is written unchanged when
                   fastfunc.CanOmitQuotes(s) says that's allowed
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        # Encoded form, on one line
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s printed as a value.Str on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s printed as a value.Str on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default when the container has no entry yet
EXPLORING = 1  # set before printing the container's children
FINISHED = 2  # set after the container has been printed
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to the BufWriter given to the constructor.  Strings and
    dict keys are written by pyj8.WriteString(), which receives self.options.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: the writer that receives the output
          indent: number of spaces to indent, or -1 for everything on one line
          options: bit flags; SHOW_CYCLES and SHOW_NON_DATA are tested here
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Cycle detection state for List and Dict
        # Key is HeapValueId(val)
        # Value is EXPLORING or FINISHED; a missing key means UNSEEN
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List, printing each item with Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict, printing each value with Print()."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray like a list of strings; None entries are null.

        Unlike _PrintList(), there's no special case for an empty array.
        """
        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc like a dict of string keys to string values.

        Unlike _PrintDict(), there's no special case for an empty BashAssoc.
        """
        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k, v in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v, self.options, self.buf)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val to self.buf.

        Args:
          val: the value to print
          level: nesting depth, which determines indentation

        Raises:
          error.Encode: for a List or Dict that is part of a cycle, unless
            SHOW_CYCLES is set; and for a value that has no case below, unless
            SHOW_NON_DATA is set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; not read anywhere in this class yet
        """
        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet: does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with an output detected from stdout.

        The buf argument isn't used.
        """
        # Or print to stderr?
        self.PrettyTree(val, fmt.DetectConsoleOutput(mylib.Stdout()))

        # Then print those with ASDL
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() each return a tuple of (token ID, end position,
    decoded string).  The decoded string is None, except for Id.J8_String.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if false, Next() rejects single quoted strings, comments, and
                 the j"" prefix
          lang_str: name of the language, interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Start of the next token to lex
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error that spans self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: when is_j8 is false and the input has a J8-only
            construct, or when a string fails to decode
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Plain JSON: reject what only J8 allows, with a hint
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: decode the whole string, and return it as one
        # Id.J8_String token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Unlike Next(), this doesn't consult self.is_j8.

        Raises:
          error.Decode: for invalid UTF-8 in unquoted text, an unescaped ASCII
            control char, or a string that fails to decode
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which selects the string lexer
          str_pos: where the string body starts.  Callers pass the end
                   position of the opening quote token.

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an illegal escape
        """

        # Each iteration lexes one part of the string and appends its decoded
        # form to self.decoded, until the closing quote.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                # self.pos is only updated here, so the errors above and below
                # span from the start of the string token
                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits inside \u{ }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # \uXXXX\uYYYY: two groups of 4 hex digits
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Encoded with no surrogate range check, unlike Char_UBraced
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
733
734
class _Parser(object):
    """Base class for the parsers below: holds the lexer and the current token.

    self.start_pos and self.end_pos delimit the current token in self.s, and
    self.decoded is the string that the lexer returned along with it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # No token has been read yet
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline, or comment."""

        # Note on start_pos: it's the END of the token before, so it's updated
        # for ignored tokens too.
        while True:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            return
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

            #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must be tok_id.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        expected = Id_str(tok_id)
        actual = Id_str(self.tok_id)
        raise self._ParseError("Expected %s, got %s" % (expected, actual))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error that spans the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
785
786
class Parser(_Parser):
    """JSON and JSON8 Parser.

    ParseValue() is the entry point.  Each _Parse* method expects
    self.tok_id to be the first token of what it parses, and leaves it on the
    first token after.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that's too big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Only the first char is needed to tell true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the current
            # token's span, so this makes the error point at the integer
            # rather than at the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # _Next() skips spaces, newlines and comments, so start_pos is only
        # short of the end when something else follows the value
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
926
927
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces nvalue_t.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    # Disabled: the method below is never defined
    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse '(' ... ')' into an nvalue.List of the items inside.

        As implemented, () is accepted and gives an empty list.  The notes
        below are design ideas; field names aren't handled specially here.

        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # On EOF, _ParseNil8() raises, so this loop terminates
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # On EOF, _ParseNil8() raises, so this loop terminates
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, which may be the left side of an infix form.

        If the token after the value is an operator, colon or comma, the
        result is the list (op value operand2).  operand2 is parsed by a
        recursive call, so a chain of infix forms nests to the right.

        Raises:
          error.Decode: on EOF or an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # Only the first char is needed to tell true from false
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Anything not handled above, e.g. Id.J8_RParen
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        # Infix: the token after the value is an operator
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            part = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(part)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises error.Decode, including when a token follows the value.
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1110
1111
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

    "json" "json2"
    "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # is_j8 is always True here
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token, labeled with s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Nothing is appended for an empty line.  The current token is left
        after the line's end token.

        Raises:
          error.Decode: if a quoted string is followed by more text
        """
        #self._Show('1')
        # Leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # for stripping whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            # The line is a slice of the input.  prev_id is the last token
            # before the end token.
            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # Not expected: the line doesn't start with any token handled above
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Return the lines in the input; empty lines are omitted.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # The loop above only exits on Eol_Tok, so this check is defensive
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1231
1232
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split s into lines with J8LinesParser.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1247
1248
1249	# vim: sw=4