data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1258 lines, 611 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the name of a value's type, for type errors shown in the UI.

    The name is produced by value_str() from the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies a heap-allocated value.

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        object_id = id(val)
        return object_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for Null, Bool, Int, Float and Str.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            pass
        else:
            result = HeapValueId(val)
    return result
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID for display; used by pp value (42) and = 42.

    Returns:
      ' 0x<hex>' (with a leading space) when ValueId() yields an ID, or the
      empty string when ValueId() yields -1.
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one Unicode code point as a UTF-8 byte string.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  It is NOT validated against 0x10FFFF; anything
            above 0xFFFF is encoded with 3 continuation bytes.

    Returns:
      A string with one chr() per encoded byte.
    """
    n_cont = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        n_cont = 3

    # Continuation bytes are produced least-significant first, then the whole
    # sequence is reversed once the leading byte has been appended.
    encoded = []  # type: List[int]
    for _ in xrange(n_cont):
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6

    lead = (0x1E << (6 - n_cont)) | (code & (0x3F >> n_cont))
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
140
141
# Option bits for InstancePrinter / _Print(); combine them with |.

# Print a container that closes a cycle as [ -->ID ] or { -->ID }, instead of
# raising error.Encode
SHOW_CYCLES = 1 << 1
# Print non-data objects like Eggex as <Eggex 0xff>, instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: print Inf and NaN floats as null

# Hack until we fully translate: the flag is defined in both modules, and the
# two definitions must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print a value to buf with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the encoded text is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the option flags, e.g. SHOW_CYCLES
    """
    InstancePrinter(buf, indent, options).Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Encode a value with no options set; for json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Encode a value as JSON; for json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to f on a single line, then a newline; for pp line (x)"""

    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=line_options)

    encoded = line_buf.getvalue()
    f.write(encoded)
    f.write('\n')
189
190
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to buf, quoted unless quoting may be skipped.

    For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the result is written
      unquoted_ok: if True, s is written verbatim when
                   fastfunc.CanOmitQuotes(s) says that is allowed
    """
    can_write_raw = unquoted_ok and fastfunc.CanOmitQuotes(s)
    if not can_write_raw:
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
200
201
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded on one line with default options.

    For write --json8 $s and compexport
    """

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
212
213
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded on one line with the LOSSY_JSON option.

    For write --json
    """

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b
    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
223
224
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default when a container has no entry in the visited dict
EXPLORING = 1  # set while a container's children are being printed
FINISHED = 2  # set after a container has been printed completely
229
230
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    The output goes to a BufWriter.  List and Dict values are tracked by heap
    ID while they're being printed, so that object cycles are either shown
    (SHOW_CYCLES) or rejected with error.Encode.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the encoded text is written
          indent: number of spaces per level, or -1 for everything on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, etc.
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside a container."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a container's closing bracket."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items of a List between [ and ]."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for index, element in enumerate(val.items):
            if index != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(element, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the key-value pairs of a Dict between { and }."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        num_written = 0
        for key, child in iteritems(val.d):
            if num_written != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(child, level + 1)

            num_written += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray like a list of strings; None entries are null.

        Unlike _PrintList(), there is no special case for an empty array.
        """
        self.buf.write('[')
        self._MaybeNewline()
        for index, item in enumerate(val.strs):
            if index != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if item is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(item, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc like a dict with string keys and string values.

        Unlike _PrintDict(), there is no special case for an empty array.
        """
        self.buf.write('{')
        self._MaybeNewline()
        num_written = 0
        for key, str_value in iteritems(val.d):
            if num_written != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(str_value, self.options, self.buf)

            num_written += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print a value and everything it contains.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict in an object cycle when SHOW_CYCLES
            isn't set, or for a non-data value when SHOW_NON_DATA isn't set
        """

        # self.indent == -1 is a special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                int_str = mops.ToStr(val.i)
                self.buf.write(int_str)

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if ((self.options & INF_NAN_ARE_NULL) and
                    (math.isnan(fl) or math.isinf(fl))):
                    # JavaScript JSON lib behavior: Inf and NaN are null
                    # Python has a bug in the encoder by default, and then
                    # allow_nan=False raises an error
                    self.buf.write('null')
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    self.buf.write(str(fl))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    # node.js prints which index closes the cycle
                    raise error.Encode("Can't encode List%s in object cycle" %
                                       ValueIdString(val))

                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                else:
                    self.visited[heap_id] = EXPLORING
                    self._PrintList(val, level)
                    self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Dict%s in object cycle" %
                                       ValueIdString(val))

                if node_state == FINISHED:
                    # Print it AGAIN, as for List above.
                    self._PrintDict(val, level)
                else:
                    self.visited[heap_id] = EXPLORING
                    self._PrintDict(val, level)
                    self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,'b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    type_name = ValType(val)
                    id_suffix = ValueIdString(val)
                    self.buf.write('<%s%s>' % (type_name, id_suffix))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
492
493
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Run PrettyTree() on val with a console output for stdout.

        The buf argument is currently not used.
        """

        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
546
547
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() each return a (token ID, end position, decoded
    string) tuple.  The third element is None except for string tokens, which
    are returned as Id.J8_String together with their decoded contents.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the entire input to lex
          is_j8: if False, Next() rejects single quoted strings, comments and
                 the j"" prefix
          lang_str: language name that's interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error that spans self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax when self.is_j8 is False, or for an
            error inside a string (see _DecodeString)
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Give specific errors for J8 syntax that's not valid in plain JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote means we lex and decode the whole string here
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char on an
            unquoted line, or for an error inside a string (see _DecodeString)
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        # All 4 kinds of opening quote are accepted here
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: ID of the opening quote token.  It selects the JSON or J8
                   string lexer, and says whether \\y escapes are allowed.
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an illegal code
            point.  self.pos is only updated on success, so the error span
            starts at the opening quote token.
        """

        while True:
            # Each iteration lexes one part of the string body
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: return everything accumulated in self.decoded
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the decoded char
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip the 3 chars before the hex digits, and the 1 char
                # after them
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip the 2 chars before the hex digits
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                # A raw byte is only accepted when the string was opened with
                # Id.Left_BSingleQuote
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits: the first starts 2 chars into
                # the token, and the second starts 8 chars into it
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Skip the 2 chars before the hex digits.  Unlike the
                # Char_UBraced case, there's no range check here.
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
742
743
class _Parser(object):
    """Token state and helpers shared by the parsers in this file.

    It holds the current token: its ID, its span in the input, and its decoded
    string when it's a string token.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the entire input to parse
          is_j8: selects "J8" vs. "JSON" for error messages, and is passed on
                 to the lexer
        """
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline, or comment."""

        # start_pos is the END of the token before the current one
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

        while self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                              Id.Ignored_Comment):
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token has the given ID, then advance.

        Raises:
          error.Decode: if the current token has a different ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer, and don't skip any tokens."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
794
795
class Parser(_Parser):
    """JSON and JSON8 Parser.

    The entry point is ParseValue().  The _Parse* methods each expect the
    current token to be the first token of the construct they parse, and leave
    the current token just past it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one key-value pair of a Dict.

        pair = string ':' value

        Raises:
          error.Decode: if the key isn't a string, or the colon is missing
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char of the token tells true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the span of the
            # current token, so this reports the error at the integer itself,
            # not at the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the entire input as a single value.

        Raises:
          error.Decode: on a syntax error, or if anything other than ignored
            tokens follows the value
        """
        self._Next()
        obj = self._ParseValue()

        # The token after the value must start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
935
936
class Nil8Parser(_Parser):
    """Parser for NIL8.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse items between ( and ), and return them as an nvalue.List.

        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        fields = []  # type: List[nvalue_t]

        self._Next()

        # An immediate ) means the loop body never runs, and the result is an
        # empty list
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            fields.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(fields)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elements = []  # type: List[nvalue_t]

        self._Next()

        # An immediate ] means the loop body never runs, and the result is an
        # empty list
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            elements.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elements)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, which may be the left side of an infix form.

        If the token after the value is an operator, colon or comma, then a
        second value is parsed, and the result is a 3 item list:

            key: "value"  ->  (: key "value")

        Raises:
          error.Decode: on EOF or an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char of the token tells true from false
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_text))

        elif self.tok_id == Id.J8_Float:
            float_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_text))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_text = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_text)

        self._Next()
        right = self._ParseNil8()
        infix = nvalue.List([op, obj, right])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the entire input as a single NIL8 value.

        Raises:
          error.Decode: on a syntax error, or if a token follows the value
        """
        self._Next()
        result = self._ParseNil8()
        #print("==> %d %s" % (id(result), result))
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
1119
1120
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token, prefixed with a label."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseUnquotedLine(self, out):
        # type: (List[str]) -> None
        """Append an unquoted line to 'out', without trailing whitespace.

        The current token must be the Id.Lit_Chars that starts the line.
        """
        # ' unquoted "" text on line '  # read every token until end
        line_start = self.start_pos
        while True:
            # for stripping whitespace
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        line_end = self.start_pos
        if last_id == Id.WS_Space:
            line_end = last_start  # remove trailing whitespace

        out.append(self.s[line_start:line_end])

        self._NextForLines()  # past newline

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Raises:
          error.Decode: if a quoted string is followed by anything other than
            whitespace
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        at_line_end = self.tok_id in (Id.J8_Newline, Id.Eol_Tok)
        if at_line_end:
            # Empty line - return without doing anything
            self._NextForLines()
            return

        if self.tok_id == Id.J8_String:
            # Quoted string on line
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        if self.tok_id == Id.Lit_Chars:
            # Unquoted line
            self._ParseUnquotedLine(out)
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the entire input, and return the decoded lines.

        Raises:
          error.Decode
        """
        self._NextForLines()

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # The loop only exits when the current token is Id.Eol_Tok, so this
        # check can't fail
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1240
1241
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode a string as J8 Lines; used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1256
1257
1258	# vim: sw=4