data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1272 lines, 620 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    type_name = value_str(val.tag(), dot=False)
    return type_name
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the heap object 'val'.

        This Python-only definition uses id(), i.e. the object's address,
        which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is
    returned for Null, Bool, Int, Float and Str.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value as ' 0x<hex>', or '' if it has no ID.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)  # -1 means the value has no heap identity
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a unicode code point as a string of UTF-8 bytes.

    Code points above 0xFFFF always produce a 4 byte sequence.  There is no
    check that code <= 0x10FFFF here:
    - it happens in statically parsed $'' u''
    - but not dynamically parsed echo -e / printf, following bash/zsh

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    # Number of continuation bytes that follow the lead byte
    if code <= 0x7FF:
        n_trailing = 1
    elif code <= 0xFFFF:
        n_trailing = 2
    else:
        n_trailing = 3

    # Continuation bytes are produced last-to-first, 6 payload bits each
    encoded = []  # type: List[int]
    remaining = n_trailing
    while remaining > 0:
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker in the high bits, leftover payload below it
    lead = (0x1E << (6 - n_trailing)) | (code & (0x3F >> n_trailing))
    encoded.append(lead)
    encoded.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    tmp = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(tmp)
140
141
# Bit flags for the 'options' argument of _Print() and InstancePrinter.

# Print a marker like [ -->0x123 ] or { -->0x123 } for a List/Dict that is
# part of a cycle, instead of raising error.Encode
SHOW_CYCLES = 1 << 1
# Non-data objects like Eggex can be <Eggex 0xff>, instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: print Inf and NaN floats as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two must agree because self.options is passed to pyj8.WriteString()
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, 0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf with the LOSSY_JSON and INF_NAN_ARE_NULL flags.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)

    error.Encode should be impossible - we show cycles and non-data
    """
    line = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, line, -1, SHOW_CYCLES | SHOW_NON_DATA)

    f.write(line.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as an encoded string on one line.

    For pp proc, etc.

    Args:
      unquoted_ok: if true, and fastfunc.CanOmitQuotes(s) says so, the string
                   is written as-is, without quotes
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a string on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a string on one line, with the LOSSY_JSON flag.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
226
227
# DFS traversal state of a List or Dict, stored in InstancePrinter.visited
UNSEEN = 0  # default: the container has no entry in 'visited' yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # fully printed once; it may be printed again
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Attributes:
      buf: BufWriter that receives the output
      indent: number of spaces per nesting level, or -1 to print everything
              on one line
      options: bit flags (SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
               INF_NAN_ARE_NULL)
      visited: DFS state of each List/Dict seen so far, for cycle detection
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items of a List, recursing via Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, item in enumerate(val.items):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the entries of a Dict, recursing via Print() for values."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k, v in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray like a list of strings; None entries are null."""

        # Same special case as _PrintList.  Without it, an empty array would
        # be printed as '[', two newlines, and ']' when indenting.
        if len(val.strs) == 0:
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc like a dict of string keys and string values."""

        # Same special case as _PrintDict
        if len(val.d) == 0:
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k2, v2 in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k2, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v2, self.options, self.buf)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val to self.buf.

        Args:
          level: nesting depth of val, used for indentation

        Raises:
          error.Encode: for a List/Dict in an object cycle when SHOW_CYCLES
                        isn't set, or a non-data value when SHOW_NON_DATA
                        isn't set
        """

        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # TODO: New format, which should be consistent with pretty
            # printing.  pp line (x) supports BashArray and BashAssoc, e.g.
            # for spec tests.

            # - BashAssoc is Dict[str, str]
            #   (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #   (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
506
507
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None

        # This could be an optimized set or C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet: does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        return

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput for stdout; buf is unused."""

        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
560
561
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Attributes:
      s: the whole input string
      is_j8: whether J8-only syntax is accepted; if false, Next() rejects
             single quoted strings, comments, and the j"" prefix
      lang_str: name of the language, interpolated into error messages
      pos: position in s where the next token starts
      cur_line_num: current line being lexed, starting at 1
      decoded: scratch buffer for the string literal being decoded
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error.Decode ending at end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless the token is a string literal, which is returned as
          Id.J8_String by _DecodeString().

        Raises:
          error.Decode
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Pure JSON: reject syntax that only J8 has
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: decode the rest of the string literal
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        It uses match.MatchJ8LinesToken(), and doesn't look at self.is_j8.

        Raises:
          error.Decode
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: ID of the opening quote token
          str_pos: position right after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode.  self.pos isn't updated in that case, so the error
          span starts at the opening quote.
        """

        # Each iteration lexes one part of the string, and appends its decoded
        # form to self.decoded.  The closing quote ends the loop.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # presumably the char after a backslash, e.g. the n in \n
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between the 3 char prefix and the last char,
                # i.e. the 1f in \u{1f}
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after the 2 char prefix, i.e. the ff in \yff
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2 char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
756
757
class _Parser(object):
    """Token state shared by the JSON/J8, NIL8, and J8 Lines parsers.

    Attributes:
      tok_id: ID of the current token
      start_pos, end_pos: span of the current token in self.s
      decoded: decoded value when the current token is a string, else None
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline, or comment."""

        # start_pos is the END of the previous token, which is where the lexer
        # resumes
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

        while self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                              Id.Ignored_Comment):
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a
            # token can be used to calculate a column number.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token has a different ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error.Decode at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
808
809
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the tokens produced by _Next().  Each
    _Parse* method expects self.tok_id to be the first token of its
    construct, and leaves it on the first token AFTER the construct.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v

        self._Eat(Id.J8_RBrace)

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that's too
                        big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char distinguishes the token for true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that _ParseError() reports the
            # span of the integer itself, not of the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on a lexing or parsing error, or if there's input
                        after the value
        """
        self._Next()
        obj = self._ParseValue()

        # _Next() skips ignored tokens, so the current token starts at the
        # end of the input only if nothing else follows the value
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
949
950
class Nil8Parser(_Parser):
    """
    Parser for NIL8, which produces nvalue_t nodes.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    # Disabled code: the method below is never defined
    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Parse '(' items* ')' into an nvalue.List of the items.

        Design notes, not all of which are implemented below:

        Yaks
          (self->Next)           =>  (-> self Next)
          (self->Next obj.field) =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            # () is currently accepted, as an empty list
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # At EOF, _ParseNil8() raises, which ends this loop
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # At EOF, _ParseNil8() raises, which ends this loop
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, which may be followed by an infix operator.

        If the value is followed by an operator, colon, or comma token, the
        result is the list (op value operand2), where operand2 is parsed
        recursively.

        Raises:
          error.Decode: on EOF or an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            # NOTE(review): unlike Parser._ParseValue, a ValueError from the
            # conversion isn't turned into error.Decode here
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            part = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(part)

            self._Next()
            # The right operand is parsed recursively, so a chain of infix
            # operators nests to the right
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises:
          error.Decode: on a lexing or parsing error, or if there's input
                        after the value
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1133
1134
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token, labeled with s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _QuotedLine(self, out):
        # type: (List[str]) -> None
        """Handle a line whose content is a single J8 string."""
        out.append(self.decoded)
        self._NextForLines()

        if self.tok_id == Id.WS_Space:  # trailing whitespace
            self._NextForLines()

        if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
            raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                   Id_str(self.tok_id))

        self._NextForLines()  # past newline

    def _UnquotedLine(self, out):
        # type: (List[str]) -> None
        """Handle a line like: unquoted "" text on line

        Every token until the end of the line is part of it, except for
        trailing whitespace.
        """
        begin = self.start_pos

        # Track the last token before the end of the line, for stripping
        # whitespace
        last_id = self.tok_id
        last_start = self.start_pos
        self._NextForLines()

        # It would be nicer if "middle" Id.WS_Space tokens didn't have
        # \r, but we're sticking with the JSON spec definition of
        # whitespace.  (As another data point, CPython on Unix allows
        # \r in the middle of expressions, treating it as whitespace.)
        while self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

        # remove trailing whitespace
        finish = last_start if last_id == Id.WS_Space else self.start_pos

        out.append(self.s[begin:finish])

        self._NextForLines()  # past newline

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out' """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - nothing is appended
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        if self.tok_id == Id.J8_String:
            self._QuotedLine(out)
        elif self.tok_id == Id.Lit_Chars:
            self._UnquotedLine(out)
        else:
            raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Return the decoded lines of the input.

        Raises:
          error.Decode
        """
        self._NextForLines()

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive: the loop above only exits at Id.Eol_Tok
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1254
1255
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode s as J8 Lines, returning a list of strings.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1270
1271
1272	# vim: sw=4