data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1267 lines, 620 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is what value_str() gives for the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the heap object `val`.

        This Python version returns id(val).  Python's id() returns the
        address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is
    returned for Null, Bool, Int, Float and Str.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            heap_id = -1
        else:
            heap_id = HeapValueId(val)

    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # -1 means the value has no object identity
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Code points up to 0x7F give 1 byte, up to 0x7FF give 2 bytes, up to
    0xFFFF give 3 bytes, and everything larger is encoded with 4 bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits up, i.e. in reverse
    # order; each one carries 6 bits of the code point.
    encoded = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6

    # Leading byte: length marker bits plus whatever bits of `code` remain.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in encoded]
    return ''.join(chars)
140
141
# Option flags for the printer; combine them with bitwise OR.

# show as [...] or {...} I think, with object ID
SHOW_CYCLES = 1 << 1
# non-data objects like Eggex can be <Eggex 0xff>
SHOW_NON_DATA = 1 << 2
# JSON is lossy
LOSSY_JSON = 1 << 3
# for JSON
INF_NAN_ARE_NULL = 1 << 4

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write `val` to `buf` using a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the printer option flags
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write `val` to `buf` with no printer options set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write `val` to `buf` with LOSSY_JSON and INF_NAN_ARE_NULL set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write `val` on a single line to `f`, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=line_options)

    f.write(line_buf.getvalue())
    f.write('\n')
189
190
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write string `s` to `buf`, encoded on one line.

    If unquoted_ok is set and fastfunc.CanOmitQuotes(s) is true, the string
    is written as is, without quotes.

    For pp proc, etc.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
200
201
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return string `s` encoded on one line, with no printer options set.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
212
213
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return string `s` encoded on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
223
224
# DFS traversal state, stored per container while printing
UNSEEN = 0  # not visited yet
EXPLORING = 1  # currently being printed
FINISHED = 2  # printed completely
229
230
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to `buf`.

    `indent` is the number of spaces per nesting level.  -1 is a special
    value that means everything is on one line.  It's like

        JSON.stringify(d, null, 0)

    except we use -1, not 0.  0 can still have newlines.

    `options` is a bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON and
    INF_NAN_ARE_NULL.

    Print() raises error.Encode for an object cycle (unless SHOW_CYCLES is
    set) and for a value that isn't data (unless SHOW_NON_DATA is set).
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # DFS state for cycle detection.
        # Key is HeapValueId(val) of a List or Dict.
        # Value is EXPLORING or FINISHED; a missing key means UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at nesting `level`."""
        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at `level`."""
        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintFloat(self, fl):
        # type: (float) -> None
        """Write a float, with special spellings for infinity and NaN."""
        if math.isinf(fl):
            if self.options & INF_NAN_ARE_NULL:
                s = 'null'  # negative infinity is null too
            else:
                s = 'INFINITY'
                if fl < 0:
                    s = '-' + s
        elif math.isnan(fl):
            if self.options & INF_NAN_ARE_NULL:
                # JavaScript JSON lib behavior: Inf and NaN are null
                # Python has a bug in the encoder by default, and then
                # allow_nan=False raises an error
                s = 'null'
            else:
                s = 'NAN'
        else:
            # TODO: can we avoid intermediate allocation?
            # self.buf.WriteFloat(val.f)
            s = str(fl)

        self.buf.write(s)

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List; items are printed recursively with Print()."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict; values are printed recursively with Print()."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a list of strings; None entries are null.

        Unlike _PrintList(), there is no special case for an empty array.
        """
        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as an object with string keys and values.

        Unlike _PrintDict(), there is no special case for an empty one.
        """
        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k, v in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v, self.options, self.buf)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write `val` to self.buf.

        Args:
          val: the value to print
          level: nesting depth, used to compute indentation

        Raises:
          error.Encode: for a List or Dict that is part of an object cycle
            when SHOW_CYCLES isn't set, and for a value of any other type
            than the ones handled below when SHOW_NON_DATA isn't set.
        """
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                self._PrintFloat(val.f)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    # We're inside this very List, so this is a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    # We're inside this very Dict, so this is a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
501
502
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """Remember the maximum column and start with no reference counts."""
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; currently does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        return

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a console output for stdout.

        `buf` isn't used yet.
        """
        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
555
556
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return a (token ID, end position, decoded)
    tuple.  `decoded` is None, except for Id.J8_String tokens.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # used in error messages

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Make (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Things that are only allowed in J8, not in pure JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # An opening quote: decode the whole string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        elif tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        elif tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: token ID of the opening quote
          str_pos: position right after the opening quote

        Raises error.Decode for EOF inside the string, a bad backslash
        escape, an unescaped ASCII control char, invalid UTF-8, and an
        illegal escape.
        """
        # Strings with double quotes are lexed with the JSON string lexer;
        # all other strings are lexed with the J8 string lexer.
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: we're done with this string
                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip 3 chars on the left and 1 char on the right
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2 char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
751
752
class _Parser(object):
    """State and token helpers shared by the parsers in this file.

    It holds the current token: its ID, its start and end position in the
    input, and its decoded value when the lexer provides one.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        # name of the language, for error messages
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment.

        Each token read starts where the previous one ended, so start_pos is
        set to the old end_pos before the lexer is called.
        """
        while True:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue  # skip it
            return

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is `tok_id`, then advance.

        Raises error.Decode if it's a different token.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Make (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
803
804
class Parser(_Parser):
    """JSON and JSON8 Parser.

    Usage: Parser(s, is_j8).ParseValue() returns a value_t, or raises
    error.Decode.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key appears more than once, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log(' [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log(' [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token.

        Afterward, the current token is the one following the value.

        Raises error.Decode on EOF, on a token that can't start a value, and
        on an integer that can't be converted.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char tells 'true' and 'false' apart
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that the error is located at the
            # integer itself, and not at the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input left over after the
        value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token must start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
944
945
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces nvalue_t.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse '(' item* ')' into an nvalue.List.

        As implemented, () gives an empty list.

        Design notes:

        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        # precondition
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        record_items = []  # type: List[nvalue_t]

        self._Next()

        # An empty record skips the loop, and then _Eat() consumes the ')'
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            record_items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(record_items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        list_items = []  # type: List[nvalue_t]

        self._Next()

        # An empty list skips the loop, and then _Eat() consumes the ']'
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            list_items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(list_items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 item, which may be the left side of an infix form.

        If the item is followed by an operator, colon or comma token, then
        the result is the list (op item operand2), where operand2 is parsed
        recursively.

        Raises error.Decode on EOF and on a token that can't start an item.
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char tells 'true' and 'false' apart
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            int_part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_part))

        elif self.tok_id == Id.J8_Float:
            float_part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # any other token, e.g. Id.Unknown_Tok
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #next_id = self._LookAhead()
            #print('NEXT %s' % Id_str(next_id))

            #raise AssertionError()
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_str = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_str)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 item.

        Raises error.Decode, including when there's input left over.
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._ParseError('Unexpected trailing input')
1128
1129
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Log the current token, for debugging."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Nothing is appended for an empty line.

        Raises error.Decode when a quoted string is followed by something
        other than optional whitespace and the end of the line.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # for stripping whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input, and return the list of decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]

        # The loop only ends at Id.Eol_Tok, so no check for trailing input is
        # needed after it.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1249
1250
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into lines with J8LinesParser.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1265
1266
1267	# vim: sw=4