data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1299 lines, 639 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
# Copied from ui.ValType() so this module doesn't depend on ui
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI."""
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the object behind `val`.

        In Python this is id(), i.e. the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        return id(val)
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles.

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively: once there is a small string
            # optimization, some strings will be values, so assume all are.
            return -1
        else:
            return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)  # -1 means the value has no heap identity
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return the UTF-8 encoded bytes for a Unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Anything above 0xFFFF gets a 4-byte encoding.  There is no check that
    code <= 0x10FFFF here:
    - it happens in statically parsed $'' u''
    - but not dynamically parsed echo -e / printf, following bash/zsh
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits upward, so the list
    # is built backwards and reversed at the end.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker bits, then whatever bits of the code are left
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in bytes_]
    return ''.join(chars)
140
141
# Bit flags for the `options` argument of _Print() / InstancePrinter
SHOW_CYCLES = 1 << 1  # show as [...] or {...} I think, with object ID
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON

# Hack until we fully translate: pyj8 has its own copy of this flag
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize `val` into `buf` using a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the flags passed through to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write `val` to `buf` with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write `val` to `buf` with the LOSSY_JSON and INF_NAN_ARE_NULL flags.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write `val` on a single line to `f`, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    buf = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, buf, -1, options=line_options)

    f.write(buf.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string `s` to `buf`, quoted unless quotes can be omitted.

    For pp proc, etc.

    Args:
      unquoted_ok: if True and fastfunc.CanOmitQuotes(s), write `s` verbatim
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return `s` encoded on one line with no option flags set.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    return out.getvalue()
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return `s` encoded on one line with the LOSSY_JSON flag.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    return out.getvalue()
226
227
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default for containers not in the dict
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # fully printed once; may be printed again
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON."""

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: spaces per nesting level, or -1 for everything on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
                   INF_NAN_ARE_NULL
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is a DFS state: EXPLORING or FINISHED (missing means UNSEEN)
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indent for an item inside a container at `level`."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indent for the closing bracket of a container."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write the items of a List, recursing through Print()."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, item in enumerate(val.items):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write the entries of a Dict, recursing through Print()."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        first = True
        for k, v in iteritems(val.d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as {"type":"BashArray","value":{index: str}}.

        Entries that are None are left out.
        """
        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write('"BashArray",')

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"value":')
        self._MaybeSpace()
        self.buf.write('{')
        self._MaybeNewline()

        # The entries live one level deeper, inside the "value" object
        inner = level + 1
        first = True
        for i, s in enumerate(val.strs):
            if s is None:
                continue

            if not first:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(inner)

            pyj8.WriteString(str(i), self.options, self.buf)
            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(s, self.options, self.buf)

            first = False

        self._MaybeNewline()

        self._BracketIndent(inner)
        self.buf.write('}')

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as an object of string keys and string values."""
        self.buf.write('{')
        self._MaybeNewline()
        first = True
        for k, v in iteritems(val.d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v, self.options, self.buf)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write `val` to self.buf, at nesting depth `level`.

        Raises:
          error.Encode: for a List/Dict cycle when SHOW_CYCLES is off, or for
            a value of any other type when SHOW_NON_DATA is off
        """
        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we
                    # can visit and print nodes MANY TIMES, as long as
                    # they're not in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    # We are still inside this List, so this is a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN, for the same reason as List above
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    # We are still inside this Dict, so this is a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # TODO: New format, which should be consistent with pretty
            # printing.  pp line (x) supports BashArray and BashAssoc, e.g.
            # for spec tests.
            #
            # - BashAssoc is Dict[str, str]
            #   (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #   (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
533
534
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; not used by any method yet
        """
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass: object ID -> number of times it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput for stdout.

        `buf` is not used.
        """
        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
        pass
587
588
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects single quoted strings, comments
                 and the j"" prefix
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Build (not raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Returns a token and updates self.pos

        The third item is the decoded string for Id.J8_String, else None.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            # Constructs that are J8 only get a JSON-specific error
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines"""
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which selects the string lexer
          str_pos: position just past the opening quote
        """
        # Double quoted strings use the JSON string lexer; the rest use J8
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: hand back everything accumulated so far
                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The character after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two 4-digit escapes back to back: \uXXXX\uYYYY
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
783
784
class _Parser(object):
    """Token state and helpers shared by the parsers in this file."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token: its ID and span, plus the decoded value when
        # it's a string
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""
        ignored = (Id.Ignored_Space, Id.Ignored_Newline, Id.Ignored_Comment)

        while True:
            # start_pos is the END of the token before the one being read,
            # ignored tokens included.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id not in ignored:
                break
            # The lexer counts Ignored_Newline tokens in cur_line_num, which
            # is shown in error messages.
            # TODO: The position of the last newline and a token can be used
            # to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is `tok_id`, then advance past it.

        Raises:
          error.Decode: if the current token is a different one
        """
        if self.tok_id != tok_id:
            #log('position %r %d-%d %r', self.s, self.start_pos,
            #    self.end_pos, self.s[self.start_pos:self.end_pos])
            raise self._ParseError("Expected %s, got %s" %
                                   (Id_str(tok_id), Id_str(self.tok_id)))
        self._Next()

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Build (not raise) an error located at the current token."""
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            self.lexer.cur_line_num)
835
836
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one `string ':' value` entry of a Dict."""

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v  # a repeated key overwrites the earlier value
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value at the current token and advance past it.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first character of the token is enough to tell which one
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing: _ParseError() uses the current token
            # span, so the error must be raised while the integer is still
            # the current token.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on a syntax error, or if anything other than ignored
            tokens follows the value
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token must start at the end of input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
976
977
class Nil8Parser(_Parser):
    """
    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)            =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead and see if there's a colon :
        field =
            Identifier ':' value
          | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            child = self._ParseNil8()
            items.append(child)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            child = self._ParseNil8()
            items.append(child)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operator if one follows.

        If the token after the value is an operator, colon or comma, then the
        right operand is parsed too, and the result is the list
        (op value operand2).
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first character of the token is enough to tell which one
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            part = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(part)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises:
          error.Decode: on a syntax error, or if a token follows the value
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1160
1161
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Log the current token, prefixed with `s`.  For debugging."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """May append a line to 'out'

        Raises:
          error.Decode: if something follows a quoted string on its line
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the last token before the end, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Return the decoded lines of the input.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # The loop above only exits on Eol_Tok, so this is a sanity check
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1281
1282
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split `s` into lines with J8LinesParser.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    return parser.Parse()
1297
1298
1299	# vim: sw=4