data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1367 lines, 683 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identity for a value object.

        This is Python's id(), i.e. the object's address, which is up to 64
        bits.  In C++ we can use the GC ID instead, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for an object, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is returned
    for them.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            heap_id = -1
        else:
            heap_id = HeapValueId(val)

    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the object ID of a value; used by pp value (42) and = 42.

    Returns:
      ' 0x<lowercase hex>' with a leading space, or '' when ValueId() reports
      -1, i.e. the value has no identity.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode

    Returns:
      A string of 1 to 4 chars, each of which holds one encoded byte.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the LOW bits first, so the list is
    # built backwards and reversed at the end.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Leading byte: length marker bits, plus whatever bits of the code point
    # are left over.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in bytes_]
    return ''.join(chars)
140
141
# Option flags for the printer.  They are bit flags, combined with |

# show as [...] or {...} I think, with object ID
SHOW_CYCLES = 1 << 1
# non-data objects like Eggex can be <Eggex 0xff>
SHOW_NON_DATA = 1 << 2
# JSON is lossy
LOSSY_JSON = 1 << 3
# for JSON
INF_NAN_ARE_NULL = 1 << 4

# Hack until we fully translate: the flag here must agree with the one that
# pyj8 defines.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write a value to buf, using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the text is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES and LOSSY_JSON; defaults to none
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with the LOSSY_JSON and INF_NAN_ARE_NULL flags set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=SHOW_CYCLES | SHOW_NON_DATA)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def Repr(val):
    # type: (value_t) -> str
    """Return a one-line string representation of a value.

    For assert [x]

    This is like Python's repr
    """
    # error.Encode should be impossible - we show cycles and non-data
    out = mylib.BufWriter()
    _Print(val, out, -1, options=SHOW_CYCLES | SHOW_NON_DATA)
    return out.getvalue()
203
204
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to buf, encoded on one line.  For pp proc, etc.

    Args:
      s: the string to encode
      buf: where the result is written
      unquoted_ok: if True, s is written verbatim when
        fastfunc.CanOmitQuotes() says that's OK
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
214
215
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded on one line, with no option flags set.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    return out.getvalue()
226
227
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded on one line, with the LOSSY_JSON flag set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    return out.getvalue()
237
238
# States of a container during the printer's depth-first traversal
UNSEEN = 0  # not visited yet
EXPLORING = 1  # currently being printed
FINISHED = 2  # printed completely
243
244
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    List and Dict values are tracked by heap ID during the traversal, so that
    object cycles are either shown (SHOW_CYCLES) or reported as error.Encode.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: number of spaces to indent, or -1 for everything on one line
          options: bit flags like SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...]; cycle checks are done by Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as {"key": value, ...}; cycle checks are in Print()."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the {"type": ..., "data": wrapper used for bash containers.

        Args:
          type_str: already-quoted type name INCLUDING the trailing comma,
            e.g. '"BashArray",'
          level: nesting level of the wrapper object
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a wrapper whose data maps index -> string."""

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                # The index is written as a quoted string key
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapper whose data maps index -> string.

        Entries that are None are skipped.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # A flag rather than the index decides whether to write a comma,
            # because None entries are skipped.
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapper whose data maps string -> string."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print a value, recursing into containers.

        Args:
          val: the value to print
          level: current nesting level, used for indentation

        Raises:
          error.Encode: if a List or Dict is part of an object cycle and
            SHOW_CYCLES isn't set, or if the value isn't data and
            SHOW_NON_DATA isn't set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
601
602
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references

        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        return

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Detect the console output for stdout, and call PrettyTree() on it.

        The buf argument isn't used yet.
        """

        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
655
656
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects single quoted strings, comments and
            the j"" prefix
          lang_str: language name that's interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The string is None
          unless the token is a string, which is decoded by _DecodeString().

        Raises:
          error.Decode: for constructs that aren't allowed in pure JSON, or
            for errors inside a string
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char on an
            unquoted line, or for errors inside a string
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: token ID of the opening quote; it selects the string lexer
          str_pos: where to start lexing, i.e. the end of the opening quote
            token

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF, a bad backslash escape, an unescaped ASCII
            control char, invalid UTF-8, or an illegal code point
        """
        is_json_str = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if is_json_str:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: hand back everything that was accumulated
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # Skip the backslash
                escaped = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip 3 leading chars, and 1 trailing char
                hex_str = self.s[str_pos + 3:str_end - 1]
                code_point = int(hex_str, 16)

                # Same checks in osh/word_compile.py
                if code_point > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= code_point and code_point < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_str, str_end)

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_YHex:  # J8 only
                hex_str = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % hex_str,
                        str_end)

                byte_val = int(hex_str, 16)
                part = chr(byte_val)

            elif tok_id == Id.Char_SurrogatePair:
                hi_hex = self.s[str_pos + 2:str_pos + 6]
                lo_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                hi = int(hi_hex, 16) - 0xD800  # high surrogate
                lo = int(lo_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (hi << 10) + lo

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                hex_str = self.s[str_pos + 2:str_end]
                code_point = int(hex_str, 16)
                part = Utf8Encode(code_point)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
851
852
class _Parser(object):
    """Token-level state that's shared by the parsers in this module.

    It holds the current token: its ID, its start and end positions in the
    input, and the decoded string, if any.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: True for J8, False for JSON.  It's passed to the lexer, and
            it picks the language name used in error messages.
        """
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # The start of a token is the END of the token before it, which
            # may be an ignored one.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token has the given ID, then advance.

        Raises:
          error.Decode: if the current token has a different ID
        """

        if self.tok_id != tok_id:
            #log('position %r %d-%d %r', self.s, self.start_pos,
            #    self.end_pos, self.s[self.start_pos:self.end_pos])
            expected = Id_str(tok_id)
            actual = Id_str(self.tok_id)
            raise self._ParseError("Expected %s, got %s" % (expected, actual))
        self._Next()

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, this doesn't skip any tokens.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
903
904
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: True to parse JSON8, False to parse JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse a dict entry, and return (key, value).

        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v  # a repeated key overwrites the earlier value
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        On return, the current token is the one after the value.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that's too big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char tells 'true' apart from 'false'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the position of
            # the current token, so the error should be created while that's
            # still the integer, not the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """ Parse the whole input as a single value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        obj = self._ParseValue()

        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1044
1045
1046	class Nil8Parser(_Parser):
1047	"""
1048	Tokens not in JSON8:
1049	LParen RParen Symbol
1050
1051	Tokens not in JSON, but in JSON8 and NIL8:
1052	Identifier (unquoted keys)
1053	Ignored_Comment
1054	"""
1055
1056	def __init__(self, s, is_j8):
1057	# type: (str, bool) -> None
1058	_Parser.__init__(self, s, is_j8)
1059
1060	if 0:
1061
1062	def _LookAhead(self):
1063	# type: () -> Id_t
1064	"""
1065	Don't need this right now
1066	"""
1067	end_pos = self.end_pos # look ahead from last token
1068	while True:
1069	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1070	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1071	Id.Ignored_Comment):
1072	break
1073	return tok_id
1074
1075	def _ParseRecord(self):
1076	# type: () -> nvalue_t
1077	"""
1078	Yaks
1079	(self->Next) => (-> self Next)
1080	(self->Next obj.field) => ((-> self Next) (. obj field))
1081
1082	Similar to
1083	((identity identity) 42) => 42 in Clojure
1084
1085	ASDL
1086	(Node left:(. x4beef2))
1087	(Node left !x4beef2)
1088
1089	# Ambiguous because value can be identifier.
1090	# We have to look ahead to and see if there's a colon :
1091	field =
1092	Identifier ':' value
1093	\| value
1094
1095	record = '(' head field* ')'
1096
1097	- Identifier \| Symbol are treated the same, it's a side effect of
1098	the lexing style
1099	- do positional args come before named args
1100	- () is invalid? Use [] for empty list
1101	"""
1102	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1103
1104	items = [] # type: List[nvalue_t]
1105
1106	self._Next()
1107	if self.tok_id == Id.J8_RParen:
1108	self._Next()
1109	return nvalue.List(items)
1110
1111	#log('TOK %s', Id_str(self.tok_id))
1112	while self.tok_id != Id.J8_RParen:
1113	items.append(self._ParseNil8())
1114	#log('TOK 2 %s', Id_str(self.tok_id))
1115
1116	self._Eat(Id.J8_RParen)
1117
1118	return nvalue.List(items)
1119
1120	def _ParseList8(self):
1121	# type: () -> nvalue_t
1122	"""
1123	List8 = '[' value* ']'
1124
1125	No commas, not even optional ones for now.
1126	"""
1127	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1128
1129	items = [] # type: List[nvalue_t]
1130
1131	self._Next()
1132	if self.tok_id == Id.J8_RBracket:
1133	self._Next()
1134	return nvalue.List(items)
1135
1136	#log('TOK %s', Id_str(self.tok_id))
1137	while self.tok_id != Id.J8_RBracket:
1138	items.append(self._ParseNil8())
1139	#log('TOK 2 %s', Id_str(self.tok_id))
1140
1141	self._Eat(Id.J8_RBracket)
1142
1143	return nvalue.List(items)
1144
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, starting at the current token.

    Dispatch on the current token:

    - '(' and '[' are handled by _ParseRecord() and _ParseList8()
    - Null, Bool, Int, Float and String tokens become the matching nvalue
    - Identifier, Operator, ':' and ',' tokens become an nvalue.Symbol
      holding the token's text

    If the token right after that value is an Operator, ':' or ',', it is
    treated as an infix operator.  The right operand is parsed recursively,
    and the result is the 3-element list [operator, value, operand].  For
    example:

        key: "value"  ->  (: key "value")

    Raises the error created by self._ParseError() when the input ends, or
    when the current token can't start a value.
    """
    first_id = self.tok_id

    if first_id == Id.J8_LParen:
        obj = self._ParseRecord()  # type: nvalue_t

    elif first_id == Id.J8_LBracket:
        obj = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif first_id == Id.J8_Null:
        self._Next()
        obj = nvalue.Null

    elif first_id == Id.J8_Bool:
        # The first character of the token is enough to tell the two
        # booleans apart.  Read it before advancing.
        obj = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif first_id == Id.J8_Int:
        int_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Int(int(int_text))

    elif first_id == Id.J8_Float:
        float_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Float(float(float_text))

    elif first_id == Id.J8_String:
        # self.decoded belongs to the current token, so read it before
        # advancing.
        obj = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif first_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                      Id.J8_Comma):
        # unquoted "word" treated like a string
        sym_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Symbol(sym_text)

    elif first_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # any other token can't start a value
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands alone.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return obj

    # key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

    self._Next()
    rhs = self._ParseNil8()
    infix = nvalue.List([op, obj, rhs])  # type: nvalue_t
    return infix
1217
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the whole input as a single NIL8 value.

    Anything left over after that value is an error.

    Raises:
      error.Decode
    """
    self._Next()  # advance to the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1228
1229
class J8LinesParser(_Parser):
    """Decode a newline-separated string into a list of lines.

    The format is specified with a grammar, to preserve location info and to
    reduce allocations.  (But note that unquoted_line is more like a LOOP
    than it is grammatical.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until end token, but REMOVE trailing WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on a line are disallowed, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

    The "" and u'' are not decoded as strings, because the line started
    with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty
    cells?  Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # NOTE(review): the True flag presumably selects J8 (rather than
        # plain JSON) lexing -- confirm against _Parser.__init__
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token Id and span, labeled by s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line of input; may append a line to 'out'.

        Empty lines append nothing.  A quoted line appends the decoded J8
        string.  An unquoted line appends the raw text from its first
        Lit_Chars token up to the end of the line, without trailing
        whitespace.
        """
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - nothing to append
        if self.tok_id == Id.J8_Newline or self.tok_id == Id.Eol_Tok:
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                self._NextForLines()  # past the end of the line
                return

            raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                   Id_str(self.tok_id))

        # The only remaining possibility is an unquoted line
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # '  unquoted "" text on line  '  # read every token until end
        line_begin = self.start_pos
        while True:
            # Remember the token we're about to leave, so that trailing
            # whitespace can be stripped.
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id == Id.J8_Newline or self.tok_id == Id.Eol_Tok:
                break

        # By default the line runs up to the end token.  If the token just
        # before it was whitespace, stop where that whitespace began.
        line_end = self.start_pos
        if last_id == Id.WS_Space:
            line_end = last_start

        out.append(self.s[line_begin:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Decode every line of the input.

        Raises:
          error.Decode
        """
        result = []  # type: List[str]

        self._NextForLines()
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive check: the loop above only exits on Id.Eol_Tok.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1349
1350
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode the string 's' as J8 Lines, returning the list of lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1365
1366
1367	# vim: sw=4