OILS / data_lang / j8.py View on Github | oilshell.org

1368 lines, 684 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the name of a value's type, for type errors shown in the UI.

    This duplicates ui.ValType() so that this module doesn't depend on ui.
    """
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the given heap object.

        This Python version uses id(), which returns the object's address and
        can need up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles.

    Primitive types like Int and Float have no such notion: they are
    immutable values that are copied and compared by value, so they get -1.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These won't live on the heap if we switch to tagged pointers.
            # Str is treated conservatively: once there is a small string
            # optimization some strings will be values, so assume all are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's heap ID as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)  # -1 means "no identity"
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one unicode code point as a string of utf-8 bytes.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII: a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # There is deliberately no check for code <= 0x10FFFF here:
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced lowest 6 bits first, reversed below.
    encoded = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6

    # Lead byte: length marker in the high bits, remaining payload below it.
    lead_marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    encoded.append(lead_marker | (code & payload_mask))
    encoded.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    return ''.join([chr(byte & 0xFF) for byte in encoded])
140
141
# Bit flags for the `options` argument of _Print() / InstancePrinter.
# Combine them with |.

# Print a back reference like [ --> 0x123 ] or { --> 0x123 } when a List or
# Dict is part of an object cycle, instead of raising error.Encode
SHOW_CYCLES = 1 << 1
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy; passed through to pyj8.WriteString()
INF_NAN_ARE_NULL = 1 << 4  # for JSON: print inf, -inf and NaN as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two values must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the printer flags, e.g. SHOW_CYCLES
    """
    InstancePrinter(buf, indent, options).Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as JSON8.  For json8 write (x) and toJson8()

    No printer options are set.  Caller must handle error.Encode
    """
    no_options = 0
    _Print(val, buf, indent, options=no_options)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as JSON.  For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # Cycles and non-data objects are shown rather than rejected, so
    # error.Encode should be impossible here.
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, SHOW_CYCLES | SHOW_NON_DATA)

    encoded = line_buf.getvalue()
    f.write(encoded)
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return a one-line representation of val, like Python's repr.

        Unused: this definition is disabled by the enclosing `if 0:`.
        """
        # Cycles and non-data objects are shown rather than rejected, so
        # error.Encode should be impossible here.
        repr_buf = mylib.BufWriter()
        _Print(val, repr_buf, -1, SHOW_CYCLES | SHOW_NON_DATA)
        return repr_buf.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as a J8 string.  For pp proc, etc.

    With unquoted_ok, strings that fastfunc.CanOmitQuotes() accepts are
    written as-is, without quotes.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a one-line J8 string.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b
    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a one-line JSON string, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b
    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, LOSSY_JSON)
    return out.getvalue()
238
239
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default for containers that have no entry yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # fully printed once; may be printed again
244
245
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to a BufWriter.  With indent == -1 everything is
    written on one line, with no newlines or spaces between tokens.
    Otherwise each nesting level is indented by `indent` spaces.

    List and Dict values are tracked while printing, so that object cycles
    are either shown (SHOW_CYCLES) or rejected with error.Encode.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: number of spaces to indent, or -1 for everything on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
                   INF_NAN_ARE_NULL
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # DFS state for cycle detection
        # Key is HeapValueId(val) of a List or Dict
        # Value is EXPLORING or FINISHED; a missing key means UNSEEN
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at the given level."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at the given level."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...], recursing into each item."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as {"key": value, ...}, recursing into each value."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object used for bash-style arrays.

        Writes {"type": <type_str> "data": and leaves the data value to the
        caller.  type_str is written verbatim, so it must already contain its
        quotes and the trailing comma.
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a wrapper object.

        The data is an object that maps each index, written as a string, to
        its string value.
        """

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapper object.

        The data is an object that maps each index, written as a string, to
        its string value.  Entries that are None are skipped.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't test i != 0 for the comma, because entries may be skipped
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapper object.

        The data is an object that maps string keys to string values.
        """

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print any value at the given nesting level.

        Raises:
          error.Encode: if a List or Dict is in an object cycle and
            SHOW_CYCLES isn't set, or if the value isn't data and
            SHOW_NON_DATA isn't set.
        """

        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
602
603
class PrettyPrinter(object):
    """Placeholder for a pretty printer; unused right now.

    It could enhance the = operator, with output going to a polymorphic
    ColorOutput.

    Planned features, like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None

        # Filled by a first pass: object ID -> number of times referenced.
        #
        # This could be an optimized set, i.e. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict:
        #self.unique_objs = mylib.UniqueObjects()
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing.

        TODO: first convert to hnode.asdl types?

        Although we might want
        hnode.AlreadyShown = (str type, int unique_id)
        """

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Run PrettyTree() on val with console output for stdout.

        buf is not used yet.
        """
        # Or print to stderr?
        out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, out)

        # TODO: then print those with ASDL
656
657
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return a tuple (token ID, end position, decoded
    string).  The decoded string is None except for string tokens, which are
    returned as Id.J8_String together with their decoded contents.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects J8-only syntax (single quoted
                 strings, comments, j"" prefix)
          lang_str: language name that's interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) error.Decode for the span self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax when is_j8 is False, and for errors
            inside a string (see _DecodeString)
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Give specific errors for J8 syntax that isn't in JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: lex and decode the whole string as a single token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or ASCII control chars on an
            unquoted line, and for errors inside a string (see _DecodeString)
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: token ID of the opening quote, which selects the lexer
                   mode and whether \\yff escapes are allowed
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an illegal code
            point.  self.pos is not advanced in that case, so the error span
            starts where the string token started.
        """

        while True:
            # "" and j"" strings use the JSON string lexer; b'' and u''
            # strings use the J8 one
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: return everything accumulated in self.decoded
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # the char after the backslash selects the escape
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # two escapes, each a 2-char prefix followed by 4 hex digits
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # NOTE(review): no surrogate range check here, unlike
                # Char_UBraced -- presumably intentional; confirm
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
852
853
854class _Parser(object):
855
856 def __init__(self, s, is_j8):
857 # type: (str, bool) -> None
858 self.s = s
859 self.is_j8 = is_j8
860 self.lang_str = "J8" if is_j8 else "JSON"
861
862 self.lexer = LexerDecoder(s, is_j8, self.lang_str)
863 self.tok_id = Id.Undefined_Tok
864 self.start_pos = 0
865 self.end_pos = 0
866 self.decoded = '' # decoded J8 string
867
868 def _Next(self):
869 # type: () -> None
870
871 # This isn't the start of a J8_Bool token, it's the END of the token before it
872 while True:
873 self.start_pos = self.end_pos
874 self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
875 if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
876 Id.Ignored_Comment):
877 break
878 # TODO: add Ignored_Newline to count lines, and show line numbers
879 # in errors messages. The position of the last newline and a token
880 # can be used to calculate a column number.
881
882 #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')
883
884 def _Eat(self, tok_id):
885 # type: (Id_t) -> None
886
887 if self.tok_id != tok_id:
888 #log('position %r %d-%d %r', self.s, self.start_pos,
889 # self.end_pos, self.s[self.start_pos:self.end_pos])
890 raise self._ParseError("Expected %s, got %s" %
891 (Id_str(tok_id), Id_str(self.tok_id)))
892 self._Next()
893
894 def _NextForLines(self):
895 # type: () -> None
896 """Like _Next, but use the J8 Lines lexer."""
897 self.start_pos = self.end_pos
898 self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()
899
900 def _ParseError(self, msg):
901 # type: (str) -> error.Decode
902 return error.Decode(msg, self.s, self.start_pos, self.end_pos,
903 self.lexer.cur_line_num)
904
905
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that produces value_t.  Each _Parse* method
    expects the current token to be the first token of what it parses, and
    leaves the current token just after it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: True to accept J8 syntax, False for pure JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, starting at the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Only the first char is checked: 't' means true
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() reads the current
            # token's span, so after _Next() the error would be reported at
            # the token that follows the integer.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token is the one after the value.  Unless it starts at
        # the end of the input, there's trailing input.
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1045
1046
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces nvalue_t.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: passed through to _Parser
        """
        _Parser.__init__(self, s, is_j8)

    # Disabled: token lookahead isn't needed by the current grammar
    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now

            Returns the ID of the next token that isn't a space, newline or
            comment, without changing the parser state.
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list

        As implemented, the items between '(' and ')' are parsed with
        _ParseNil8() and returned as nvalue.List.  '()' is currently
        accepted, and gives an empty list.
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            self._Next()
            return nvalue.List(items)

        # This terminates on EOF because _ParseNil8() raises on Eol_Tok
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        # This terminates on EOF because _ParseNil8() raises on Eol_Tok
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, starting at the current token.

        First a single value is parsed: a record, a list, a primitive or a
        symbol.  If the token after it is an operator, colon or comma, the
        result is the infix expression in prefix form:

            key: "value"  ->  (: key "value")

        The right operand is parsed recursively, so a chain of infix
        operators nests to the right.

        Raises:
          error.Decode: on EOF or an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # Only the first char is checked: 't' means true
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            # NOTE(review): this uses int(), while Parser._ParseValue uses
            # mops.FromStr() and reports integers that are too big -- confirm
            # whether the difference is intended
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        # obj is the left operand if an infix operator follows
        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            part = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(part)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1229
1230
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

        The "" and u'' are not a decoded string, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """
        Args:
          s: the text to split into lines
        """
        # NOTE(review): the meaning of the True flag is defined by
        # _Parser.__init__, which isn't visible here -- confirm
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log label 's' with the current token and its span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line; may append a line to 'out'.

        Args:
          out: receives the decoded J8 string, or the unquoted text with
               surrounding whitespace removed.  Empty lines append nothing.

        Raises:
          The error built by self._ParseError(), when a J8 string is followed
          by anything other than optional whitespace and the end of the line.
        """
        # Leading whitespace is skipped for every kind of line
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # empty_line - return without appending anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # j8_line - a single quoted string on the line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past the end token
            return

        # unquoted_line
        if self.tok_id == Id.Lit_Chars:
            # '   unquoted "" text on line   '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end token, for stripping
                # trailing whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # the end token starts here

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        # No other token kind is expected at the start of a line
        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return its non-empty lines.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        lines = []  # type: List[str]

        # This loop has no 'break', so it can only finish when the current
        # token is Id.Eol_Tok.  A "trailing input" check after it could never
        # fire, so there isn't one.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1350
1351
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into J8 Lines.  Used by @(echo split command sub)

    This is a convenience wrapper around J8LinesParser(s).Parse().

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1366
1367
1368# vim: sw=4