OILS / data_lang / j8.py View on Github | oilshell.org

1356 lines, 679 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the name of a value's type, for type errors shown in the UI.

    The name is derived from the value's tag, without the dotted prefix.
    """
    type_name = value_str(val.tag(), dot=False)
    return type_name
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying a value object (Python-only version).

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            pass
        else:
            result = HeapValueId(val)
    return result
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID for display; used by pp value (42) and = 42

    Returns ' 0x<hex>' with a leading space, or '' for values without an ID.
    """
    heap_id = ValueId(val)  # -1 means the value has no identity
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return the UTF-8 encoded bytes for a single unicode code point.

    Values up to 0x7F are one byte; larger values get a lead byte followed by
    1, 2 or 3 continuation bytes.  Anything above 0xFFFF is encoded with 3
    continuation bytes, without a range check.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Lead byte: a marker prefix plus the bits above all continuation bytes
    high_bits = code >> (6 * num_cont_bytes)
    lead = (0x1E << (6 - num_cont_bytes)) | (high_bits &
                                             (0x3F >> num_cont_bytes))

    # mod 256 because Python ints don't wrap around!
    chars = [chr(lead & 0xFF)]  # type: List[str]

    # Continuation bytes carry 6 bits each, most significant group first
    shift = 6 * (num_cont_bytes - 1)
    while shift >= 0:
        chars.append(chr(0x80 | ((code >> shift) & 0x3F)))
        shift -= 6

    return ''.join(chars)
140
141
# Option bit flags for printing; combine them with |
SHOW_CYCLES = 1 << 1  # show a cycle as [ --> 0x... ] or { --> 0x... } instead of raising
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: infinities and NaN are printed as null

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags passed through to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val with no option flags; for json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val as JSON; for json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline; for pp line (x)"""

    # error.Encode should be impossible - we show cycles and non-data
    display_options = SHOW_CYCLES | SHOW_NON_DATA

    # TODO: Omit type at top level
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=display_options)

    f.write(line_buf.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf, quoted unless quotes may be omitted; for pp proc, etc.

    Args:
      unquoted_ok: if True, and fastfunc.CanOmitQuotes(s) says so, s is
                   written as-is
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded on one line with default options.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded on one line with the LOSSY_JSON option.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
226
227
# DFS traversal state of a List or Dict, stored in InstancePrinter.visited
UNSEEN = 0  # not in the visited dict yet (used as the lookup default)
EXPLORING = 1  # currently being printed; reaching it again means a cycle
FINISHED = 2  # printed completely; it may be printed again
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to the BufWriter passed to the constructor.  With indent ==
    -1 everything is written on one line; otherwise items are put on their own
    lines, indented by `indent` spaces per nesting level.

    Lists and Dicts are tracked by heap ID so that object cycles are either
    shown (SHOW_CYCLES) or rejected with error.Encode.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is a DFS traversal state: EXPLORING or FINISHED.  A missing
        # key means UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...], recursing into each item."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as {key: value, ...}, recursing into each value."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object used for bash arrays: {"type": T, "data":

        Args:
          type_str: already-quoted type name INCLUDING the trailing comma,
                    e.g. '"BashArray",'
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a wrapper object whose data maps index strings to strings."""

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                # Keys are converted with mops.ToStr() and written as strings
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapper object whose data maps index strings to strings.

        Entries that are None are skipped.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't use i != 0 to decide on the comma, because None entries
            # are skipped
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapper object whose data maps strings to strings."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val to self.buf, recursing into containers.

        Args:
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict in an object cycle when SHOW_CYCLES
                        isn't set, or for a non-data value when SHOW_NON_DATA
                        isn't set
        """

        # self.indent == -1 is a special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    # We reached a List that we're still in the middle of
                    # printing
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    # We reached a Dict that we're still in the middle of
                    # printing
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
590
591
class PrettyPrinter(object):
    """Pretty printer for values.  Unused right now, but could enhance the =
    operator.

    Output goes to a polymorphic ColorOutput.

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a ref count dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # For a first pass: object ID -> number of times it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: doesn't print anything yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Run PrettyTree() on val with a console output wrapping stdout.

        Note that `buf` isn't used.
        """

        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
644
645
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string.

    Next() and NextForLines() return a tuple (token ID, end position, decoded
    string).  The third item is None, except for string tokens, which are
    returned as Id.J8_String along with their decoded contents.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects single quoted strings, comments, and
                 the j"" prefix
          lang_str: name of the language, interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # start position of the next token to lex
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error.Decode spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for constructs that aren't allowed in JSON when is_j8
                        is False, and for errors while decoding a string
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: lex and decode the rest of the string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char on an
                        unquoted line, and for errors while decoding a string
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: ID of the opening quote token; it selects the string lexer
                   and whether \\yff escapes are allowed
          str_pos: position right after the opening quote

        Raises:
          error.Decode: on EOF, a bad backslash escape, an unescaped ASCII
                        control char, invalid UTF-8, or a disallowed escape
        """

        # Each iteration lexes one part of the string, and appends its decoded
        # form to self.decoded.  The closing quote ends the loop.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two \uXXXX escapes in a row: 4 hex digits each
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Hex digits after \u
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
840
841
class _Parser(object):
    """State and token helpers shared by the parsers in this file.

    It holds the current token: its ID, its start and end positions in the
    input, and the decoded string that the lexer returned with it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # The new token starts where the previous one ENDED
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break

        # TODO: The position of the last newline and a token can be used to
        # calculate a column number for error messages.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token has a different ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        expected = Id_str(tok_id)
        actual = Id_str(self.tok_id)
        raise self._ParseError("Expected %s, got %s" % (expected, actual))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.  No tokens are skipped."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error.Decode located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
892
893
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that builds value_t instances.  The entry
    point is ParseValue().
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: passed to _Parser; selects "J8" vs. "JSON" in error messages
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one key-value pair of a Dict.

        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value is the one that's kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token, and advance past it.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
                        mops.FromStr() can't convert
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Only the first char of the token is examined
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that an error is located at the
            # integer token, not at the token that follows it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            # Save the decoded string, because _Next() overwrites it
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on a lexing or parsing error, or if there's input
                        left over after the value
        """
        self._Next()
        obj = self._ParseValue()

        # start_pos is where the token after the value starts
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1033
1034
1035class Nil8Parser(_Parser):
1036 """
1037 Tokens not in JSON8:
1038 LParen RParen Symbol
1039
1040 Tokens not in JSON, but in JSON8 and NIL8:
1041 Identifier (unquoted keys)
1042 Ignored_Comment
1043 """
1044
1045 def __init__(self, s, is_j8):
1046 # type: (str, bool) -> None
1047 _Parser.__init__(self, s, is_j8)
1048
1049 if 0:
1050
1051 def _LookAhead(self):
1052 # type: () -> Id_t
1053 """
1054 Don't need this right now
1055 """
1056 end_pos = self.end_pos # look ahead from last token
1057 while True:
1058 tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1059 if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1060 Id.Ignored_Comment):
1061 break
1062 return tok_id
1063
1064 def _ParseRecord(self):
1065 # type: () -> nvalue_t
1066 """
1067 Yaks
1068 (self->Next) => (-> self Next)
1069 (self->Next obj.field) => ((-> self Next) (. obj field))
1070
1071 Similar to
1072 ((identity identity) 42) => 42 in Clojure
1073
1074 ASDL
1075 (Node left:(. x4beef2))
1076 (Node left !x4beef2)
1077
1078 # Ambiguous because value can be identifier.
1079 # We have to look ahead to and see if there's a colon :
1080 field =
1081 Identifier ':' value
1082 | value
1083
1084 record = '(' head field* ')'
1085
1086 - Identifier | Symbol are treated the same, it's a side effect of
1087 the lexing style
1088 - do positional args come before named args
1089 - () is invalid? Use [] for empty list
1090 """
1091 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1092
1093 items = [] # type: List[nvalue_t]
1094
1095 self._Next()
1096 if self.tok_id == Id.J8_RParen:
1097 self._Next()
1098 return nvalue.List(items)
1099
1100 #log('TOK %s', Id_str(self.tok_id))
1101 while self.tok_id != Id.J8_RParen:
1102 items.append(self._ParseNil8())
1103 #log('TOK 2 %s', Id_str(self.tok_id))
1104
1105 self._Eat(Id.J8_RParen)
1106
1107 return nvalue.List(items)
1108
1109 def _ParseList8(self):
1110 # type: () -> nvalue_t
1111 """
1112 List8 = '[' value* ']'
1113
1114 No commas, not even optional ones for now.
1115 """
1116 assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1117
1118 items = [] # type: List[nvalue_t]
1119
1120 self._Next()
1121 if self.tok_id == Id.J8_RBracket:
1122 self._Next()
1123 return nvalue.List(items)
1124
1125 #log('TOK %s', Id_str(self.tok_id))
1126 while self.tok_id != Id.J8_RBracket:
1127 items.append(self._ParseNil8())
1128 #log('TOK 2 %s', Id_str(self.tok_id))
1129
1130 self._Eat(Id.J8_RBracket)
1131
1132 return nvalue.List(items)
1133
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, starting at the current token.

    The current token selects what is parsed: a record, a list, a primitive
    (null, bool, int, float, string), or an unquoted word that becomes an
    nvalue.Symbol.

    If the token FOLLOWING that value is an operator, colon, or comma, the
    value is the left operand of an infix expression, which is rewritten in
    prefix form as an nvalue.List of [operator, left, right]:

        key: "value"  ->  (: key "value")

    The right operand comes from a recursive call, so chained infix
    expressions nest to the right.

    Raises the error built by self._ParseError() on EOF or an invalid token.
    """
    tok = self.tok_id

    if tok == Id.J8_LParen:
        node = self._ParseRecord()  # type: nvalue_t

    elif tok == Id.J8_LBracket:
        node = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok == Id.J8_Null:
        self._Next()
        node = nvalue.Null

    elif tok == Id.J8_Bool:
        # Inspect the first character of the token BEFORE advancing
        node = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif tok == Id.J8_Int:
        digits = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Int(int(digits))

    elif tok == Id.J8_Float:
        literal = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Float(float(literal))

    elif tok == Id.J8_String:
        # self.decoded is read BEFORE advancing to the next token
        node = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif (tok == Id.J8_Identifier or tok == Id.J8_Operator or
          tok == Id.J8_Colon or tok == Id.J8_Comma):
        # An unquoted "word" is kept as its source text
        word = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Symbol(word)

    elif tok == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # Any token not handled above, e.g. Id.Unknown_Tok, Id.J8_RParen
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # Not followed by an infix operator: the value stands alone.
    follow = self.tok_id
    if (follow != Id.J8_Operator and follow != Id.J8_Colon and
            follow != Id.J8_Comma):
        return node

    # key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
    self._Next()
    rhs = self._ParseNil8()
    infix = nvalue.List([op, node, rhs])  # type: nvalue_t
    return infix
1206
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as exactly one NIL8 value.

    Raises error.Decode, including when anything other than end-of-input
    follows the value.
    """
    # Advance to the first token; _ParseNil8() dispatches on self.tok_id
    self._Next()
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1217
1218
class J8LinesParser(_Parser):
    """Split a string containing newlines into a list of decoded lines.

    The format is specified with a grammar, which preserves location info and
    reduces allocations.  (Note that unquoted_line behaves more like a LOOP
    than a grammatical rule.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on one line are disallowed, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

        The "" and u'' are not decoded as strings, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # The meaning of the second argument is defined by _Parser.__init__
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log the label 's' with the current token and span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _AtEndOfLine(self):
        # type: () -> bool
        """Is the current token the 'end' rule: J8_Newline or Eol_Tok?"""
        return self.tok_id == Id.J8_Newline or self.tok_id == Id.Eol_Tok

    def _ParseQuotedLine(self, out):
        # type: (List[str]) -> None
        """j8_line: the current token is a J8_String; append its decoded value.

        Raises the error built by self._ParseError() if anything other than
        optional whitespace sits between the string and the end of the line.
        """
        out.append(self.decoded)
        self._NextForLines()

        if self.tok_id == Id.WS_Space:  # trailing whitespace
            self._NextForLines()

        if not self._AtEndOfLine():
            raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                   Id_str(self.tok_id))

        self._NextForLines()  # past newline

    def _ParseUnquotedLine(self, out):
        # type: (List[str]) -> None
        """unquoted_line: the current token is Lit_Chars; append the raw text.

        Every token up to the end of the line is part of the string, e.g.

            ' unquoted "" text on line '

        except that a WS_Space token directly before the end is dropped.
        """
        begin = self.start_pos

        # Track the token just before the terminator, for stripping whitespace.
        #
        # It would be nicer if "middle" Id.WS_Space tokens didn't have \r, but
        # we're sticking with the JSON spec definition of whitespace.  (As
        # another data point, CPython on Unix allows \r in the middle of
        # expressions, treating it as whitespace.)
        last_id = self.tok_id
        last_start = self.start_pos
        self._NextForLines()
        while not self._AtEndOfLine():
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

        if last_id != Id.WS_Space:
            end = self.start_pos  # the text runs up to the terminator
        else:
            end = last_start  # remove trailing whitespace

        out.append(self.s[begin:end])

        self._NextForLines()  # past newline

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out' """
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        tok = self.tok_id
        if tok == Id.J8_Newline or tok == Id.Eol_Tok:
            # Empty line - consume the terminator, append nothing
            self._NextForLines()
        elif tok == Id.J8_String:
            self._ParseQuotedLine(out)
        elif tok == Id.Lit_Chars:
            self._ParseUnquotedLine(out)
        else:
            raise AssertionError(Id_str(tok))

    def Parse(self):
        # type: () -> List[str]
        """Parse all lines and return them in order.

        Raises error.Decode.
        """
        self._NextForLines()

        result = []  # type: List[str]
        while True:
            if self.tok_id == Id.Eol_Tok:
                break
            self._ParseLine(result)

        # The loop above only exits at Eol_Tok, so this is a defensive check
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1338
1339
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode 's' as J8 Lines.  Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1354
1355
1356# vim: sw=4