OILS / data_lang / j8.py View on Github | oilshell.org

1330 lines, 656 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is value_str() applied to the value's tag, with dot=False.
    """
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies this particular object.

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_addr = id(val)
        return heap_addr
72
73
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str; HeapValueId(val) for every
      other kind of value.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            return -1
        else:
            return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the heap ID of a value for display; used by pp value (42) and = 42

    Returns:
      ' 0x<lowercase hex>' (with a leading space) when ValueId() yields an ID,
      or the empty string when ValueId() yields -1.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode

    Returns:
      A string of 1 to 4 bytes.
    """
    num_cont_bytes = 0

    # Pick the number of continuation bytes from the size of the code point
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits up, so the list is
    # built back to front and reversed at the end.
    reversed_bytes = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        reversed_bytes.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Leading byte: length marker bits plus whatever bits of 'code' are left
    marker = 0x1E << (6 - num_cont_bytes)
    payload = code & (0x3F >> num_cont_bytes)
    reversed_bytes.append(marker | payload)
    reversed_bytes.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in reversed_bytes]
    return ''.join(chars)
140
141
# Bit flags for the 'options' argument of _Print() / InstancePrinter.
# Combine them with |

# On a List/Dict cycle, print a marker with the object ID instead of raising
# error.Encode
SHOW_CYCLES = 1 << 1
# Print non-data objects like Eggex as <Eggex 0xff> instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON is lossy; passed through to pyj8.WriteString()
INF_NAN_ARE_NULL = 1 << 4  # for JSON: print inf, -inf and nan as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two values must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, LOSSY_JSON, etc.
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with the LOSSY_JSON and INF_NAN_ARE_NULL flags.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_options = SHOW_CYCLES | SHOW_NON_DATA

    # TODO: Omit type at top level
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=line_options)

    encoded = line_buf.getvalue()
    f.write(encoded)
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to 'buf', quoted unless the caller allows bare strings.

    For pp proc, etc.

    Args:
      unquoted_ok: if True, and fastfunc.CanOmitQuotes(s) agrees, write 's'
        as is.
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the string printed on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the string printed on one line, with the LOSSY_JSON flag.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
226
227
# DFS traversal state, stored per heap ID in InstancePrinter.visited
UNSEEN = 0  # default when the ID isn't in the dict
EXPLORING = 1  # container is being printed; seeing it again means a cycle
FINISHED = 2  # container was fully printed; it may be printed again
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to a BufWriter.  List and Dict values are tracked by heap ID
    while they're being printed, so that object cycles are either shown
    (SHOW_CYCLES) or reported with error.Encode.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where output is written
          indent: number of spaces per nesting level, or -1 for everything on
            one line
          options: bit flags like SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
            INF_NAN_ARE_NULL
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is a DFS state: EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indent for an item inside a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indent for the closing bracket of a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...], recursing through Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                # The separator is written BEFORE every item but the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)

            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as {key: value, ...}, recursing through Print()."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                # The separator is written BEFORE every pair but the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the start of the wrapper object: {"type": T, "data":

        Args:
          type_str: already-quoted type name, INCLUDING the trailing comma,
            e.g. '"BashArray",'
          level: nesting level of the wrapper object
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing } of the wrapper object.

        Args:
          level: nesting level of the "data" object, i.e. ONE MORE than the
            level that was passed to _PrintBashPrefix()
        """
        level -= 1
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as {"type": "BashArray", "data": {index: str}}.

        Entries that are None are omitted.
        """

        self._PrintBashPrefix('"BashArray",', level)

        # The "data" object is one level deeper than the wrapper.  This must
        # happen for the empty case too, because _PrintBashSuffix() always
        # subtracts one.  Otherwise the closing brace of an empty array is
        # indented one level too little, and the indent is negative at level 0.
        level += 1

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                # The array index is written as a string key
                pyj8.WriteString(str(i), self.options, self.buf)
                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as {"type": "BashAssoc", "data": {key: str}}."""

        self._PrintBashPrefix('"BashAssoc",', level)

        # See the comment in _PrintBashArray(): the level is bumped for the
        # empty case too, to match the decrement in _PrintBashSuffix().
        level += 1

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write a value to self.buf.

        Args:
          val: the value to print
          level: nesting level, used for indentation

        Raises:
          error.Encode: on a List/Dict cycle when SHOW_CYCLES isn't set, or on
            a value that isn't handled below when SHOW_NON_DATA isn't set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # TODO: New format, which should be consistent with pretty
            # printing.  pp line (x) supports BashArray and BashAssoc, e.g.
            # for spec tests.

            # - BashAssoc is Dict[str, str]
            #   (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #   (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
564
565
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; not read by any method yet
        """
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced

        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput detected for stdout.

        Note that 'buf' is not used yet.
        """

        # Or print to stderr?
        f = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, f)

        # Then print those with ASDL
        pass
618
619
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the whole input to lex
          is_j8: if False, Next() rejects '' strings, comments and the j""
            prefix
          lang_str: language name that's interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless the token is a string, which comes from _DecodeString().

        Raises:
          error.Decode
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Strict JSON rejects syntax that only J8 has
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token that opened the string; it selects which string
            lexer is used
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode
        """

        # Each iteration lexes one part of the string and appends its decoded
        # form to self.decoded.  The loop ends at the closing quote, or with
        # an error.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip the first 3 chars and the last one, leaving hex digits
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip the first 2 chars, leaving hex digits
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # 4 hex digits each, at offsets 2 and 8 within the token
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
814
815
class _Parser(object):
    """Token state and helpers shared by the parsers that subclass this."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token: its ID, its span in self.s, and its value if
        # it's a string
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # The end of the previous token, ignored or not, is where this
            # token starts.
            self.start_pos = self.end_pos

            tok_id, end_pos, decoded = self.lexer.Next()
            self.tok_id = tok_id
            self.end_pos = end_pos
            self.decoded = decoded

            if tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                          Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is 'tok_id', then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, no tokens are skipped.
        """
        self.start_pos = self.end_pos

        tok_id, end_pos, decoded = self.lexer.NextForLines()
        self.tok_id = tok_id
        self.end_pos = end_pos
        self.decoded = decoded

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error that spans the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
866
867
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that produces value_t.  Call ParseValue() once
    per instance.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the whole message to parse
          is_j8: True to accept JSON8, False for strict JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value

        Returns:
          (decoded key, value)
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key appears more than once, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, starting at the current token.

        Afterward, the current token is the one that follows the value.

        Raises:
          error.Decode
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char tells 'true' from 'false'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that the error location is the
            # integer itself.  _ParseError() uses the span of the current
            # token, and after _Next() that's the token following the
            # integer.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on invalid input, including anything other than
            ignored tokens after the value
        """
        self._Next()
        obj = self._ParseValue()

        # The token after the value must start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1007
1008
class Nil8Parser(_Parser):
    """
    Parser for NIL8, which produces nvalue_t.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    # Disabled code, kept for reference
    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list

        NOTE: as written, this method accepts () and returns an empty
        nvalue.List for it.
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # An unterminated record ends the loop with the EOF error raised by
        # _ParseNil8()
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, starting at the current token.

        If the value is followed by an operator, colon or comma token, then
        the result is the infix expression rewritten as a prefix list:

            key: "value"  ->  (: key "value")

        The right operand is parsed by a recursive call.

        Raises:
          error.Decode
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char tells 'true' from 'false'
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        # Is the value we just parsed the left operand of an infix expression?
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            part = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(part)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises:
          error.Decode: on invalid input, including when the value isn't
            followed by Eol_Tok
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1191
1192
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    The format is specified as a grammar, which preserves location info and
    reduces allocations.  (Note that unquoted_line behaves more like a LOOP
    than a grammatical rule.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on a line are disallowed, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

        The "" and u'' are not decoded strings, because the line started
        with Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the label 's' with the current token and span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line; append its value to 'out' unless it is empty.

        Raises:
          error.Decode if a quoted J8 string is followed by more text.
        """
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            # empty_line: contributes nothing to 'out'
            pass

        elif self.tok_id == Id.J8_String:
            # j8_line: the decoded string is the value of the line
            out.append(self.decoded)
            self._NextForLines()

            # Optional trailing whitespace
            if self.tok_id == Id.WS_Space:
                self._NextForLines()

            # Only the end of the line may follow a quoted string
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

        elif self.tok_id == Id.Lit_Chars:
            # unquoted_line, e.g. ' unquoted "" text on line '
            # Read every token up to the end token.
            line_begin = self.start_pos

            # Remember the last token seen before the end token, so that
            # trailing whitespace can be stripped.
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows \r
            # in the middle of expressions, treating it as whitespace.)
            while self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                last_id = self.tok_id
                last_start = self.start_pos
                self._NextForLines()

            # If the line ended in whitespace, cut it off; otherwise the
            # line runs up to the start of the end token.
            line_end = (last_start
                        if last_id == Id.WS_Space else self.start_pos)

            out.append(self.s[line_begin:line_end])

        else:
            raise AssertionError(Id_str(self.tok_id))

        # Every successful branch finishes by moving past the end token.
        self._NextForLines()

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the list of line values.

        Raises:
          error.Decode
        """
        self._NextForLines()

        result = []  # type: List[str]
        while True:
            if self.tok_id == Id.Eol_Tok:
                break
            self._ParseLine(result)

        # Defensive check, retained from the original loop structure.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1312
1313
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into lines by running J8LinesParser over it.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    return parser.Parse()
1328
1329
1330# vim: sw=4