OILS / data_lang / j8.py View on Github | oilshell.org

1326 lines, 653 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name is produced by value_str() on the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying a heap-allocated value.

        This Python version returns id(val), which is the object's address
        and can be up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    result = -1  # the answer for primitives

    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            result = HeapValueId(val)

    return result
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID for display.  Used by pp value (42) and = 42

    Returns:
      ' 0x<lowercase hex>' with a leading space, or '' when ValueId()
      returns -1.
    """
    obj_id = ValueId(val)
    if obj_id == -1:  # no identity to show
        return ''
    return ' 0x%s' % mylib.hex_lower(obj_id)
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Anything above 0xFFFF is encoded as 4 bytes;
            there is no range check here (see the comment below).
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII: a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits upward, so the list
    # is built back to front and reversed at the end.
    rev_bytes = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        rev_bytes.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Leading byte: length marker bits, then whatever bits of 'code' are left
    lead_mask = 0x3F >> num_cont_bytes
    lead = (0x1E << (6 - num_cont_bytes)) | (code & lead_mask)
    rev_bytes.append(lead)
    rev_bytes.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in rev_bytes]
    return ''.join(chars)
140
141
# Bit flags for the 'options' argument of _Print() / InstancePrinter.

# Print a List/Dict that closes a cycle as '[ -->0x.. ]' or '{ -->0x.. }'
# (with its object ID) instead of raising error.Encode
SHOW_CYCLES = 1 << 1
# Print non-data objects like Eggex as <Eggex 0xff> instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON is lossy; passed through to pyj8.WriteString()
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf, -inf and nan are written as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two must agree
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      val: value to serialize
      buf: output buffer
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES | SHOW_NON_DATA
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Serialize a value as JSON8.  For json8 write (x) and toJson8()

    No option flags are set.

    Caller must handle error.Encode
    """
    _Print(val, buf, indent)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Serialize a value as JSON.  For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    debug_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, line_buf, -1, options=debug_options)

    f.write(line_buf.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to 'buf', quoted unless quotes can be omitted.

    For pp proc, etc.

    Args:
      s: string to encode
      buf: output buffer
      unquoted_ok: if True, and fastfunc.CanOmitQuotes(s) says so, write the
                   string as is
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the string encoded on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the string encoded on one line, with the LOSSY_JSON option.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
226
227
# DFS traversal state, stored per heap ID in InstancePrinter.visited
UNSEEN = 0  # default when the ID isn't in the dict
EXPLORING = 1  # container is being printed; seeing it again means a cycle
FINISHED = 2  # container was fully printed; it may be printed again
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to a BufWriter.  The 'options' bit flags control what
    happens with cycles, non-data objects, and inf/nan floats, and are also
    passed through to pyj8.WriteString().
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: output buffer
          indent: number of spaces per nesting level, or -1 for everything
                  on one line
          options: bit flags, e.g. SHOW_CYCLES | SHOW_NON_DATA
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  Missing keys are
        # read as UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at this level."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [ item, ... ], recursing through Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        for index, elem in enumerate(val.items):
            if index != 0:  # separator goes before every item but the first
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(elem, level + 1)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as { "key": value, ... }, recursing through Print()."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        is_first = True
        for key, item in iteritems(val.d):
            if not is_first:
                self.buf.write(',')
                self._MaybeNewline()
            is_first = False

            self._ItemIndent(level)

            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the start of the wrapper object: {"type": T "data":

        Args:
          type_str: already quoted, and includes the trailing comma, e.g.
                    '"BashArray",'
        """
        self.buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray", or "BashAssoc",
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing brace of the wrapper from _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a wrapper object with index -> string data.

        Entries that are None are skipped, and the index is written as a
        string key.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            wrote_any = False
            for index, item in enumerate(val.strs):
                if item is None:  # unset element
                    continue

                if wrote_any:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(index), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(item, self.options, self.buf)

                wrote_any = True

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as a wrapper object with string -> string data."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for key, item in iteritems(val.d):
                if not is_first:
                    self.buf.write(',')
                    self._MaybeNewline()
                is_first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(key, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(item, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _FloatStr(self, fl):
        # type: (float) -> str
        """Return the text for a float, handling inf and nan specially."""
        if math.isinf(fl):
            if self.options & INF_NAN_ARE_NULL:
                return 'null'  # negative infinity is null too
            if fl < 0:
                return '-' + 'INFINITY'
            return 'INFINITY'

        if math.isnan(fl):
            if self.options & INF_NAN_ARE_NULL:
                # JavaScript JSON lib behavior: Inf and NaN are null
                # Python has a bug in the encoder by default, and then
                # allow_nan=False raises an error
                return 'null'
            return 'NAN'

        # TODO: can we avoid intermediate allocation?
        #       self.buf.WriteFloat(val.f)
        return str(fl)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write a value, and everything it contains, to self.buf.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: on an object cycle without SHOW_CYCLES, or on a
                        non-data object without SHOW_NON_DATA
        """
        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.
                int_str = mops.ToStr(val.i)
                self.buf.write(int_str)

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                self.buf.write(self._FloatStr(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                state = self.visited.get(heap_id, UNSEEN)

                if state == EXPLORING:  # we're inside this very list
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                    self.buf.write('[ -->%s ]' % ValueIdString(val))
                    return

                if state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we
                    # can visit and print nodes MANY TIMES, as long as
                    # they're not in a cycle.
                    self._PrintList(val, level)
                    return

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                state = self.visited.get(heap_id, UNSEEN)

                if state == EXPLORING:  # we're inside this very dict
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                    self.buf.write('{ -->%s }' % ValueIdString(val))
                    return

                if state == FINISHED:
                    # Print it AGAIN, as with List
                    self._PrintDict(val, level)
                    return

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # TODO: New format, which should be consistent with pretty
            # printing.  pp line (x) supports BashArray and BashAssoc, e.g.
            # for spec tests.

            # - BashAssoc is Dict[str, str]
            #   (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #   (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))

                # Similar to = operator, ui.DebugPrint()
                # TODO: that prints value.Range in a special way
                type_name = ValType(val)
                id_suffix = ValueIdString(val)
                self.buf.write('<%s%s>' % (type_name, id_suffix))
560
561
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set, a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # Meant for a first pass: object ID -> number of times referenced.
        # Nothing fills it in yet.
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder that does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output detected on stdout.

        The 'buf' argument is not used.
        """
        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
614
615
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string: the third item of the tuple is the decoded value for J8_String
    tokens, and None for every other token.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects J8-only syntax -- single quoted
                 strings, comments, and the j"" prefix
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) error.Decode for the span self.pos .. end_pos."""

        # Use the current position as start pos
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string or None)

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:  # plain JSON is stricter
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        It uses match.MatchJ8LinesToken(), and doesn't look at self.is_j8.
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which selects the JSON or J8
                   string lexer
          str_pos: position right after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode
        """
        # Double quoted strings use the JSON rules; b'' and u'' use J8 rules
        is_json_str = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if is_json_str:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            #
            # Errors
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # Closing quote: hand back what we accumulated
            #

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                piece = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # the char after the backslash
                esc_ch = self.s[str_pos + 1]
                piece = consts.LookupCharC(esc_ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # skip 3 chars of prefix and 1 char of suffix: \u{ }
                hex_str = self.s[str_pos + 3:str_end - 1]
                code_point = int(hex_str, 16)

                # Same checks in osh/word_compile.py
                if code_point > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= code_point and code_point < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_str, str_end)

                piece = Utf8Encode(code_point)

            elif tok_id == Id.Char_YHex:  # J8 only
                hex_str = self.s[str_pos + 2:str_end]  # skip \y

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % hex_str,
                        str_end)

                byte_val = int(hex_str, 16)
                piece = chr(byte_val)

            elif tok_id == Id.Char_SurrogatePair:
                # Two \uXXXX escapes in a row: the hex digits are at offsets
                # 2-6 and 8-12
                high_hex = self.s[str_pos + 2:str_pos + 6]
                low_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                high = int(high_hex, 16) - 0xD800  # high surrogate
                low = int(low_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (high << 10) + low

                piece = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                hex_str = self.s[str_pos + 2:str_end]  # skip \u
                code_point = int(hex_str, 16)
                piece = Utf8Encode(code_point)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(piece)
            str_pos = str_end
810
811
class _Parser(object):
    """State and token helpers shared by the parsers in this file.

    Subclasses read self.tok_id, self.start_pos, self.end_pos and
    self.decoded, which describe the current token.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: True for J8, False for JSON.  It's passed to the lexer, and
                 picks the language name used in error messages.
        """
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, newline, or comment."""

        while True:
            # The token starts where the previous one ended.  This is true
            # for ignored tokens too, so a token's span doesn't include the
            # whitespace before it.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token has the given ID, then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, it doesn't skip any tokens.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) error.Decode located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
862
863
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that produces value_t.  Use ParseValue() as
    the entry point.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: True for JSON8, False for JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one Dict entry:

        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        On return, the current token is the one after the value.

        Raises:
          error.Decode
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Only the first char is inspected: 't' means true
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() reads self.start_pos
            # and self.end_pos, so the error must be created while they
            # still describe the integer, not the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's anything left after the
        value.
        """
        self._Next()
        obj = self._ParseValue()

        # After a complete value, the current token should start at the end
        # of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1003
1004
class Nil8Parser(_Parser):
    """
    Parser for NIL8, which produces nvalue_t.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: passed through to _Parser
        """
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now

            Returns the ID of the next significant token, without changing
            the parser state.
            """
            pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, pos = match.MatchJ8Token(self.s, pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Parse ( item* ) into nvalue.List.

        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        # precondition
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        elems = []  # type: List[nvalue_t]

        self._Next()

        # An empty record () never enters the loop.  On EOF, _ParseNil8()
        # raises, so the loop ends.
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            elems.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(elems)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elems = []  # type: List[nvalue_t]

        self._Next()

        # An empty list [] never enters the loop
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            elems.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elems)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 expression, starting at the current token.

        If the token after it is an operator, colon, or comma, then that's
        an infix expression: the right operand is parsed recursively, and
        the result is the list (op left right).

        Raises:
          error.Decode
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # Only the first char is inspected: 't' means true
            bool_val = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = bool_val

        elif self.tok_id == Id.J8_Int:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(text))

        elif self.tok_id == Id.J8_Float:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(text))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(text)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #next_id = self._LookAhead()
            #print('NEXT %s' % Id_str(next_id))

            #raise AssertionError()
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj  # not followed by an infix operator

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_text = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_text)

        self._Next()
        right = self._ParseNil8()
        infix = nvalue.List([op, obj, right])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 expression.

        Raises error.Decode, including when there's a token left after the
        expression.
        """
        self._Next()
        #print('yo')
        result = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return result
1187
1188
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

    "json" "json2"
    "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        """
        Args:
          s: the input, which may have many lines
        """
        _Parser.__init__(self, s, True)  # is_j8 is always True

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token with a label."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Empty lines append nothing.  On return, the current token is the
        first one of the next line.

        Raises:
          error.Decode
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            at_end = self.tok_id in (Id.J8_Newline, Id.Eol_Tok)
            if not at_end:
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past newline
            return

        # Anything else must be an unquoted line
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # ' unquoted "" text on line '  # read every token until end
        line_start = self.start_pos
        while True:
            # Remember the token before the end token, for stripping
            # whitespace
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        if last_id != Id.WS_Space:
            line_end = self.start_pos  # where the end token starts
        else:
            line_end = last_start  # remove trailing whitespace

        out.append(self.s[line_start:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input, and return the list of decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # The loop only ends at Eol_Tok, so this is a defensive check
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1308
1309
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.  Used by @(echo split command sub)

    Returns:
      The decoded lines.  Empty lines are omitted.

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    return lines_parser.Parse()
1324
1325
1326# vim: sw=4