OILS / data_lang / j8.py View on Github | oilshell.org

1367 lines, 683 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name is value_str() of the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies a value object.

        This Python definition uses id(), which returns the address and can
        take up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str; HeapValueId(val) for every
      other kind of value.
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            return -1
        else:
            return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value for display; used by pp value (42) and = 42

    Returns:
      '' when ValueId() reports -1 (no ID), otherwise ' 0x' followed by the
      ID as formatted by mylib.hex_lower().  Note the leading space.
    """
    obj_id = ValueId(val)
    if obj_id != -1:
        return ' 0x%s' % mylib.hex_lower(obj_id)

    # -1 means the value has no ID to show
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the integer code point.  Values up to 0x7F give one byte, up to
        0x7FF two bytes, up to 0xFFFF three bytes, and anything larger four
        bytes.

    Returns:
      A string with one character per encoded byte.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    # How many continuation bytes follow the leading byte
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes carry 6 bits each.  They are produced from the low
    # bits upward, so the list is built backward and reversed at the end.
    encoded = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6

    # Leading byte: length marker bits, then the remaining high bits of code
    marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    encoded.append(marker | (code & payload_mask))
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    return ''.join([chr(b & 0xFF) for b in encoded])
140
141
# Bit flags that are OR'd together to form the `options` argument of _Print()
# and InstancePrinter.

# A List or Dict that is part of a cycle is shown as [ --> 0x... ] or
# { --> 0x... } with its object ID, instead of raising error.Encode
SHOW_CYCLES = 1 << 1
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: infinities and NaN are printed as null

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: where the output is written
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags passed through to InstancePrinter
    """
    InstancePrinter(buf, indent, options).Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    printer = InstancePrinter(buf, indent, 0)
    printer.Print(val)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf with the LOSSY_JSON and INF_NAN_ARE_NULL flags set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    display_options = SHOW_CYCLES | SHOW_NON_DATA

    one_line = mylib.BufWriter()
    InstancePrinter(one_line, -1, display_options).Print(val)

    f.write(one_line.getvalue())
    f.write('\n')
191
192
def Repr(val):
    # type: (value_t) -> str
    """Return a one-line string for val, showing cycles and non-data objects.

    For assert [x]

    This is like Python's repr
    """
    out = mylib.BufWriter()

    # error.Encode should be impossible - we show cycles and non-data
    display_options = SHOW_CYCLES | SHOW_NON_DATA
    _Print(val, out, -1, options=display_options)

    return out.getvalue()
203
204
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string s to buf; for pp proc, etc.

    Args:
      s: the string to write
      buf: where the output is written
      unquoted_ok: if True and fastfunc.CanOmitQuotes(s) is true, s is
        written as-is; otherwise it is printed as a value.Str on one line
    """
    write_raw = unquoted_ok and fastfunc.CanOmitQuotes(s)
    if write_raw:
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
214
215
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s printed as a value.Str on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    str_val = value.Str(s)
    out = mylib.BufWriter()
    _Print(str_val, out, -1)
    return out.getvalue()
226
227
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s printed as a value.Str on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    str_val = value.Str(s)
    out = mylib.BufWriter()
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
237
238
# DFS traversal state, kept per List/Dict in InstancePrinter.visited
UNSEEN = 0  # default when a container has no entry yet
EXPLORING = 1  # container is being printed; meeting it again is a cycle
FINISHED = 2  # container was printed completely; it may be printed again
243
244
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    One instance writes into a single buffer.  `indent` is the number of
    spaces per nesting level, or -1 to put everything on one line.  `options`
    is a bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON and
    INF_NAN_ARE_NULL; it is also passed through to pyj8.WriteString().
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val) of a List or Dict
        # Value is its DFS state, EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...], recursing into each item."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as {key: value, ...}, recursing into each value."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the opening of the {"type": ..., "data": ...} wrapper.

        Args:
          type_str: already-quoted type name INCLUDING the trailing comma,
            e.g. '"BashArray",'.  It is written verbatim.
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing brace of the wrapper from _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Write a SparseArray as a wrapped object of index -> string."""

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                # The integer key is written as a string
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a wrapped object of index -> string.

        Entries that are None are skipped, so the indices may have gaps.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't use i != 0 to decide on the comma, because of skipping
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as a wrapped object of string -> string."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val to self.buf, recursing into Lists and Dicts.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict inside an object cycle when
            SHOW_CYCLES is not set, or for a value of any other type than
            the ones handled below when SHOW_NON_DATA is not set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
601
602
class PrettyPrinter(object):
    """Placeholder pretty printer; PrettyTree() is still a stub.

    Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # Filled by a first pass: object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output on stdout; buf is unused."""

        # Or print to stderr?
        self.PrettyTree(val, fmt.DetectConsoleOutput(mylib.Stdout()))

        # Then print those with ASDL
655
656
class LexerDecoder(object):
    """Lexer for J8 and JSON that also decodes string literals.

    Similar interface as SimpleLexer, except each token is returned with an
    optional decoded string.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # language name to use in error messages

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Build (not raise) error.Decode spanning self.pos to end_pos."""

        # Use the current position as start pos
        begin_pos = self.pos
        return error.Decode(msg, self.s, begin_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        The third item is the decoded string for a string token, and None for
        every other token.

        Raises:
          error.Decode: for J8-only syntax when lexing pure JSON, or for an
            invalid string literal.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            # Pure JSON: reject everything that only J8 allows
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # Any remaining left quote starts a string literal
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an unescaped ASCII control char
            on an unquoted line, or for an invalid string literal.
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token for the opening quote, which selects the lexer
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: for EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an illegal code
            point escape.
        """
        # "" and j"" are lexed as JSON strings, the others as J8 strings
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: hand back everything accumulated so far
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these cases.
                # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The character after the backslash
                escaped = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                hex_digits = self.s[str_pos + 3:str_end - 1]
                code_point = int(hex_digits, 16)

                # Same checks in osh/word_compile.py
                if code_point > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= code_point and code_point < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_digits, str_end)

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                hex_digits = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" %
                        hex_digits, str_end)

                byte_val = int(hex_digits, 16)
                part = chr(byte_val)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after its own \u
                hi_hex = self.s[str_pos + 2:str_pos + 6]
                lo_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                hi_bits = int(hi_hex, 16) - 0xD800  # high surrogate
                lo_bits = int(lo_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (hi_bits << 10) + lo_bits

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                hex_digits = self.s[str_pos + 2:str_end]
                code_point = int(hex_digits, 16)
                part = Utf8Encode(code_point)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
851
852
class _Parser(object):
    """State and token helpers shared by the parsers in this file.

    Keeps the current token (tok_id), its span in the input (start_pos to
    end_pos), and the decoded string when the token is a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token, skipping spaces, newlines and comments."""

        while True:
            # The END of the token before is where this token starts
            self.start_pos = self.end_pos

            tok_id, end_pos, decoded = self.lexer.Next()
            self.tok_id = tok_id
            self.end_pos = end_pos
            self.decoded = decoded

            if tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                          Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance.

        Raises:
          error.Decode: if the current token is a different one.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos

        tok_id, end_pos, decoded = self.lexer.NextForLines()
        self.tok_id = tok_id
        self.end_pos = end_pos
        self.decoded = decoded

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Build (not raise) error.Decode for the span of the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
903
904
class Parser(_Parser):
    """JSON and JSON8 Parser.

    Recursive descent over the tokens of LexerDecoder.  Call ParseValue()
    once to parse the whole input string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key appears more than once, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token.

        On success, the current token is the one after the value.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
            mops.FromStr() rejects.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char distinguishes the two Bool literals
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() reports the span of
            # the current token, so this makes the error point at the integer
            # itself, not at whatever token follows it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: if the input isn't a valid value, or if anything
            other than ignored tokens follows the value.
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token must start at the end of input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1044
1045
1046class Nil8Parser(_Parser):
1047 """
1048 Tokens not in JSON8:
1049 LParen RParen Symbol
1050
1051 Tokens not in JSON, but in JSON8 and NIL8:
1052 Identifier (unquoted keys)
1053 Ignored_Comment
1054 """
1055
1056 def __init__(self, s, is_j8):
1057 # type: (str, bool) -> None
1058 _Parser.__init__(self, s, is_j8)
1059
1060 if 0:
1061
1062 def _LookAhead(self):
1063 # type: () -> Id_t
1064 """
1065 Don't need this right now
1066 """
1067 end_pos = self.end_pos # look ahead from last token
1068 while True:
1069 tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1070 if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1071 Id.Ignored_Comment):
1072 break
1073 return tok_id
1074
1075 def _ParseRecord(self):
1076 # type: () -> nvalue_t
1077 """
1078 Yaks
1079 (self->Next) => (-> self Next)
1080 (self->Next obj.field) => ((-> self Next) (. obj field))
1081
1082 Similar to
1083 ((identity identity) 42) => 42 in Clojure
1084
1085 ASDL
1086 (Node left:(. x4beef2))
1087 (Node left !x4beef2)
1088
1089 # Ambiguous because value can be identifier.
1090 # We have to look ahead to and see if there's a colon :
1091 field =
1092 Identifier ':' value
1093 | value
1094
1095 record = '(' head field* ')'
1096
1097 - Identifier | Symbol are treated the same, it's a side effect of
1098 the lexing style
1099 - do positional args come before named args
1100 - () is invalid? Use [] for empty list
1101 """
1102 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1103
1104 items = [] # type: List[nvalue_t]
1105
1106 self._Next()
1107 if self.tok_id == Id.J8_RParen:
1108 self._Next()
1109 return nvalue.List(items)
1110
1111 #log('TOK %s', Id_str(self.tok_id))
1112 while self.tok_id != Id.J8_RParen:
1113 items.append(self._ParseNil8())
1114 #log('TOK 2 %s', Id_str(self.tok_id))
1115
1116 self._Eat(Id.J8_RParen)
1117
1118 return nvalue.List(items)
1119
1120 def _ParseList8(self):
1121 # type: () -> nvalue_t
1122 """
1123 List8 = '[' value* ']'
1124
1125 No commas, not even optional ones for now.
1126 """
1127 assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1128
1129 items = [] # type: List[nvalue_t]
1130
1131 self._Next()
1132 if self.tok_id == Id.J8_RBracket:
1133 self._Next()
1134 return nvalue.List(items)
1135
1136 #log('TOK %s', Id_str(self.tok_id))
1137 while self.tok_id != Id.J8_RBracket:
1138 items.append(self._ParseNil8())
1139 #log('TOK 2 %s', Id_str(self.tok_id))
1140
1141 self._Eat(Id.J8_RBracket)
1142
1143 return nvalue.List(items)
1144
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, plus an optional infix operator that follows it.

    The current token selects what is parsed:

    - '(' delegates to self._ParseRecord()
    - '[' delegates to self._ParseList8()
    - null, bool, int, float and string tokens become the matching nvalue
    - identifier, operator, colon and comma tokens become nvalue.Symbol,
      i.e. an unquoted "word"

    If the token right after that value is an operator, colon or comma, it
    is treated as an infix operator and the result is a 3-element list in
    prefix order:

        key: "value"  ->  (: key "value")

    The right operand is parsed by calling this method again, so a chain
    of infix operators nests to the right.

    Raises the error built by self._ParseError() on end of input or on a
    token that can't start a value.
    """
    tok_id = self.tok_id

    # Guard: nothing left to parse
    if tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    if tok_id == Id.J8_LParen:
        obj = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        obj = self._ParseList8()

    # The primitives below mirror the J8 parser.
    # TODO: We also want hex literals.
    elif tok_id == Id.J8_Null:
        self._Next()
        obj = nvalue.Null

    elif tok_id == Id.J8_Bool:
        # Only the first character of the token is inspected
        obj = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif tok_id == Id.J8_Int:
        int_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Int(int(int_text))

    elif tok_id == Id.J8_Float:
        float_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Float(float(float_text))

    elif tok_id == Id.J8_String:
        # Read self.decoded BEFORE advancing to the next token
        obj = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                    Id.J8_Comma):
        # An unquoted "word" is kept as its source text
        word = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Symbol(word)

    else:  # e.g. Id.Unknown_Tok, Id.J8_RParen
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands alone
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return obj

    # key: "value" -> (: key "value")
    op_text = self.s[self.start_pos:self.end_pos]
    op = nvalue.Symbol(op_text)

    self._Next()
    rhs = self._ParseNil8()
    infix_node = nvalue.List([op, obj, rhs])  # type: nvalue_t
    return infix_node
1217
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the whole input as a single NIL8 value.

    Anything left over after that value is an error.

    Raises error.Decode.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1228
1229
class J8LinesParser(_Parser):
    """Split a newline-delimited string into decoded lines (J8 Lines).

    The format is described by a grammar, which lets us keep location info
    and cut down on allocations.  (That said, unquoted_line behaves more
    like a LOOP than a grammar rule.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until end token, but REMOVE trailing WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) More than one string on a line is disallowed, for example:

          "json" "json2"
          "json" unquoted

    (2) Quotes in the middle of an unquoted line are allowed.  Take this
        line:

          foo "" u''

        The "" and u'' are not decoded as strings, because the line began
        with Id.Lit_Chars literals.

    (3) Is this related to TSV8?  The rules are similar.  Does TSV8 have
        empty cells?  Does it use - for an empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # The meaning of the second argument is defined by _Parser
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log the current token and its span, tagged with s."""
        tok_name = Id_str(self.tok_id)
        log('%s tok_id %s %d-%d', s, tok_name, self.start_pos, self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, appending its value to 'out' unless it's empty."""
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # empty_line: nothing to append
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # j8_line: a single quoted string
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            # Optional trailing whitespace
            if self.tok_id == Id.WS_Space:
                self._NextForLines()

            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                self._NextForLines()
                return

            raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                   Id_str(self.tok_id))

        # Only an unquoted line is left as a possibility
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # unquoted_line, e.g.
        #   ' unquoted "" text on line '
        # Read every token up to the end of the line.
        line_begin = self.start_pos

        # Track the token just before the current one, so that trailing
        # whitespace can be stripped.
        last_id = self.tok_id
        last_start = self.start_pos
        self._NextForLines()

        # It would be nicer if "middle" Id.WS_Space tokens didn't have \r,
        # but we're sticking with the JSON spec definition of whitespace.
        # (As another data point, CPython on Unix allows \r in the middle
        # of expressions, treating it as whitespace.)
        while self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

        # The line ends where the end token starts, unless the token before
        # it was whitespace, which is dropped.
        line_end = self.start_pos
        if last_id == Id.WS_Space:
            line_end = last_start

        out.append(self.s[line_begin:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Parse all lines and return them in order.

        Raises error.Decode.
        """
        self._NextForLines()

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive check: the loop above only exits at Id.Eol_Tok
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1349
1350
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines; used by @(echo split command sub).

    This is a thin wrapper that runs J8LinesParser over 's'.

    Raises:
      error.Decode

    The 3 kinds of error:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1365
1366
1367# vim: sw=4