OILS / data_lang / j8.py View on Github | oilshell.org

1272 lines, 620 significant
1#!/usr/bin/env python2
"""
j8.py: J8 Notation, a superset of JSON

Later:

- PrettyPrinter uses hnode.asdl?
  - color
  - line wrapping -- do this later
  - would like CONTRIBUTORS here

- Unify with ASDL pretty printing - NIL8
  - {} [] are identical
  - () is for statically typed ASDL data
    (command.Simple blame_tok:(...) words:[ ])
    although we are also using [] for typed ASDL arrays, not just JSON
  - object IDs
    - @ x123 can create an ID
    - ! x123 can reference an ID
  - <> can be for non-J8 data types?  For the = operator
  - 'hi \(name)' interpolation is useful for code

- Common between JSON8 and NIL8 - for writing by hand
  - comments - # line or // line (JSON5 uses // line, following JS)
  - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
  - commas
    - JSON8 could have trailing commas rule
    - NIL8 at least has no commas for [1 2 "hi"]
"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is produced by value_str() from the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the given object.

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str; otherwise HeapValueId(val).
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            pass
        else:
            result = HeapValueId(val)
    return result
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Used by pp value (42) and = 42

    Returns ' 0x<hex id>' (with a leading space) for values that have an ID,
    and the empty string when ValueId() reports -1.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Code points up to 0x7F become a single byte.  Larger ones get a lead byte
    followed by 1, 2 or 3 continuation bytes, each carrying 6 bits.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the LOW bits first, so the list is
    # built backwards and reversed at the end.
    encoded = []  # type: List[int]
    remaining = code
    todo = num_cont_bytes
    while todo > 0:
        encoded.append(0x80 | (remaining & 0x3F))
        remaining >>= 6
        todo -= 1

    # Lead byte: length marker in the high bits, leftover payload in the low
    # bits.  The marker overflows 8 bits and is masked below.
    payload_mask = 0x3F >> num_cont_bytes
    lead = (0x1E << (6 - num_cont_bytes)) | (remaining & payload_mask)
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = []  # type: List[str]
    for byte in encoded:
        chars.append(chr(byte & 0xFF))
    return ''.join(chars)
140
141
# Bit flags for the 'options' argument of _Print() / InstancePrinter.
# Combine them with |

# Print a back-reference like [ --> 0x123 ] or { --> 0x123 } instead of
# raising error.Encode when an object cycle is found
SHOW_CYCLES = 1 << 1
# non-data objects like Eggex can be <Eggex 0xff>
SHOW_NON_DATA = 1 << 2
# JSON is lossy
LOSSY_JSON = 1 << 3
# for JSON
INF_NAN_ARE_NULL = 1 << 4

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of printer flags; 0 means none
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """ For json8 write (x) and toJson8()

    Prints with no option flags set.

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """ For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """ For pp line (x)

    Prints the value on a single line to 'f', followed by a newline.
    """
    # error.Encode should be impossible - we show cycles and non-data
    debug_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, line_buf, -1, options=debug_options)

    f.write(line_buf.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """ For pp proc, etc.

    Writes 's' to 'buf'.  It's written as-is when unquoted_ok is set and
    fastfunc.CanOmitQuotes() allows it; otherwise it's printed as a J8 string
    on one line.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """ For write --json8 $s and compexport

    Returns 's' printed as a J8 string on one line, with no option flags.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """ For write --json

    Returns 's' printed on one line with the LOSSY_JSON option.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b
    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
226
227
# DFS traversal state of a container, used for cycle detection while printing
UNSEEN = 0  # not visited yet (also the default for IDs never recorded)
EXPLORING = 1  # currently being printed, i.e. an ancestor of the current node
FINISHED = 2  # printed completely at least once
232
233
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    The output goes to a mylib.BufWriter.  With indent == -1 everything is
    written on one line; otherwise containers are spread over multiple lines
    and nested items are indented by 'indent' spaces per level.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where output is written
          indent: number of spaces to indent, or -1 for everything on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
                   INF_NAN_ARE_NULL
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is a DFS state: EXPLORING or FINISHED.  IDs that aren't in
        # the dict are UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List and its items; no cycle checks are done here."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, item in enumerate(val.items):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict and its entries; no cycle checks are done here."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        num_printed = 0
        for k, v in iteritems(val.d):
            if num_printed != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

            num_printed += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray like a list of strings; None entries are null."""
        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc like a dict of string keys to string values."""
        self.buf.write('{')
        self._MaybeNewline()
        num_printed = 0
        for k2, v2 in iteritems(val.d):
            if num_printed != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k2, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v2, self.options, self.buf)

            num_printed += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print a value, recursing into List and Dict.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: on an object cycle when SHOW_CYCLES isn't set, or on
            a non-data value when SHOW_NON_DATA isn't set
        """
        # self.indent == -1 is a special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                # UNSEEN or FINISHED.  We print a JSON tree, which means we
                # can visit and print nodes MANY TIMES, as long as they're not
                # in a cycle.
                #
                # A FINISHED node must be marked EXPLORING again while it's
                # re-printed.  With SHOW_CYCLES, a node that refers to itself
                # still ends up FINISHED, so printing it a second time without
                # the mark would recurse forever.
                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                # UNSEEN or FINISHED: same reasoning as for List.  Always
                # mark EXPLORING, so a second visit to a self-referential
                # Dict terminates.
                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # TODO: New format, which should consistent with pretty printing
            # pp line (x) supports BashArray and BashAssoc, e.g. for spec
            # tests.

            # - BashAssoc is Dict[str, str]
            #   (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #   (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
506
507
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored on the instance; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times references

        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        return

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput for stdout; 'buf' is unused."""

        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
560
561
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: when False, Next() rejects single quoted strings, comments
                 and the j"" prefix
          lang_str: language name that's interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error for the span self.pos - end_pos."""

        # Use the current position as start pos
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        The third item is the decoded string for string tokens, and None for
        all other tokens.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        elif tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        elif tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token that opened the string; it selects the string
                   lexer and whether \\yff escapes are allowed
          str_pos: position just after the opening quote
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # Errors first
            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: hand back everything accumulated so far
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # Skip the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
756
757
class _Parser(object):
    """State and token helpers shared by the parsers in this file."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # Current token: its ID, its span in self.s, and the decoded string
        # that the lexer returned with it
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""
        while True:
            # A token starts where the previous one ended
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: The position of the last newline and a token can be used
            # to calculate a column number for error messages.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance past it.

        Raises:
          error.Decode if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
808
809
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        key = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert key is not None

        self._Eat(Id.J8_Colon)

        return key, self._ParseValue()

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id != Id.J8_RBrace:  # not the empty Dict
            k, v = self._ParsePair()
            d[k] = v
            #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

            while self.tok_id == Id.J8_Comma:
                self._Next()
                k, v = self._ParsePair()
                d[k] = v
                #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id != Id.J8_RBracket:  # not the empty List
            items.append(self._ParseValue())

            while self.tok_id == Id.J8_Comma:
                self._Next()
                items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token.

        Raises:
          error.Decode on EOF, an invalid token, or an integer that
          mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        if self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Only the first char is inspected to tell true from false
            bool_val = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return bool_val

        if self.tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            try:
                big = mops.FromStr(int_str)
            except ValueError:
                raise self._ParseError('Integer is too big')
            return value.Int(big)

        if self.tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(float_str))

        # UString, BString too
        if self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """ Parse the whole input as a single value.

        Raises error.Decode.
        """
        self._Next()
        obj = self._ParseValue()

        # The token after the value must start at the very end of the input
        extra = len(self.s) - self.start_pos
        if extra != 0:
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
949
950
class Nil8Parser(_Parser):
    """
    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, pos = match.MatchJ8Token(self.s, pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero or more items, until the closing paren
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero or more items, until the closing bracket
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operator that may follow it.

        When the value is followed by an operator, colon or comma token, the
        result is the list (op value operand2), where operand2 is parsed
        recursively.
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # Only the first char is inspected to tell true from false
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_str))

        elif self.tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_str))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            op_str = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(op_str)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """ Parse the whole input as a single NIL8 value.

        Raises error.Decode.
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._ParseError('Unexpected trailing input')
1133
1134
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token with a label."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Consumes one line, including the token that ends it.
        """
        #self._Show('1')

        # Leading whitespace is skipped on every kind of line
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # Unquoted line
        # ' unquoted "" text on line '  # read every token until end
        line_start = self.start_pos
        while True:
            # Remember the last token before the end, for stripping
            # whitespace
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        if last_id == Id.WS_Space:
            line_end = last_start  # remove trailing whitespace
        else:
            line_end = self.start_pos

        out.append(self.s[line_start:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """ Return the decoded lines of the input.

        Raises error.Decode.
        """
        self._NextForLines()

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1254
1255
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Used by @(echo split command sub)

    Parses 's' with J8LinesParser and returns the resulting lines.

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1270
1271
1272# vim: sw=4