OILS / data_lang / j8.py View on Github | oilshell.org

1267 lines, 620 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name comes from value_str() applied to the value's tag, called with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the heap object `val`.

        This definition only exists when mylib.PYTHON is true.  It uses
        Python's id(), which returns the address and may need up to 64 bits.
        In C++ the GC ID can be used instead; it fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same one, e.g. for List, Dict,
       Func, Proc, etc.
    2. Helping to detect object cycles.

    Primitive types like Int and Float have no such notion: they are
    immutable values that are copied and compared by value, so -1 is
    returned for them.
    """
    tag = val.tag()

    # These will not live on the heap if we switch to tagged pointers.  Str is
    # treated conservatively: once small string optimization exists, some
    # strings will be plain values, so we assume all of them are.
    if tag in (value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
               value_e.Str):
        return -1

    return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID as ' 0x<hex>', or '' when it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # ValueId() returned -1: nothing to show
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one Unicode code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Code points up to 0x7F give 1 byte, up to 0x7FF give 2 bytes, up to
    0xFFFF give 3 bytes, and everything larger gives 4 bytes.
    """
    n_cont = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        n_cont = 3

    # Continuation bytes are produced from the low bits upward, so the list
    # is built back to front and reversed at the end.
    encoded = []  # type: List[str]
    remaining = n_cont
    while remaining > 0:
        encoded.append(chr(0x80 | (code & 0x3F)))
        code >>= 6
        remaining -= 1

    lead = (0x1E << (6 - n_cont)) | (code & (0x3F >> n_cont))
    # Keep only the low 8 bits, because Python ints don't wrap around!
    encoded.append(chr(lead & 0xFF))

    encoded.reverse()
    return ''.join(encoded)
140
141
# Bit flags for the 'options' argument of _Print() and InstancePrinter.
# Combine them with |.

# On a List/Dict cycle, write a '[ -->0x... ]' or '{ -->0x... }' marker with
# the object ID instead of raising error.Encode
SHOW_CYCLES = 1 << 1
# Non-data objects like Eggex are written as <Eggex 0xff> instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2
LOSSY_JSON = 1 << 3  # JSON is lossy; passed through to pyj8.WriteString()
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf, -inf and nan are written as null

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two values must agree
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write `val` to `buf` using a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags such as LOSSY_JSON or SHOW_CYCLES, combined with |
    """
    InstancePrinter(buf, indent, options).Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write `val` as JSON8, for json8 write (x) and toJson8().

    No option flags are set.  Caller must handle error.Encode
    """
    _Print(val, buf, indent, 0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write `val` as JSON, for json write (x) and toJson().

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write `val` on a single line followed by a newline, for pp line (x)."""

    # Cycles and non-data objects are both shown rather than rejected, so
    # error.Encode should be impossible here.
    one_line = mylib.BufWriter()
    _Print(val, one_line, -1, SHOW_CYCLES | SHOW_NON_DATA)

    f.write(one_line.getvalue())
    f.write('\n')
189
190
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string `s` to `buf` in encoded form.  For pp proc, etc.

    When unquoted_ok is set and fastfunc.CanOmitQuotes(s) agrees, the string
    is written as-is.  Otherwise it is printed on one line as a value.Str.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
200
201
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return `s` encoded on one line, for write --json8 $s and compexport."""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
212
213
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return `s` encoded on one line with LOSSY_JSON, for write --json."""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, LOSSY_JSON)
    return out.getvalue()
223
224
# DFS traversal state, stored per List/Dict in InstancePrinter.visited
UNSEEN = 0  # default when an object has no entry yet
EXPLORING = 1  # set before a container's contents are printed
FINISHED = 2  # set after a container's contents were printed
229
230
class InstancePrinter(object):
    """Writes one value_t tree to a BufWriter as J8 Notation or JSON.

    Layout is controlled by `indent` (-1 puts everything on one line), and
    behavior by the module-level option bit flags.  List and Dict values are
    tracked by heap ID so that object cycles are noticed while printing.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # DFS state for cycle detection.
        # Key is HeapValueId(val); value is EXPLORING or FINISHED.  A missing
        # key is read as UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent an item that sits one step inside a container at `level`."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent the closing bracket of a container at `level`."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List; its items are printed recursively via Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        need_comma = False
        for item in val.items:
            if need_comma:
                self.buf.write(',')
                self._MaybeNewline()
            need_comma = True

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict; its values are printed recursively via Print()."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        need_comma = False
        for key, item in iteritems(val.d):
            if need_comma:
                self.buf.write(',')
                self._MaybeNewline()
            need_comma = True

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)
            self.buf.write(':')
            self._MaybeSpace()
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _FloatStr(self, fl):
        # type: (float) -> str
        """Return the text written for a Float."""
        if math.isinf(fl):
            if self.options & INF_NAN_ARE_NULL:
                return 'null'  # negative infinity is null too
            if fl < 0:
                return '-INFINITY'
            return 'INFINITY'

        if math.isnan(fl):
            if self.options & INF_NAN_ARE_NULL:
                # JavaScript JSON lib behavior: Inf and NaN are null
                # Python has a bug in the encoder by default, and then
                # allow_nan=False raises an error
                return 'null'
            return 'NAN'

        # TODO: can we avoid intermediate allocation?
        #       self.buf.WriteFloat(val.f)
        return str(fl)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray like a list of strings; None entries are null."""
        self.buf.write('[')
        self._MaybeNewline()

        need_comma = False
        for s in val.strs:
            if need_comma:
                self.buf.write(',')
                self._MaybeNewline()
            need_comma = True

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc like a dict with string keys and values."""
        self.buf.write('{')
        self._MaybeNewline()

        need_comma = False
        for key, s in iteritems(val.d):
            if need_comma:
                self.buf.write(',')
                self._MaybeNewline()
            need_comma = True

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)
            self.buf.write(':')
            self._MaybeSpace()
            pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write `val` to self.buf, recursing into containers.

        Raises:
          error.Encode: on an object cycle when SHOW_CYCLES is not set, or on
            a value of any other type when SHOW_NON_DATA is not set.
        """

        # indent == -1 is the special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.
                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                self.buf.write(self._FloatStr(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                obj_id = HeapValueId(val)
                state = self.visited.get(obj_id, UNSEEN)

                if state == EXPLORING:
                    # We are still inside this very list: a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                elif state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we
                    # can visit and print nodes MANY TIMES, as long as they're
                    # not in a cycle.
                    self._PrintList(val, level)

                else:
                    self.visited[obj_id] = EXPLORING
                    self._PrintList(val, level)
                    self.visited[obj_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                obj_id = HeapValueId(val)
                state = self.visited.get(obj_id, UNSEEN)

                if state == EXPLORING:
                    # We are still inside this very dict: a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                elif state == FINISHED:
                    # Print it AGAIN, same reasoning as for List
                    self._PrintDict(val, level)

                else:
                    self.visited[obj_id] = EXPLORING
                    self._PrintDict(val, level)
                    self.visited[obj_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,'b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    type_name = ValType(val)
                    id_suffix = ValueIdString(val)
                    self.buf.write('<%s%s>' % (type_name, id_suffix))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
501
502
class PrettyPrinter(object):
    """Placeholder pretty printer: unused right now, but could enhance the =
    operator.

    Output goes to a polymorphic ColorOutput.

    Planned features, like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a ref count dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set, a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # Filled by a first pass: object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output on stdout; `buf` is unused."""

        # Or print to stderr?
        self.PrettyTree(val, fmt.DetectConsoleOutput(mylib.Stdout()))

        # Then print those with ASDL
555
556
class LexerDecoder(object):
    """Lexer for J8 / JSON text that also decodes string literals.

    The interface is similar to SimpleLexer, except that every call returns a
    third item: the decoded string for string tokens, and None otherwise.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # interpolated into error messages

        self.pos = 0  # where the next token is matched

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # A single buffer is reused for every string literal, to save GC
        # objects.  JSON objects could have thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Build (not raise) an error spanning self.pos to end_pos."""

        # The current position is the start of the span
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if self.is_j8:
            # All four kinds of opening quote start a string
            if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                          Id.Left_BSingleQuote, Id.Left_USingleQuote):
                return self._DecodeString(tok_id, end_pos)
        else:
            # Pure JSON: only "" strings, and no comments
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            if tok_id == Id.Left_DoubleQuote:
                return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        elif tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        elif tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token, which selects the string lexer
          str_pos: position just after the opening quote
        """
        # "" and j"" strings use the JSON string lexer; the rest use J8's
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # Closing quote: hand back what was accumulated
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', result)
                return Id.J8_String, str_end, result

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)

            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)

            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                part = consts.LookupCharC(self.s[str_pos + 1])

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Strip the 3 leading chars and the 1 trailing char
                hex_str = self.s[str_pos + 3:str_end - 1]
                code = int(hex_str, 16)

                # Same checks in osh/word_compile.py
                if code > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= code and code < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_str, str_end)

                part = Utf8Encode(code)

            elif tok_id == Id.Char_YHex:  # J8 only
                hex_str = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % hex_str,
                        str_end)

                part = chr(int(hex_str, 16))

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2-char prefix
                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                high = int(self.s[str_pos + 2:str_pos + 6], 16) - 0xD800
                low = int(self.s[str_pos + 8:str_pos + 12], 16) - 0xDC00

                part = Utf8Encode(0x10000 + (high << 10) + low)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                part = Utf8Encode(int(self.s[str_pos + 2:str_end], 16))

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
751
752
class _Parser(object):
    """Shared state and token helpers for the parsers in this file.

    Holds a LexerDecoder plus the current token: its ID, its start and end
    positions in the input, and the decoded string if it is a string token.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # Current token; nothing has been read yet
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        # start_pos isn't only the start of the new token, it's the END of the
        # token before it
        while True:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a
            # token can be used to calculate a column number.

            #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')
            return

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Require the current token to be `tok_id`, then advance.

        Raises:
          error.Decode: if the current token is a different one.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer, and skip nothing."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Build (not raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
803
804
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive-descent parser that builds value_t trees.  Each _Parse*
    method expects the current token to be the first token of the construct,
    and leaves the current token just past it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one `string ':' value` member of a Dict."""

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        When a key is repeated, the last value is the one kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse any value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, on a token that can't start a value, or on an
            integer that mops.FromStr() rejects.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char tells true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing: _ParseError() uses the current token's
            # span, which must still be the integer when the error is built.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on a syntax error, or if anything other than ignored
            tokens follows the value.
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token must start at the end of input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
944
945
class Nil8Parser(_Parser):
    """Parser for NIL8 text, producing nvalue_t trees.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:  # disabled: the method below is never defined

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, pos = match.MatchJ8Token(self.s, pos)
                if tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                              Id.Ignored_Comment):
                    continue
                return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list

        As implemented here, every item up to the closing ')' is collected
        into one nvalue.List, and () gives an empty one.
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        fields = []  # type: List[nvalue_t]

        self._Next()
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            fields.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        # The loop only ends on ')', so this just advances past it
        self._Eat(Id.J8_RParen)

        return nvalue.List(fields)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elems = []  # type: List[nvalue_t]

        self._Next()
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            elems.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        # The loop only ends on ']', so this just advances past it
        self._Eat(Id.J8_RBracket)

        return nvalue.List(elems)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operator and operand if any.

        Raises:
          error.Decode: on EOF or on a token that can't start a value.
        """
        tok = self.tok_id

        if tok == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif tok == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif tok == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif tok == Id.J8_Bool:
            # The first char tells true from false
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif tok == Id.J8_Int:
            int_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_text))

        elif tok == Id.J8_Float:
            float_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_text))

        elif tok == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif tok in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                     Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif tok == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #next_id = self._LookAhead()
            #print('NEXT %s' % Id_str(next_id))

            #raise AssertionError()
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # An operator follows the value.  Rewrite infix as prefix:
        # key: "value" -> (: key "value")
        op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

        self._Next()
        rhs = self._ParseNil8()
        infix = nvalue.List([op, obj, rhs])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as one NIL8 value.

        Raises:
          error.Decode: on a syntax error, or if a token follows the value.
        """
        self._Next()
        #print('yo')
        result = self._ParseNil8()
        #print("==> %d %s" % (id(result), result))
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
1128
1129
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # J8 Lines are always lexed as J8, never as pure JSON
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token and its span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Consumes one empty_line, j8_line or unquoted_line, including its end
        token.

        Raises:
          error.Decode: if anything but whitespace follows a quoted string.
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end token, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        # The loop only exits once the current token is Eol_Tok, so all input
        # has been consumed when it ends.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1249
1250
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split `s` into J8 Lines.  Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1262 """
1263 p = J8LinesParser(s)
1264 return p.Parse()
1265
1266
1267# vim: sw=4