OILS / data_lang / j8.py View on Github | oilshell.org

1299 lines, 639 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
# Duplicated from ui.ValType() so this module doesn't depend on ui
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of 'val', for showing type errors to the user."""
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the heap object 'val'.

        This Python definition uses id(), i.e. the object's address, which
        can take up to 64 bits.  The C++ build can use the GC ID instead,
        which fits within 32 bits.
        """
        heap_addr = id(val)
        return heap_addr
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an identity for 'val', or -1 if it has no notion of identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same object, e.g. for List, Dict,
       Func, Proc, etc.
    2. Detecting object cycles

    Null, Bool, Int, Float and Str are immutable values that are copied and
    compared by value, so they always get -1.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These won't live on the heap if we switch to tagged pointers.
            # Str is treated conservatively: with a small string
            # optimization only SOME strings would be values, so we assume
            # all of them are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of 'val' as ' 0x<hex>', or '' when ValueId() gives -1.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a single Unicode code point as a UTF-8 byte string.

    Adapted from https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  It is NOT checked against 0x10FFFF here: that
        check happens for statically parsed $'' u'', but not for dynamically
        parsed echo -e / printf, following bash/zsh.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Emit continuation bytes from least significant to most significant,
    # each carrying 6 payload bits.  The list is reversed at the end.
    encoded = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker bits, plus the payload bits that are left
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    encoded.append(lead)
    encoded.reverse()

    # Mask to 8 bits, because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
140
141
# Bit flags that are OR'd together to form the printer 'options'.

# show as [...] or {...} I think, with object ID
SHOW_CYCLES = 1 << 1

# non-data objects like Eggex can be <Eggex 0xff>
SHOW_NON_DATA = 1 << 2

# JSON is lossy
LOSSY_JSON = 1 << 3

# for JSON
INF_NAN_ARE_NULL = 1 << 4
146
# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two definitions must agree.
assert LOSSY_JSON == pyj8.LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, OR'd together
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print 'val' as JSON8, for json8 write (x) and toJson8().

    No options are set, so the caller must handle error.Encode.
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print 'val' as JSON, for json write (x) and toJson().

    The caller must handle error.Encode.

    Strings don't decay to b'' strings; the Unicode replacement char is used
    instead.  Inf and NaN are printed as null.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write 'val' to 'f' on one line, then a newline.  For pp line (x)."""

    # Cycles and non-data values are both shown rather than rejected, so
    # error.Encode should be impossible here.
    display_options = SHOW_CYCLES | SHOW_NON_DATA
    line_buf = mylib.BufWriter()

    # TODO: Omit type at top level
    _Print(val, line_buf, -1, options=display_options)

    f.write(line_buf.getvalue())
    f.write('\n')
192
193
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string 's' to 'buf' in J8 form.  For pp proc, etc.

    Args:
      unquoted_ok: if True, and fastfunc.CanOmitQuotes(s) allows it, write
        's' verbatim, without quotes.
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
203
204
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return 's' printed as a one-line J8 string.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    return out.getvalue()
215
216
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return 's' printed as a one-line JSON string (LOSSY_JSON is set).

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    return out.getvalue()
226
227
# States of a container node during the depth-first traversal that
# InstancePrinter does for cycle detection.
UNSEEN = 0  # not visited yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # printed completely; may be printed again
232
233
class InstancePrinter(object):
    """Serialize one value_t tree as J8 or JSON text.

    All output goes to the BufWriter passed to the constructor.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the text is written
          indent: spaces per nesting level, or -1 for everything on one line
          options: bit flags like SHOW_CYCLES, OR'd together
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Depth-first traversal state, for cycle detection.
        #   Key: HeapValueId() of a List or Dict
        #   Value: EXPLORING or FINISHED; a missing key means UNSEEN
        # (It's a Dict[int, int] because Dict[int, None] doesn't translate.)
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item INSIDE a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at the given level."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...], recursing into each item."""
        items = val.items
        if len(items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        is_first = True
        for item in items:
            if not is_first:
                self.buf.write(',')
                self._MaybeNewline()
            is_first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as {key: value, ...}, recursing into each value."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        is_first = True
        for key, item in iteritems(val.d):
            if not is_first:
                self.buf.write(',')
                self._MaybeNewline()
            is_first = False

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write 'val' at the given nesting level.

        Raises:
          error.Encode: if a List or Dict is part of a cycle and SHOW_CYCLES
            isn't set, or if the value isn't data and SHOW_NON_DATA isn't set
        """
        # An indent of -1 is the special value that puts everything on one
        # line.  It's like JSON.stringify(d, null, 0), except that we use
        # -1 rather than 0, since 0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid the intermediate allocation with
                #   self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt(), because a true BigInt
                # may be of arbitrary length, and will need a growth
                # strategy.  That's not very common though, so we could
                # allocate in that case.
                int_str = mops.ToStr(val.i)
                self.buf.write(int_str)

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                fl = val.f

                if math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null.
                        # Python has a bug in the encoder by default, and
                        # then allow_nan=False raises an error.
                        float_str = 'null'
                    else:
                        float_str = 'NAN'
                elif math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        float_str = 'null'  # negative infinity is null too
                    elif fl < 0:
                        float_str = '-INFINITY'
                    else:
                        float_str = 'INFINITY'
                else:
                    # TODO: can we avoid the intermediate allocation?
                    #   self.buf.WriteFloat(val.f)
                    float_str = str(fl)

                self.buf.write(float_str)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # We're still inside this List, so it contains itself
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                if node_state == FINISHED:
                    # Print it AGAIN.  The output is a JSON tree, so a node
                    # can be visited and printed MANY TIMES, as long as it's
                    # not in a cycle.
                    self._PrintList(val, level)
                    return

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # We're still inside this Dict, so it contains itself
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                if node_state == FINISHED:
                    # Print it AGAIN.  The output is a JSON tree, so a node
                    # can be visited and printed MANY TIMES, as long as it's
                    # not in a cycle.
                    self._PrintDict(val, level)
                    return

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # TODO: New format, which should be consistent with pretty
            # printing.  pp line (x) supports BashArray and BashAssoc, e.g.
            # for spec tests.
            #
            # - BashAssoc is Dict[str, str]
            #     (BashAssoc ['1']='foo' ['3']='bar')
            # - BashArray will be Dict[int, str] - SparseArray.  We should
            #   write it like
            #     (BashArray [1]='foo' [3]='bar')

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)

                # Outer object: {"type": "BashArray", "value": {...}}
                self.buf.write('{')
                self._MaybeNewline()

                self._ItemIndent(level)
                self.buf.write('"type":')
                self._MaybeSpace()
                self.buf.write('"BashArray",')
                self._MaybeNewline()

                self._ItemIndent(level)
                self.buf.write('"value":')
                self._MaybeSpace()

                # Inner object: maps each index to its string.  None entries
                # are omitted.
                self.buf.write('{')
                self._MaybeNewline()

                level += 1
                wrote_entry = False
                for index, entry in enumerate(val.strs):
                    if entry is None:
                        continue

                    if wrote_entry:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)

                    pyj8.WriteString(str(index), self.options, self.buf)
                    self.buf.write(':')
                    self._MaybeSpace()

                    pyj8.WriteString(entry, self.options, self.buf)

                    wrote_entry = True

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

                level -= 1
                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)

                self.buf.write('{')
                self._MaybeNewline()

                num_written = 0
                for k2, v2 in iteritems(val.d):
                    if num_written != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)
                    pyj8.WriteString(k2, self.options, self.buf)

                    self.buf.write(':')
                    self._MaybeSpace()

                    pyj8.WriteString(v2, self.options, self.buf)

                    num_written += 1

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    type_name = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (type_name, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
533
534
class PrettyPrinter(object):
    """Pretty printer for values.  Unused right now, but it could enhance
    the = operator.

    Output goes to a polymorphic ColorOutput.

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection, by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # First pass: object ID -> number of times it's referenced.
        #
        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict:
        #self.unique_objs = mylib.UniqueObjects()
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: it doesn't print anything yet."""

        # TODO: first convert to hnode.asdl types?
        #
        # Although we might want
        #   hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Detect the console type of stdout and call PrettyTree() with it.

        Note that 'buf' isn't used.
        """
        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
        pass
587
588
class LexerDecoder(object):
    """Lex J8/JSON text, decoding string literals along the way.

    The interface is similar to SimpleLexer, except that each token comes
    with an optional decoded string.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the text to lex
          is_j8: whether J8-only syntax is accepted, as opposed to pure JSON
          lang_str: name of the language, for error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Offset in self.s of the next token to lex
        self.pos = 0

        # The line currently being lexed -- for error messages
        self.cur_line_num = 1

        # One buffer is reused for every string, to save GC objects.  JSON
        # objects could have thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Make an error for the span from the current position to end_pos."""
        start_pos = self.pos
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Lex one token and advance self.pos past it.

        Returns:
          (token ID, end position, decoded string).  The string is None
          unless the token is Id.J8_String.

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Syntax that's valid in J8, but not in pure JSON
        if not self.is_j8:
            if (tok_id == Id.Left_BSingleQuote or
                    tok_id == Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        # Any left quote starts a string, which we decode to the end
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but with the lexer for J8 Lines.

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # UNQUOTED lines must be valid UTF-8 too.  (_DecodeString() checks
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Decode a string literal, and advance self.pos past its right quote.

        Args:
          left_id: ID of the left quote token, which selects the string lexer
          str_pos: position right after the left quote

        Returns:
          (Id.J8_String, end position, decoded string)

        Raises:
          error.Decode
        """
        while True:
            # Double quoted strings have JSON escapes; others have J8 escapes
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            #
            # Tokens that are errors
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # The right quote ends the string
            #

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # so the instance can be reused

                #log('decoded %r', result)
                return Id.J8_String, str_end, result

            #
            # Every other token contributes a part of the decoded string
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these
                # cases.  But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Strip 3 chars from the left and 1 char from the right
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Strip 2 chars from the left
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2 char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
783
784
class _Parser(object):
    """State and token helpers shared by the parsers in this module."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token: its ID, its span in self.s, and its decoded
        # value if it's a J8 string
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't space, a newline, or a
        comment."""
        while True:
            # The new token starts where the one before it ENDED.  (So before
            # the lexer runs, this isn't the start of, say, a J8_Bool token.)
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a
            # token can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Require the current token to be tok_id, then advance past it.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer, and don't skip any tokens."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Make an error that points at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
835
836
class Parser(_Parser):
    """Recursive descent parser for JSON and JSON8, producing value_t."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        # Grab the decoded string BEFORE _Eat() checks that the token is a
        # string, because _Eat() also advances past it.
        key = self.decoded
        self._Eat(Id.J8_String)
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:  # empty Dict
            self._Next()
            return value.Dict(d)

        # One pair, then more pairs as long as there's a comma.  When a key
        # is repeated, the last value is the one that's kept.
        while True:
            k, v = self._ParsePair()
            d[k] = v
            #log('  k %s v %s Id %s', k, v, Id_str(self.tok_id))

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:  # empty List
            self._Next()
            return value.List(items)

        # One value, then more values as long as there's a comma
        while True:
            items.append(self._ParseValue())

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode
        """
        tok_id = self.tok_id

        if tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if tok_id == Id.J8_LBracket:
            return self._ParseList()

        if tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char is enough to tell true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        if tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            # NOTE: _Next() already ran, so the span of this error is the
            # token AFTER the integer.
            try:
                big = mops.FromStr(int_str)
            except ValueError:
                raise self._ParseError('Integer is too big')
            return value.Int(big)

        if tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(float_str))

        # UString, BString too
        if tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on any lexing or parsing error, including input
            after the value
        """
        self._Next()
        obj = self._ParseValue()

        # The current token should start at the very end of the input
        n = len(self.s)
        if self.start_pos == n:
            return obj

        #log('n %d pos %d', n, self.start_pos)
        extra = n - self.start_pos
        raise self._ParseError('Got %d bytes of unexpected trailing input' %
                               extra)
976
977
class Nil8Parser(_Parser):
    """Parser for NIL8, producing nvalue_t.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """Return the ID of the next significant token, without consuming
            anything.

            Don't need this right now
            """
            look_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, look_pos = match.MatchJ8Token(self.s, look_pos)
                if tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                              Id.Ignored_Comment):
                    continue
                break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

          Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
            Identifier ':' value
          | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        # precondition
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:  # nothing between the parens
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            item = self._ParseNil8()
            items.append(item)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:  # nothing between the brackets
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            item = self._ParseNil8()
            items.append(item)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, and then an infix expression if an operator
        follows it.

        Raises:
          error.Decode
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char is enough to tell true from false
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            # No operator follows, so it's a plain value
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # An operator follows, so make a prefix list out of the infix
        # expression:
        #   key: "value"  ->  (: key "value")
        part = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(part)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises:
          error.Decode: on any lexing or parsing error, including input
            after the value
        """
        self._Next()
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._ParseError('Unexpected trailing input')
1160
1161
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    This is specified with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on a line are disallowed, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Log the current token with the label 's', for debugging."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line.  May append a line to 'out'.

        Raises:
          error.Decode
        """
        #self._Show('1')

        # Leading whitespace is ignored on every kind of line
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Case 1: empty line - nothing to append
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Case 2: quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            # Only the end of the line may follow the string
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Case 3: unquoted line
        if self.tok_id == Id.Lit_Chars:
            # '  unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the last token before the end, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input, and return the decoded lines.

        Raises:
          error.Decode
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1281
1282
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode the string 's' as J8 Lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1297
1298
1299# vim: sw=4