OILS / data_lang / j8.py View on Github | oilshell.org

1244 lines, 600 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5TODO:
6
7- Many more tests
8 - Run JSONTestSuite
9
10Later:
11
12- PrettyPrinter uses hnode.asdl?
13 - color
14 - line wrapping -- do this later
15 - would like CONTRIBUTORS here
16
17- Unify with ASDL pretty printing - NIL8
18 - {} [] are identical
19 - () is for statically typed ASDL data
20 (command.Simple blame_tok:(...) words:[ ])
21 although we are also using [] for typed ASDL arrays, not just JSON
22 - object IDs
23 - @ x123 can create an ID
24 - ! x123 can reference an ID
25 - <> can be for non-J8 data types? For the = operator
26 - 'hi \(name)' interpolation is useful for code
27
28- Common between JSON8 and NIL8 - for writing by hand
29 - comments - # line or // line (JSON5 uses // line, following JS)
30 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
31 - commas
32 - JSON8 could have trailing commas rule
33 - NIL8 at least has no commas for [1 2 "hi"]
34"""
35
36from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40from asdl import format as fmt
41from core import error
42from data_lang import pyj8
43# dependency issue: consts.py pulls in frontend/option_def.py
44from frontend import consts
45from frontend import match
46from mycpp import mops
47from mycpp import mylib
48from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50import fastfunc
51
52_ = log
53
54from typing import cast, Dict, List, Tuple, Optional
55
56
# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is looked up from the value's tag with value_str(), passing
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying a value object.

        This Python definition uses id(), which returns the object's address
        and can be up to 64 bits wide.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        object_id = id(val)
        return object_id
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles.

    Null, Bool, Int, Float and Str get -1: they are immutable values that are
    copied and compared by value, so they don't have this notion of identity.
    Everything else gets HeapValueId(val).
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively: once we add small string
            # optimization, some strings will be values, so we assume all
            # of them are.
            return -1
        else:
            return HeapValueId(val)
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value for display; used by pp value (42) and = 42.

    Returns ' 0x<lowercase hex>' (note the leading space) for values that have
    an ID, and the empty string for values where ValueId() is -1.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Code points up to 0x7F become one byte; larger ones become a lead byte
    followed by 1, 2 or 3 continuation bytes.  There is no range check here:
    anything above 0xFFFF is encoded with 3 continuation bytes.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced from the low 6 bits upward, so the list
    # is built in reverse order and flipped at the end.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: a run of 1 bits marking the sequence length, then whatever
    # bits of the code point are left over.
    lead_marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    bytes_.append(lead_marker | (code & payload_mask))
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    tmp = [chr(b & 0xFF) for b in bytes_]
    return ''.join(tmp)
143
144
# Option bits for printing; they are combined with | and tested with &.
SHOW_CYCLES = 1 << 1  # show as [...] or {...} I think, with object ID
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy

# Hack until we fully translate: pyj8 carries its own copy of this bit, and
# the two must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write the encoding of 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: output buffer
      indent: number of spaces to indent, or -1 for everything on one line
      options: option bits passed through to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with default options; for json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with the LOSSY_JSON option; for json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    _Print(val, buf, indent, options=LOSSY_JSON)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write a value to 'f' on a single line, then a newline; for pp line (x)

    Cycles and non-data objects are shown rather than rejected, so
    error.Encode should be impossible here.
    """
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=SHOW_CYCLES | SHOW_NON_DATA)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to 'buf', encoded on one line; for pp proc, etc.

    If unquoted_ok is set and fastfunc.CanOmitQuotes(s) agrees, the string is
    written as-is.  Otherwise it goes through the regular printer.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the one-line encoding of a string with default options.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the one-line encoding of a string with the LOSSY_JSON option.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
# DFS traversal state of a List or Dict while printing
UNSEEN = 0  # not reached yet; the default for objects with no recorded state
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # printed completely; it may be printed again
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    An instance holds the output buffer, the indentation setting, the option
    bits, and the cycle detection state used while walking the tree.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: buffer that receives the encoded text
          indent: number of spaces per nesting level, or -1 to put everything
                  on one line
          options: option bits, e.g. SHOW_CYCLES | SHOW_NON_DATA
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  (Objects that were
        # never reached have no entry, which reads as UNSEEN.)
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [ item, ... ], printing each item recursively."""
        items = val.items
        if len(items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, item in enumerate(items):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as { key: value, ... }, printing values recursively."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        num_written = 0
        for k, v in iteritems(val.d):
            if num_written != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

            num_written += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a list of strings; None entries become null.

        Unlike _PrintList(), there is no special case for an empty array.
        """
        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as an object whose keys and values are strings.

        Unlike _PrintDict(), there is no special case for an empty mapping.
        """
        self.buf.write('{')
        self._MaybeNewline()
        num_written = 0
        for k, v in iteritems(val.d):
            if num_written != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v, self.options, self.buf)

            num_written += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write the encoding of 'val' to self.buf.

        Args:
          val: the value to encode
          level: current nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict that is part of a cycle when
            SHOW_CYCLES is off, or for a value of any other unhandled type
            when SHOW_NON_DATA is off.
        """
        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                state = self.visited.get(heap_id, UNSEEN)
                if state == EXPLORING:
                    # We are still inside this very list: it's a cycle.
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                if state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we
                    # can visit and print nodes MANY TIMES, as long as
                    # they're not in a cycle.
                    self._PrintList(val, level)
                    return

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                state = self.visited.get(heap_id, UNSEEN)
                if state == EXPLORING:
                    # We are still inside this very dict: it's a cycle.
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                if state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we
                    # can visit and print nodes MANY TIMES, as long as
                    # they're not in a cycle.
                    self._PrintDict(val, level)
                    return

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing in this class reads it yet
        """
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a console output that wraps stdout.

        The 'buf' argument is currently not used.
        """
        # Or print to stderr?
        console = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console)

        # Then print those with ASDL
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except that each token comes back with
    an optional decoded string, which is set for string tokens.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the text to lex
          is_j8: when False, Next() rejects single-quoted strings, comments
                 and the j"" prefix
          lang_str: language name that is interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Build (but don't raise) an error.Decode ending at end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        The result is (token ID, end position, decoded string).  The decoded
        string is None unless the token is a string.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Plain JSON rejects the J8-only syntax up front.
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token; it selects the string lexer
          str_pos: position just after the opening quote

        Decoded parts accumulate in self.decoded until the closing quote,
        then the result is returned as (Id.J8_String, end position, string).
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # Tokens that end the string with an error
            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: hand back what was accumulated
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', result)
                return Id.J8_String, str_end, result

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The character after the backslash
                escaped_ch = self.s[str_pos + 1]
                part = consts.LookupCharC(escaped_ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between the braces
                hex_digits = self.s[str_pos + 3:str_end - 1]
                code_point = int(hex_digits, 16)

                # Same checks in osh/word_compile.py
                if code_point > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= code_point and code_point < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % hex_digits, str_end)

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_YHex:  # J8 only
                hex_digits = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" %
                        hex_digits, str_end)

                byte_val = int(hex_digits, 16)
                part = chr(byte_val)

            elif tok_id == Id.Char_SurrogatePair:
                # Two 4-digit hex escapes back to back
                high_hex = self.s[str_pos + 2:str_pos + 6]
                low_hex = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                high = int(high_hex, 16) - 0xD800  # high surrogate
                low = int(low_hex, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (high << 10) + low

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                hex_digits = self.s[str_pos + 2:str_end]
                code_point = int(hex_digits, 16)
                part = Utf8Encode(code_point)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
733
734
class _Parser(object):
    """Token-level state shared by the parsers in this file.

    It owns a LexerDecoder and keeps track of the current token: its ID, its
    start and end positions, and its decoded string value, if any.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""
        while True:
            # The end of the previous token (ignored or not) is where the
            # next one starts.  So after the loop, start_pos is the start of
            # the current token, not of the whitespace before it.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # Note: the lexer counts lines on Ignored_Newline.  TODO: the
            # position of the last newline and a token can be used to
            # calculate a column number for error messages.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Build (but don't raise) an error.Decode at the current token."""
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            self.lexer.cur_line_num)
785
786
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the tokens produced by _Parser._Next().
    The entry point is ParseValue().
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value

        Returns the decoded key and the parsed value.
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        A key that appears more than once keeps the value of its last pair.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token.

        On success, the current token is the one after the value.

        Raises:
          error.Decode: on EOF, on a token that can't start a value, or on an
            integer that mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first character tells the two literals apart
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing: _ParseError() reads the current
            # token's position and line number, so the error must be created
            # while the integer is still the current token.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when anything other than ignored
        tokens follows the value.
        """
        self._Next()
        obj = self._ParseValue()
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
921
922
class Nil8Parser(_Parser):
    """Parser for NIL8, producing nvalue_t trees.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
           ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list

        What this method does right now: it parses values until the closing
        paren and returns them as nvalue.List.  () gives an empty list.
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        elems = []  # type: List[nvalue_t]

        self._Next()

        # For (), the loop body never runs
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            elems.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(elems)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elems = []  # type: List[nvalue_t]

        self._Next()

        # For [], the loop body never runs
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            elems.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elems)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an optional infix operator after it.

        If the token after the value is an operator, colon or comma, then
        another value is parsed and the result is the prefix form:

            key: "value"  ->  (: key "value")

        Raises:
          error.Decode: on EOF or on a token that can't start a value
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first character tells the two literals apart
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_text))

        elif self.tok_id == Id.J8_Float:
            float_text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_text))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_text = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_text)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises error.Decode, including when anything other than ignored
        tokens follows the value.
        """
        self._Next()
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input')
        return obj
1105
1106
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token with the label 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Consumes exactly one line, including its end token.  Empty lines
        append nothing.

        Raises:
          error.Decode: if something other than whitespace follows a quoted
            string on the same line
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Anything else must be the start of an unquoted line
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # ' unquoted "" text on line  '  # read every token until end
        line_start = self.start_pos
        while True:
            # Remember the last token before the end, for stripping
            # whitespace
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        if last_id == Id.WS_Space:
            line_end = last_start  # remove trailing whitespace
        else:
            line_end = self.start_pos

        out.append(self.s[line_start:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Parse all lines and return the non-empty ones, decoded.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1226
1227
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines; used by @(echo split command sub)

    This is a thin wrapper around J8LinesParser(s).Parse().

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1242
1243
1244# vim: sw=4