OILS / data_lang / j8.py View on Github | oilshell.org

1249 lines, 603 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5TODO:
6
7- Many more tests
8 - Run JSONTestSuite
9
10Later:
11
12- PrettyPrinter uses hnode.asdl?
13 - color
14 - line wrapping -- do this later
15 - would like CONTRIBUTORS here
16
17- Unify with ASDL pretty printing - NIL8
18 - {} [] are identical
19 - () is for statically typed ASDL data
20 (command.Simple blame_tok:(...) words:[ ])
21 although we are also using [] for typed ASDL arrays, not just JSON
22 - object IDs
23 - @ x123 can create an ID
24 - ! x123 can reference an ID
25 - <> can be for non-J8 data types? For the = operator
26 - 'hi \(name)' interpolation is useful for code
27
28- Common between JSON8 and NIL8 - for writing by hand
29 - comments - # line or // line (JSON5 uses // line, following JS)
30 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
31 - commas
32 - JSON8 could have trailing commas rule
33 - NIL8 at least has no commas for [1 2 "hi"]
34"""
35
36from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40from asdl import format as fmt
41from core import error
42from data_lang import pyj8
43# dependency issue: consts.py pulls in frontend/option_def.py
44from frontend import consts
45from frontend import match
46from mycpp import mops
47from mycpp import mylib
48from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50import fastfunc
51
52_ = log
53
54from typing import cast, Dict, List, Tuple, Optional
55
56
57# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying a value object.

        This Python definition uses id(), which returns the object's
        address and can need up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Detecting object cycles.

    Null, Bool, Int, Float and Str have no such notion: they're immutable
    values that are copied and compared by value, so they get -1.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # of them are.
            heap_id = -1
        else:
            heap_id = HeapValueId(val)
    return heap_id
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID for display; used by pp value (42) and = 42.

    Returns:
      ' 0x<hex>' (with a leading space) when ValueId() gives an ID, or the
      empty string when it gives -1.
    """
    heap_id = ValueId(val)  # could be -1
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Return the UTF-8 encoded bytes for a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Values <= 0x7F produce one byte, <= 0x7FF two
            bytes, <= 0xFFFF three bytes, and anything larger four bytes.

    What about the check code <= 0x10FFFF ?  It is NOT done here:
    - it happens in statically parsed $'' u''
    - but not in dynamically parsed echo -e / printf, following bash/zsh
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Continuation bytes are produced lowest-order first, and the whole
    # sequence is reversed at the end.
    encoded = []  # type: List[int]
    remaining = code
    for _unused in xrange(num_cont_bytes):
        encoded.append(0x80 | (remaining & 0x3F))
        remaining >>= 6

    # Leading byte: marker bits on top, the remaining payload bits below
    lead_marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    encoded.append(lead_marker | (remaining & payload_mask))
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
143
144
# Option bit flags for InstancePrinter.  They are tested with & and can be
# combined with |.

# Print a List/Dict that closes a cycle as [ --> 0x... ] or { --> 0x... }
# instead of raising error.Encode
SHOW_CYCLES = 1 << 1
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: the buffer that receives the output
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags passed through to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val to buf with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val to buf with the LOSSY_JSON option set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    _Print(val, buf, indent, LOSSY_JSON)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    flags = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=flags)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as an encoded string.  For pp proc, etc.

    Args:
      s: the string to encode
      buf: the buffer that receives the output
      unquoted_ok: if True and fastfunc.CanOmitQuotes(s) holds, s is written
                   as-is, without quotes
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s printed as a string value on one line, with no options set.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s printed as a string value on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default when a container has no entry in 'visited'
EXPLORING = 1  # the container is currently being printed
FINISHED = 2  # the container was printed completely
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to a BufWriter.  List and Dict values are tracked by
    heap ID so that object cycles can be detected; depending on the options,
    a cycle is either shown in place or reported with error.Encode.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: the buffer that receives the output
          indent: number of spaces per nesting level, or -1 to print
                  everything on one line
          options: bit flags; SHOW_CYCLES and SHOW_NON_DATA are tested here,
                   and the whole value is passed on to pyj8.WriteString()
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is a DFS state: EXPLORING or FINISHED.  A missing key is
        # read as UNSEEN.
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indent for an item inside a container at 'level'.

        Items are indented one step deeper than the container's brackets.
        Does nothing in one-line mode.
        """

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indent for a closing bracket of a container at 'level'.

        Does nothing in one-line mode.
        """

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless we're in one-line mode."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless we're in one-line mode."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item,item,...], recursing into each item.

        Cycle bookkeeping is done by the caller, Print().
        """

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                # The comma goes BEFORE every item except the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as {key:value,...}, recursing into each value.

        Keys are written with pyj8.WriteString().  Cycle bookkeeping is done
        by the caller, Print().
        """

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                # The comma goes BEFORE every pair except the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print a value, dispatching on its type.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List/Dict in an object cycle when SHOW_CYCLES
                        isn't set, or for a value of any other type when
                        SHOW_NON_DATA isn't set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    # We reached a List that is still being printed further
                    # up the call stack, i.e. a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    # We reached a Dict that is still being printed further
                    # up the call stack, i.e. a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)

                # NOTE: unlike _PrintList(), there's no special case for an
                # empty array here
                self.buf.write('[')
                self._MaybeNewline()
                for i, s in enumerate(val.strs):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)
                    if s is None:
                        self.buf.write('null')
                    else:
                        pyj8.WriteString(s, self.options, self.buf)

                self._MaybeNewline()

                self._BracketIndent(level)
                self.buf.write(']')

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)

                # NOTE: unlike _PrintDict(), there's no special case for an
                # empty assoc array here
                self.buf.write('{')
                self._MaybeNewline()
                i = 0
                for k2, v2 in iteritems(val.d):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)

                    pyj8.WriteString(k2, self.options, self.buf)

                    self.buf.write(':')
                    self._MaybeSpace()

                    pyj8.WriteString(v2, self.options, self.buf)

                    i += 1

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """Placeholder pretty printer; could enhance the = operator.

    Unused right now.  Output goes to a polymorphic ColorOutput.

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; not read anywhere in this class
                   yet
        """
        # first pass of object ID -> number of times referenced
        #
        # This could be an optimized set or a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput detected for stdout.

        The 'buf' argument isn't used.
        """

        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Each Next*() call returns a tuple (token ID, end position, decoded
    string).  The third item is None, except for Id.J8_String tokens.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the string to lex
          is_j8: if False, Next() rejects single-quoted strings, comments,
                 and the j"" prefix
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Quoted strings are decoded and returned as Id.J8_String.

        Raises:
          error.Decode: for J8-only syntax when is_j8 is False, or for an
                        error inside a quoted string
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Uses match.MatchJ8LinesToken(), and doesn't consult is_j8.

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char on an
                        unquoted line, or for an error inside a quoted string
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the token ID of the opening quote, which selects the
                   string lexer and whether \\y escapes are allowed
          str_pos: position just after the opening quote token

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: for EOF inside the string, a bad backslash escape, an
                        unescaped ASCII control char, invalid UTF-8, or an
                        illegal code point
        """

        # Each iteration lexes one token inside the string, and appends its
        # decoded form to self.decoded.  The loop ends at the closing quote,
        # or with an error.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # The hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # The hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # The 4 hex digits of each half of the pair
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # The hex digits after \u.  There's no surrogate range check
                # here, unlike the Char_UBraced case.
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
733
734
735class _Parser(object):
736
737 def __init__(self, s, is_j8):
738 # type: (str, bool) -> None
739 self.s = s
740 self.is_j8 = is_j8
741 self.lang_str = "J8" if is_j8 else "JSON"
742
743 self.lexer = LexerDecoder(s, is_j8, self.lang_str)
744 self.tok_id = Id.Undefined_Tok
745 self.start_pos = 0
746 self.end_pos = 0
747 self.decoded = '' # decoded J8 string
748
749 def _Next(self):
750 # type: () -> None
751
752 # This isn't the start of a J8_Bool token, it's the END of the token before it
753 while True:
754 self.start_pos = self.end_pos
755 self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
756 if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
757 Id.Ignored_Comment):
758 break
759 # TODO: add Ignored_Newline to count lines, and show line numbers
760 # in errors messages. The position of the last newline and a token
761 # can be used to calculate a column number.
762
763 #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')
764
765 def _Eat(self, tok_id):
766 # type: (Id_t) -> None
767
768 if self.tok_id != tok_id:
769 #log('position %r %d-%d %r', self.s, self.start_pos,
770 # self.end_pos, self.s[self.start_pos:self.end_pos])
771 raise self._ParseError("Expected %s, got %s" %
772 (Id_str(tok_id), Id_str(self.tok_id)))
773 self._Next()
774
775 def _NextForLines(self):
776 # type: () -> None
777 """Like _Next, but use the J8 Lines lexer."""
778 self.start_pos = self.end_pos
779 self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()
780
781 def _ParseError(self, msg):
782 # type: (str) -> error.Decode
783 return error.Decode(msg, self.s, self.start_pos, self.end_pos,
784 self.lexer.cur_line_num)
785
786
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that produces value_t instances.  The public
    entry point is ParseValue().
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one key-value pair of a Dict.

        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key appears more than once, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v

        self._Eat(Id.J8_RBrace)

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        On return, the current token is the one after the value.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
                        mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            # The token is either true or false, so the first char decides
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that the error span from
            # _ParseError() is the integer itself, not the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input string as a single value.

        Raises:
          error.Decode: on a lexing or parsing error, or if input remains
                        after the value
        """
        self._Next()
        obj = self._ParseValue()

        # start_pos is where the token after the value begins.  It must be
        # the end of the input.
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
926
927
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces nvalue_t instances.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse a parenthesized record into an nvalue.List.

        As implemented here:

          record = '(' value* ')'

        Design notes:

        Yaks
          (self->Next) => (-> self Next)
          (self->Next obj.field) => ((-> self Next) (. obj field))

          Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        fields = []  # type: List[nvalue_t]

        self._Next()
        # For (), the loop body never runs, and _Eat() consumes the ')'
        while self.tok_id != Id.J8_RParen:
            fields.append(self._ParseNil8())

        self._Eat(Id.J8_RParen)

        return nvalue.List(fields)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """Parse a bracketed list into an nvalue.List.

        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elems = []  # type: List[nvalue_t]

        self._Next()
        # For [], the loop body never runs, and _Eat() consumes the ']'
        while self.tok_id != Id.J8_RBracket:
            elems.append(self._ParseNil8())

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elems)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an optional infix operator after it.

        If the value is followed by an operator, colon or comma token, the
        right operand is parsed recursively, and the result is the list
        (op left right).

        Raises:
          error.Decode: on EOF or an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The token is either true or false, so the first char decides
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_str))

        elif self.tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_str))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        # Not followed by an infix token: the value stands alone
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            return obj

        # key: "value" -> (: key "value")
        op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input string as a single NIL8 value.

        Raises:
          error.Decode: on a lexing or parsing error, or if a token remains
                        after the value
        """
        self._Next()
        obj = self._ParseNil8()
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._ParseError('Unexpected trailing input')
1110
1111
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

    "json" "json2"
    "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token and its span, labeled by s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Consumes one line, including its end token.  Empty lines append
        nothing.

        Raises:
          error.Decode: if a quoted string is followed by anything other than
                        optional whitespace and the end of the line
        """
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # for stripping whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            # prev_id is the last token before the end token
            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input, and return the decoded lines.

        Raises:
          error.Decode: on a lexing or parsing error
        """
        self._NextForLines()

        lines = []  # type: List[str]
        # The loop exits only when the current token is Eol_Tok, so no check
        # for trailing input is needed afterward.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1231
1232
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.  Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    lines_parser = J8LinesParser(s)
    lines = lines_parser.Parse()
    return lines
1247
1248
1249# vim: sw=4