OILS / data_lang / j8.py View on Github | oilshell.org

1258 lines, 611 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
35from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37from asdl import format as fmt
38from core import error
39from data_lang import pyj8
40# dependency issue: consts.py pulls in frontend/option_def.py
41from frontend import consts
42from frontend import match
43from mycpp import mops
44from mycpp import mylib
45from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47import fastfunc
48
49_ = log
50
51from typing import cast, Dict, List, Tuple, Optional
52
53
54# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the name of val's tag, for displaying type errors in the UI.

    The name is whatever value_str() produces for the tag with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the heap object 'val'.

        This Python definition uses id(), which returns the address and can
        take up to 64 bits.  In C++ we can use the GC ID, which fits within 32
        bits.
        """
        obj_id = id(val)
        return obj_id
72
73
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for 'val', or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles.

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            pass
        else:
            result = HeapValueId(val)
    return result
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of 'val' as ' 0x<hex>', or '' when it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # -1 means the value has no heap identity
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one Unicode code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Values above 0xFFFF are always encoded with 3
            continuation bytes; there is no upper bound check here.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII: a single byte

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes carry 6 payload bits each.  They're generated from
    # the low-order end, so the list is reversed once the lead byte is added.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # The lead byte has (num_cont_bytes + 1) high 1 bits, followed by the
    # remaining payload bits.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in bytes_]
    return ''.join(chars)
140
141
# Option bits for _Print() and InstancePrinter.  Combine them with |
SHOW_CYCLES = 1 << 1  # print a marker with the object ID instead of raising
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: inf and nan are printed as null

# Hack until we fully translate: this flag must agree with pyj8's copy
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the option flags, e.g. SHOW_CYCLES | LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print 'val' with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print 'val' with the LOSSY_JSON and INF_NAN_ARE_NULL flags set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write 'val' to 'f' on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    debug_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=debug_options)

    f.write(line_buf.getvalue())
    f.write('\n')
189
190
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write 's' to 'buf', quoted unless the caller allows a bare string.

    For pp proc, etc.

    Args:
      unquoted_ok: if true, and fastfunc.CanOmitQuotes(s) agrees, 's' is
                   written as is, without quotes.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
200
201
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return 's' printed on one line as a Str value, with default options.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
212
213
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return 's' printed on one line as a Str value, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
223
224
# States of a container during the depth-first traversal in InstancePrinter
UNSEEN = 0  # not visited yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # completely printed; may be printed again
229
230
class InstancePrinter(object):
    """Serialize a tree of values as J8 or JSON text into a buffer."""

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        # Spaces per nesting level; the special value -1 means that
        # everything goes on one line.
        self.indent = indent
        # Bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON, ...
        self.options = options

        # Key is HeapValueId(val)
        # Value is a DFS state, EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything is on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything is on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...], recursing into each item."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        for index, item in enumerate(val.items):
            if index != 0:  # separator before every item but the first
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as {key: value, ...}, recursing into each value."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        num_written = 0
        for key, item in iteritems(val.d):
            if num_written != 0:  # separator before every pair but the first
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

            num_written += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write 'val' to self.buf, at nesting depth 'level'.

        Raises:
          error.Encode: for a List or Dict in an object cycle when
            SHOW_CYCLES is off, or for a value of any other type when
            SHOW_NON_DATA is off.
        """
        # An indent of -1 means everything is on one line.  It's like
        #   JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.
                int_str = mops.ToStr(val.i)
                self.buf.write(int_str)

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if ((self.options & INF_NAN_ARE_NULL) and
                    (math.isnan(fl) or math.isinf(fl))):
                    # JavaScript JSON lib behavior: Inf and NaN are null
                    # Python has a bug in the encoder by default, and then
                    # allow_nan=False raises an error
                    self.buf.write('null')
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    self.buf.write(str(fl))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # This list is still being printed further up the call
                    # stack, so we've found a cycle.
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    # node.js prints which index closes the cycle
                    raise error.Encode(
                        "Can't encode List%s in object cycle" %
                        ValueIdString(val))

                if node_state == UNSEEN:
                    self.visited[heap_id] = EXPLORING
                    self._PrintList(val, level)
                    self.visited[heap_id] = FINISHED
                else:
                    # FINISHED: print it AGAIN.  We print a JSON tree, which
                    # means we can visit and print nodes MANY TIMES, as long
                    # as they're not in a cycle.
                    self._PrintList(val, level)

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # This dict is still being printed further up the call
                    # stack, so we've found a cycle.
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    # node.js prints which key closes the cycle
                    raise error.Encode(
                        "Can't encode Dict%s in object cycle" %
                        ValueIdString(val))

                if node_state == UNSEEN:
                    self.visited[heap_id] = EXPLORING
                    self._PrintDict(val, level)
                    self.visited[heap_id] = FINISHED
                else:
                    # FINISHED: print it AGAIN, as for List above
                    self._PrintDict(val, level)

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)

                self.buf.write('[')
                self._MaybeNewline()
                for index, s in enumerate(val.strs):
                    if index != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)
                    if s is None:  # an unset element
                        self.buf.write('null')
                    else:
                        pyj8.WriteString(s, self.options, self.buf)

                self._MaybeNewline()

                self._BracketIndent(level)
                self.buf.write(']')

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)

                self.buf.write('{')
                self._MaybeNewline()
                num_written = 0
                for k2, v2 in iteritems(val.d):
                    if num_written != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)

                    pyj8.WriteString(k2, self.options, self.buf)

                    self.buf.write(':')
                    self._MaybeSpace()

                    pyj8.WriteString(v2, self.options, self.buf)

                    num_written += 1

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    type_name = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (type_name, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
492
493
class PrettyPrinter(object):
    """Placeholder for a pretty printer of values.

    Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName start: 0 length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3) - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # For a first pass: object ID -> number of times it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; currently does nothing."""
        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput detected for stdout.

        Note that 'buf' isn't used yet.
        """
        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
        pass
546
547
class LexerDecoder(object):
    """Lexer for J8/JSON text that also decodes string literals.

    The interface is similar to SimpleLexer, except that each token comes with
    an optional decoded string.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # language name, used in error messages

        self.pos = 0

        # Line currently being lexed -- for error messages
        self.cur_line_num = 1

        # This one buffer is reused for every string, to save GC objects.
        # JSON objects could have thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create an error.Decode that spans self.pos to end_pos."""
        start_pos = self.pos  # the current position is the start
        return error.Decode(msg, self.s, start_pos, end_pos,
                            self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Return the next token, and update self.pos.

        Returns:
          (token ID, end position, decoded string).  The string is None unless
          the token is a string literal.

        Raises:
          error.Decode: for J8-only syntax when lexing JSON, or for a
            malformed string literal.
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            # Syntax that only J8 accepts is an error in JSON
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Left_JDoubleQuote:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines.

        Raises:
          error.Decode: for a malformed string literal, for unquoted text
            that isn't valid UTF-8, or for an ASCII control char.
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Lit_Chars:
            # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString()
            # does this for quoted strings.)
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)

        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Decode a string literal, and update self.pos.

        Args:
          left_id: the opening quote token, which selects the string lexer
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, end position, decoded string)

        Raises:
          error.Decode: for EOF inside the string, a bad backslash escape, an
            unescaped control char, invalid UTF-8, or an illegal code point.
        """
        json_style = left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote)

        while True:
            if json_style:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            #
            # Tokens that are errors
            #

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            #
            # The closing quote ends the loop
            #

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                result = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', result)
                return Id.J8_String, str_end, result

            #
            # Each remaining kind of token contributes a decoded 'part'
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)
                part = self.s[str_pos:str_end]

                # TODO: would be nice to avoid allocation in all these cases.
                # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char after the backslash selects the escape
                escape_ch = self.s[str_pos + 1]
                part = consts.LookupCharC(escape_ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # The hex digits of each half, skipping both \u prefixes
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
742
743
class _Parser(object):
    """State and helpers shared by the parsers in this file.

    It holds the current token: its ID, its span in the input string, and its
    decoded string, if any.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline, or comment."""
        while True:
            # The previous token's end is this token's start.  It's updated
            # on every iteration, so ignored tokens aren't part of the span.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break
            # TODO: The position of the last newline and a token can be used
            # to calculate a column number for error messages.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is 'tok_id', then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped here.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create an error.Decode that spans the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
794
795
class Parser(_Parser):
    """JSON and JSON8 Parser.

    This is a recursive descent parser that produces value_t objects.  Every
    _Parse* method expects the current token to be the first token of its
    construct, and leaves the parser on the first token after it.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        # One pair, then more pairs as long as there's a comma
        while True:
            k, v = self._ParsePair()
            d[k] = v  # a repeated key overwrites the earlier value
            #log('  k %s v %s Id %s', k, v, Id_str(self.tok_id))
            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        # One value, then more values as long as there's a comma
        while True:
            items.append(self._ParseValue())
            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: at EOF, on an invalid token, or on an integer that
            mops.FromStr() rejects.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        if self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char distinguishes 'true' from 'false'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        if self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing.  _ParseError() uses the span of the
            # current token, so the error should be raised while that is
            # still the integer, not the token after it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        if self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        if self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Anything else, e.g. Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input string as one value.

        Raises:
          error.Decode: on a syntax error, or if there's input after the
            value.
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token must start at the end of input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
935
936
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces nvalue_t objects.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse a parenthesized sequence into an nvalue.List.

        Yaks
          (self->Next)           =>  (-> self Next)
          (self->Next obj.field) =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list.  (For now, () is accepted
          and gives an empty list.)
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # There are no separators: read values until the closing paren.  At
        # EOF, _ParseNil8() raises.
        while self.tok_id != Id.J8_RParen:
            child = self._ParseNil8()
            items.append(child)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        #log('TOK %s', Id_str(self.tok_id))
        # Read values until the closing bracket.  At EOF, _ParseNil8() raises.
        while self.tok_id != Id.J8_RBracket:
            child = self._ParseNil8()
            items.append(child)
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, which may be the left side of an infix.

        If an operator, colon, or comma follows the value, the result is the
        3-item list (op value operand2), where operand2 is parsed recursively.

        Raises:
          error.Decode: at EOF, or on an invalid token
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char distinguishes 'true' from 'false'
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(int_str))

        elif self.tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(float_str))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            word = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(word)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # any other token, e.g. Id.Unknown_Tok or a closing bracket
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #print("--> OBJ %d %s" % (id(obj), obj))
            return obj

        #log('AT %s', Id_str(self.tok_id))

        # key: "value" -> (: key "value")
        op_str = self.s[self.start_pos:self.end_pos]
        op = nvalue.Symbol(op_str)

        self._Next()
        operand2 = self._ParseNil8()
        infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
        #print("--> INFIX %d %s" % (id(infix), infix))
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input string as one NIL8 value.

        Raises:
          error.Decode: on a syntax error, or if there's input after the
            value.
        """
        self._Next()
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._ParseError('Unexpected trailing input')
1119
1120
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)  # is_j8 is always true here

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token, with the label 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line, through its end token.

        May append a line to 'out'.  Empty lines don't append anything.

        Raises:
          error.Decode: if there's text after a quoted string
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                self._NextForLines()  # past newline
                return

            raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                   Id_str(self.tok_id))

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # '  unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the token before the end, for stripping whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            # Remove trailing whitespace, if any: the line then ends where
            # the last WS_Space token started.
            string_end = (prev_start
                          if prev_id == Id.WS_Space else self.start_pos)

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input string, and return the decoded lines.

        Raises:
          error.Decode
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # The loop above only exits at Eol_Tok, so this check can't fire
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1240
1241
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines, decoding any quoted ones.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1256
1257
1258# vim: sw=4