OILS / ysh / expr_to_ast.py View on Github | oilshell.org

1708 lines, 1028 significant
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 SimpleVarSub,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46 CharCode,
47 CharRange,
48)
49from _devbuild.gen.value_asdl import value, value_t
50from _devbuild.gen import grammar_nt
51from core.error import p_die
52from data_lang import j8
53from frontend import consts
54from frontend import lexer
55from frontend import location
56from mycpp import mops
57from mycpp import mylib
58from mycpp.mylib import log, tagswitch
59from osh import word_compile
60from ysh import expr_parse
61from ysh import regex_translate
62
63from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
64if TYPE_CHECKING:
65 from pgen2.grammar import Grammar
66 from pgen2.pnode import PNode
67
68_ = log
69
# Perl-style character class names.  Each key is an accepted spelling and
# each value is the single-letter class it stands for; note that 'word' is
# a long alias for 'w'.
# NOTE(review): presumably consulted by the regex transformer later in this
# file -- confirm against the callers.
PERL_CLASSES = {
    'd': 'd',
    'w': 'w',
    'word': 'w',
    's': 's',
}
# POSIX bracket-expression class names.
# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
POSIX_CLASSES = [
    'alnum',
    'cntrl',
    'lower',
    'space',
    'alpha',
    'digit',
    'print',
    'upper',
    'blank',
    'graph',
    'punct',
    'xdigit',
]
# NOTE: There are also things like \p{Greek} that we could put in the
# "non-sigil" namespace.

# Shared error message text, kept as constants so every site that reports
# the problem uses the same wording.
RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"

POS_ARG_MISPLACED = "Positional arg can't appear in group of named args"

# Copied from pgen2/token.py to avoid dependency.
# Grammar nonterminal numbers are expected to be >= this value (see the
# assertion in MakeGrammarNames).
NT_OFFSET = 256
100
if mylib.PYTHON:

    def MakeGrammarNames(ysh_grammar):
        # type: (Grammar) -> Dict[int, str]
        """Build a number -> name table for parse tree node types.

        Token IDs below NT_OFFSET are named from the lexer's ID spec, and
        every nonterminal number in the grammar is named from its symbol.

        Args:
            ysh_grammar: grammar whose number2symbol map names nonterminals.

        Returns:
            Dict from node type number to a printable name.
        """
        # TODO: Break this dependency
        from frontend import lexer_def

        # Hm some are out of range
        # TODO: Some tokens have values greater than NT_OFFSET; they are
        # skipped here.
        names = {
            tok_num: tok_name
            for tok_name, tok_num in lexer_def.ID_SPEC.id_str2int.items()
            if tok_num < NT_OFFSET
        }

        for nt_num, nt_name in ysh_grammar.number2symbol.items():
            assert nt_num >= NT_OFFSET, (nt_num, nt_name)
            names[nt_num] = nt_name

        return names
124
125
126class Transformer(object):
127 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
128
129 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
130 so this "transformer" is the equivalent.
131
132 Files to refer to when modifying this function:
133
134 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
135 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
136
137 Related examples:
138
139 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
140 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
141
142 Other:
143 frontend/parse_lib.py (turn on print_parse_tree)
144
145 Public methods:
146 Expr, VarDecl
147 atom, trailer, etc. are private, named after productions in grammar.pgen2.
148 """
149
def __init__(self, gr):
    # type: (Grammar) -> None
    """Keep the grammar's symbol table; in Python builds also set up a
    printer for raw parse tree nodes (a debugging aid)."""
    self.number2symbol = gr.number2symbol
    if mylib.PYTHON:
        # print raw nodes
        grammar_names = MakeGrammarNames(gr)
        self.p_printer = expr_parse.ParseTreePrinter(grammar_names)
157
def _LeftAssoc(self, p_node):
    # type: (PNode) -> expr_t
    """Fold a flat `operand (op operand)*` node into nested expr.Binary.

    Used for productions such as:
      xor_expr: and_expr ('xor' and_expr)*
      term: factor (('*'|'/'|'%'|'div') factor)*

    The fold runs left to right, so 3 - 1 - 2 is grouped as ((3 - 1) - 2).
    """
    # Note: Compare the iterative com_binary() method in
    # opy/compiler2/transformer.py.

    # Children alternate operand, operator, operand, ...
    # - The PNode for '3 - 1' will have 3 children
    # - The PNode for '3 - 1 - 2' will have 5 children

    #self.p_printer.Print(p_node)

    num_children = p_node.NumChildren()
    result = self.Expr(p_node.GetChild(0))

    # Operators sit at the odd indices.
    for op_index in xrange(1, num_children, 2):
        op_node = p_node.GetChild(op_index)
        operand = self.Expr(p_node.GetChild(op_index + 1))

        # The tree built so far becomes the left side of the new node.
        result = expr.Binary(op_node.tok, result, operand)

    return result
190
def _Trailer(self, base, p_trailer):
    # type: (expr_t, PNode) -> expr_t
    """Apply one trailer (call, subscript or attribute access) to `base`.

    trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
             | '.' NAME | '->' NAME | '::' NAME
             )

    Raises AssertionError when the first child is not a recognized opener.
    """
    opener = p_trailer.GetChild(0)
    open_tok = opener.tok
    open_id = opener.typ

    if open_id == Id.Op_LParen:
        close_tok = p_trailer.GetChild(-1).tok
        call_args = ArgList(open_tok, [], None, [], None, None, close_tok)
        if p_trailer.NumChildren() != 2:  # anything other than ()
            p_arglist = p_trailer.GetChild(1)  # the X in ( X )
            assert p_arglist.typ == grammar_nt.arglist  # f(x, y)
            self._ArgList(p_arglist, call_args)
        return expr.FuncCall(base, call_args)

    if open_id == Id.Op_LBracket:
        p_subscripts = p_trailer.GetChild(1)
        assert p_subscripts.typ == grammar_nt.subscriptlist
        if p_subscripts.NumChildren() > 1:
            # Blame the child right after the first subscript.
            p_die("Only 1 subscript is accepted",
                  p_subscripts.GetChild(1).tok)

        index = self._Subscript(p_subscripts.GetChild(0))
        return Subscript(open_tok, base, index)

    if open_id in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
        name_tok = p_trailer.GetChild(1).tok  # will be Id.Expr_Name
        return Attribute(base, open_tok, name_tok,
                         lexer.TokenVal(name_tok), expr_context_e.Store)

    raise AssertionError(open_id)
229
def _DictPair(self, p_node):
    # type: (PNode) -> Tuple[expr_t, expr_t]
    """Transform one dict entry into a (key, value) pair of expressions.

    dict_pair: ( Expr_Name [':' test]
               | '[' testlist ']' ':' test
               | sq_string ':' test
               | dq_string ':' test )

    A bare name with no ': value' part gets expr.Implicit as its value.

    Raises:
        AssertionError: if the node is not a dict_pair, or its first child
            matches none of the alternatives above.
    """
    assert p_node.typ == grammar_nt.dict_pair

    typ = p_node.GetChild(0).typ

    if typ in (grammar_nt.sq_string, grammar_nt.dq_string):
        key = self.Expr(p_node.GetChild(0))  # type: expr_t
        val = self.Expr(p_node.GetChild(2))
        return key, val

    tok0 = p_node.GetChild(0).tok
    id_ = tok0.id

    if id_ == Id.Expr_Name:
        # The name itself is the (string) key.
        key_str = value.Str(lexer.TokenVal(tok0))
        key = expr.Const(tok0, key_str)
        if p_node.NumChildren() >= 3:
            val = self.Expr(p_node.GetChild(2))
        else:
            val = expr.Implicit
        return key, val

    if id_ == Id.Op_LBracket:  # {[x+y]: 'val'}
        key = self.Expr(p_node.GetChild(1))
        val = self.Expr(p_node.GetChild(4))
        return key, val

    # Previously this fell through to `return key, val` with both names
    # unbound; fail with an explicit, descriptive error instead.
    raise AssertionError(Id_str(id_))
264
def _Dict(self, parent, p_node):
    # type: (PNode, PNode) -> expr.Dict
    """Transform a dict literal body into expr.Dict.

    dict: dict_pair (comma_newline dict_pair)* [comma_newline]

    `parent` supplies the token stored on the result; `p_node` is either
    the 'dict' node or the closing brace of an empty literal.
    """
    if p_node.typ == Id.Op_RBrace:  # {}
        return expr.Dict(parent.tok, [], [])

    assert p_node.typ == grammar_nt.dict

    keys = []  # type: List[expr_t]
    values = []  # type: List[expr_t]

    num_children = p_node.NumChildren()
    idx = 0
    while idx < num_children:
        pair_key, pair_val = self._DictPair(p_node.GetChild(idx))
        keys.append(pair_key)
        values.append(pair_val)
        idx += 2  # step over the comma_newline separator

    return expr.Dict(parent.tok, keys, values)
285
def _Tuple(self, parent):
    # type: (PNode) -> expr_t
    """Transform a comma-separated run of expressions.

    A single expression is returned unwrapped, an expression followed only
    by a comma is rejected with p_die, and anything longer becomes an
    expr.Tuple.
    """
    num_children = parent.NumChildren()

    if num_children == 1:  # (x) -- not a tuple
        return self.Expr(parent.GetChild(0))

    if num_children == 2:  # x, and (x,) aren't allowed
        p_die('Invalid trailing comma', parent.GetChild(1).tok)

    items = []  # type: List[expr_t]
    idx = 0
    while idx < num_children:
        items.append(self.Expr(parent.GetChild(idx)))
        idx += 2  # skip commas

    # unused expr_context_e
    return expr.Tuple(parent.tok, items, expr_context_e.Store)
306
def _TestlistComp(self, parent, p_node, id0):
    # type: (PNode, PNode, Id_t) -> expr_t
    """Transform the contents of ( ... ) or [ ... ].

    testlist_comp:
      (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )

    `id0` is the ID of the opening bracket and decides whether the result
    is a generator/parenthesized expression/tuple or a list comp/list.
    """
    assert p_node.typ == grammar_nt.testlist_comp

    num_children = p_node.NumChildren()
    has_comp_for = (num_children > 1 and
                    p_node.GetChild(1).typ == grammar_nt.comp_for)

    if has_comp_for:
        element = self.Expr(p_node.GetChild(0))
        comprehension = self._CompFor(p_node.GetChild(1))
        if id0 == Id.Op_LParen:  # (x+1 for x in y)
            return expr.GeneratorExp(element, [comprehension])
        if id0 == Id.Op_LBracket:  # [x+1 for x in y]
            return expr.ListComp(parent.tok, element, [comprehension])
        raise AssertionError()

    if id0 == Id.Op_LParen:
        if num_children == 1:
            # Parenthesized expression like (x+1) or (x)
            return self.Expr(p_node.GetChild(0))

        # Tuples (1,) (1, 2) etc. - TODO: should be a list literal?
        if p_node.GetChild(1).typ == Id.Arith_Comma:
            return self._Tuple(p_node)

        raise AssertionError()

    if id0 == Id.Op_LBracket:  # List [1,2,3]
        items = []  # type: List[expr_t]
        idx = 0
        while idx < num_children:
            items.append(self.Expr(p_node.GetChild(idx)))
            idx += 2  # skip commas

        # unused expr_context_e
        return expr.List(parent.tok, items, expr_context_e.Store)

    raise AssertionError(Id_str(id0))
345
def _Atom(self, parent):
    # type: (PNode) -> expr_t
    """Handle alternatives of 'atom' where there's more than one child.

    Dispatches on the ID of the first child's token: parenthesized forms,
    list forms, ^[expr] literals, dict literals, &place, the 'func' stub,
    and numbers followed by a (not yet supported) units suffix.

    Raises AssertionError for any other leading token; user-facing errors
    are reported through p_die.
    """

    tok = parent.GetChild(0).tok
    id_ = tok.id
    n = parent.NumChildren()

    if id_ == Id.Op_LParen:
        # atom: '(' [yield_expr|testlist_comp] ')' | ...
        if n == 2:  # () is a tuple
            assert (
                parent.GetChild(1).typ == Id.Op_RParen), parent.GetChild(1)
            return expr.Tuple(tok, [], expr_context_e.Store)

        return self._TestlistComp(parent, parent.GetChild(1), id_)

    if id_ == Id.Op_LBracket:
        # atom: ... | '[' [testlist_comp] ']' | ...

        if n == 2:  # []
            assert (parent.GetChild(1).typ == Id.Op_RBracket
                    ), parent.GetChild(1)
            return expr.List(tok, [],
                             expr_context_e.Store)  # unused expr_context_e

        return self._TestlistComp(parent, parent.GetChild(1), id_)

    if id_ == Id.Left_CaretBracket:  # ^[42 + x]
        child = self.Expr(parent.GetChild(1))
        return expr.Literal(child)

    if id_ == Id.Op_LBrace:
        # atom: ... | '{' [Op_Newline] [dict] '}'
        i = 1
        if parent.GetChild(i).typ == Id.Op_Newline:
            i += 1
        return self._Dict(parent, parent.GetChild(i))

    if id_ == Id.Arith_Amp:
        # n was already computed above; no need to ask again.
        if n >= 3:
            p_die("Places in containers not implemented yet",
                  parent.GetChild(2).tok)

        name_tok = parent.GetChild(1).tok
        return expr.Place(name_tok, lexer.TokenVal(name_tok), [])

    if id_ == Id.Expr_Func:
        # STUB.  This should really be a Func, not Lambda.
        return expr.Lambda([], expr.Implicit)

    # 100 M
    # A units suffix is rejected for now, not ignored.
    if id_ == Id.Expr_DecInt:
        assert n > 1
        p_die("Units suffix not implemented", parent.GetChild(1).tok)
        #return self.Expr(parent.GetChild(0))

    # 100.5 M
    # Same rejection as the integer case, with the same message (the old
    # text "unix suffix implemented" was a typo that inverted the meaning).
    if id_ == Id.Expr_Float:
        assert n > 1
        p_die("Units suffix not implemented", parent.GetChild(1).tok)
        #return self.Expr(parent.GetChild(0))

    raise AssertionError(Id_str(id_))
413
def _NameType(self, p_node):
    # type: (PNode) -> NameType
    """Transform a name with an optional type annotation.

    name_type: Expr_Name [':'] [type_expr]
    """
    name_tok = p_node.GetChild(0).tok
    typ = None  # type: Optional[TypeExpr]

    # With 2 or 3 children, the last child is handed to _TypeExpr
    # (index 1 without the ':', index 2 with it).
    num_children = p_node.NumChildren()
    if num_children in (2, 3):
        typ = self._TypeExpr(p_node.GetChild(num_children - 1))

    return NameType(name_tok, lexer.TokenVal(name_tok), typ)
427
def _NameTypeList(self, p_node):
    # type: (PNode) -> List[NameType]
    """Transform every name_type in a comma-separated list.

    name_type_list: name_type (',' name_type)*
    """
    assert p_node.typ == grammar_nt.name_type_list
    name_types = []  # type: List[NameType]

    num_children = p_node.NumChildren()
    idx = 0
    while idx < num_children:  # was children[::2]
        name_types.append(self._NameType(p_node.GetChild(idx)))
        idx += 2  # step over the comma
    return name_types
438
def _CompFor(self, p_node):
    # type: (PNode) -> Comprehension
    """Transform the 'for ... in ... [if ...]' part of a comprehension.

    comp_for: 'for' exprlist 'in' or_test ['if' or_test]
    """
    targets = self._NameTypeList(p_node.GetChild(1))
    iterable = self.Expr(p_node.GetChild(3))

    # The filter is only present when the 'if' clause was written.
    cond = None  # type: Optional[expr_t]
    if p_node.NumChildren() >= 6:
        cond = self.Expr(p_node.GetChild(5))

    return Comprehension(targets, iterable, cond)
451
def _CompareChain(self, parent):
    # type: (PNode) -> expr_t
    """Transform a chained comparison into a single expr.Compare.

    comparison: expr (comp_op expr)*

    Two-token operators ('not in', 'is not') are represented by their
    first token, whose id is rewritten in place.
    """
    operators = []  # type: List[Token]
    operands = []  # type: List[expr_t]
    first = self.Expr(parent.GetChild(0))

    num_children = parent.NumChildren()
    # comp_op nodes sit at odd indices, each followed by its right operand.
    for op_index in xrange(1, num_children, 2):
        p_op = parent.GetChild(op_index)
        op_tok = p_op.GetChild(0).tok
        if p_op.NumChildren() == 2:
            # Blame the first token, and change its type
            if op_tok.id == Id.Expr_Not:  # not in
                op_tok.id = Id.Node_NotIn
            elif op_tok.id == Id.Expr_Is:  # is not
                op_tok.id = Id.Node_IsNot
            else:
                raise AssertionError()
        # Otherwise: is, <, ==, etc. -- the token is used unchanged.

        operators.append(op_tok)
        operands.append(self.Expr(parent.GetChild(op_index + 1)))

    return expr.Compare(first, operators, operands)
481
def _Subscript(self, parent):
    # type: (PNode) -> expr_t
    """Transform a subscript into a plain index expression or an expr.Slice.

    subscript: expr | [expr] ':' [expr]

    Returns the transformed expression for a[1]; for every form containing
    ':' returns expr.Slice(lower, colon_token, upper), where a missing bound
    is None.
    """
    typ0 = parent.GetChild(0).typ

    n = parent.NumChildren()

    if typ0 == grammar_nt.expr:
        if n == 3:  # a[1:2]
            lower = self.Expr(parent.GetChild(0))
            # The ':' follows the lower bound, so it is child 1 here.
            # (Child 0 is the lower-bound expression, not the operator.)
            op_tok = parent.GetChild(1).tok
            upper = self.Expr(parent.GetChild(2))
        elif n == 2:  # a[1:]
            lower = self.Expr(parent.GetChild(0))
            op_tok = parent.GetChild(1).tok
            upper = None
        else:  # a[1]
            return self.Expr(parent.GetChild(0))
    else:
        assert typ0 == Id.Arith_Colon
        lower = None
        # No lower bound: the ':' is the first child.
        op_tok = parent.GetChild(0).tok
        if n == 1:  # a[:]
            upper = None
        else:  # a[:3]
            upper = self.Expr(parent.GetChild(1))

    return expr.Slice(lower, op_tok, upper)
507
def Expr(self, pnode):
    # type: (PNode) -> expr_t
    """Transform expressions (as opposed to statements)

    Dispatches on pnode.typ.  Nonterminals are handled by the branches in
    the first part of the method (most recurse into their children);
    anything that falls through is treated as a terminal and becomes an
    expr.Var (names) or an expr.Const (literals).

    Args:
        pnode: parse tree node for an expression production or a terminal.

    Returns:
        The AST node for the expression.

    Raises:
        AssertionError: for a range_expr with an unexpected child count, or
            a terminal type with no constant conversion.
        User-facing errors (e.g. $foo in an expression, integer constant
        too large) are reported through p_die.
    """
    typ = pnode.typ

    #
    # YSH Entry Points / Additions
    #

    if typ == grammar_nt.ysh_expr:  # for if/while
        # ysh_expr: '(' testlist ')'
        return self.Expr(pnode.GetChild(1))

    if typ == grammar_nt.command_expr:
        # return_expr: testlist end_stmt
        # NOTE(review): the comment names return_expr but this branch tests
        # command_expr -- confirm against grammar.pgen2.
        return self.Expr(pnode.GetChild(0))

    #
    # Python-like Expressions / Operators
    #

    if typ == grammar_nt.atom:
        # Single-child atoms are just their child; the multi-child
        # alternatives are handled by _Atom.
        if pnode.NumChildren() == 1:
            return self.Expr(pnode.GetChild(0))
        return self._Atom(pnode)

    if typ == grammar_nt.testlist:
        # testlist: test (',' test)* [',']
        return self._Tuple(pnode)

    if typ == grammar_nt.test:
        # test: or_test ['if' or_test 'else' test] | lambdef
        if pnode.NumChildren() == 1:
            return self.Expr(pnode.GetChild(0))

        # TODO: Handle lambdef

        # body 'if' test 'else' orelse: the condition is child 2.
        test = self.Expr(pnode.GetChild(2))
        body = self.Expr(pnode.GetChild(0))
        orelse = self.Expr(pnode.GetChild(4))
        return expr.IfExp(test, body, orelse)

    if typ == grammar_nt.lambdef:
        # lambdef: '|' [name_type_list] '|' test

        n = pnode.NumChildren()
        if n == 4:
            params = self._NameTypeList(pnode.GetChild(1))
        else:
            params = []

        # The body is always the last child.
        body = self.Expr(pnode.GetChild(n - 1))
        return expr.Lambda(params, body)

    #
    # Operators with Precedence
    #

    if typ == grammar_nt.or_test:
        # or_test: and_test ('or' and_test)*
        return self._LeftAssoc(pnode)

    if typ == grammar_nt.and_test:
        # and_test: not_test ('and' not_test)*
        return self._LeftAssoc(pnode)

    if typ == grammar_nt.not_test:
        # not_test: 'not' not_test | comparison
        if pnode.NumChildren() == 1:
            return self.Expr(pnode.GetChild(0))

        op_tok = pnode.GetChild(0).tok  # not
        return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))

    elif typ == grammar_nt.comparison:
        if pnode.NumChildren() == 1:
            return self.Expr(pnode.GetChild(0))

        return self._CompareChain(pnode)

    elif typ == grammar_nt.range_expr:
        n = pnode.NumChildren()
        if n == 1:
            return self.Expr(pnode.GetChild(0))

        if n == 3:
            # lower, operator token, upper
            return expr.Range(self.Expr(pnode.GetChild(0)),
                              pnode.GetChild(1).tok,
                              self.Expr(pnode.GetChild(2)))

        raise AssertionError(n)

    elif typ == grammar_nt.expr:
        # expr: xor_expr ('|' xor_expr)*
        return self._LeftAssoc(pnode)

    if typ == grammar_nt.xor_expr:
        # xor_expr: and_expr ('xor' and_expr)*
        return self._LeftAssoc(pnode)

    if typ == grammar_nt.and_expr:  # a & b
        # and_expr: shift_expr ('&' shift_expr)*
        return self._LeftAssoc(pnode)

    elif typ == grammar_nt.shift_expr:
        # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
        return self._LeftAssoc(pnode)

    elif typ == grammar_nt.arith_expr:
        # arith_expr: term (('+'|'-') term)*
        return self._LeftAssoc(pnode)

    elif typ == grammar_nt.term:
        # term: factor (('*'|'/'|'div'|'mod') factor)*
        return self._LeftAssoc(pnode)

    elif typ == grammar_nt.factor:
        # factor: ('+'|'-'|'~') factor | power
        # the power would have already been reduced
        if pnode.NumChildren() == 1:
            return self.Expr(pnode.GetChild(0))

        assert pnode.NumChildren() == 2
        op = pnode.GetChild(0)
        e = pnode.GetChild(1)

        assert isinstance(op.tok, Token)
        return expr.Unary(op.tok, self.Expr(e))

    elif typ == grammar_nt.power:
        # power: atom trailer* ['**' factor]

        node = self.Expr(pnode.GetChild(0))
        if pnode.NumChildren() == 1:  # No trailers
            return node

        # Support a->startswith(b) and mydict.key
        # Each trailer wraps the node built so far.
        n = pnode.NumChildren()
        i = 1
        while i < n and pnode.GetChild(i).typ == grammar_nt.trailer:
            node = self._Trailer(node, pnode.GetChild(i))
            i += 1

        if i != n:  # ['**' factor]
            op_tok = pnode.GetChild(i).tok
            assert op_tok.id == Id.Arith_DStar, op_tok
            factor = self.Expr(pnode.GetChild(i + 1))
            node = expr.Binary(op_tok, node, factor)

        return node

    elif typ == grammar_nt.eggex:
        return self._Eggex(pnode)

    elif typ == grammar_nt.ysh_expr_sub:
        return self.Expr(pnode.GetChild(0))

    #
    # YSH Lexer Modes
    #

    # In the branches below, the `tok` slot of child 1 is cast to an
    # already-built AST node rather than treated as a Token.
    # NOTE(review): presumably the parser stores the finished node there --
    # confirm in the expression parser.

    elif typ == grammar_nt.sh_array_literal:
        return cast(ShArrayLiteral, pnode.GetChild(1).tok)

    elif typ == grammar_nt.old_sh_array_literal:
        return cast(ShArrayLiteral, pnode.GetChild(1).tok)

    elif typ == grammar_nt.sh_command_sub:
        return cast(CommandSub, pnode.GetChild(1).tok)

    elif typ == grammar_nt.braced_var_sub:
        return cast(BracedVarSub, pnode.GetChild(1).tok)

    elif typ == grammar_nt.dq_string:
        dq = cast(DoubleQuoted, pnode.GetChild(1).tok)
        # sugar: ^"..." is short for ^["..."]
        if pnode.GetChild(0).typ == Id.Left_CaretDoubleQuote:
            return expr.Literal(dq)
        return dq

    elif typ == grammar_nt.sq_string:
        return cast(SingleQuoted, pnode.GetChild(1).tok)

    elif typ == grammar_nt.simple_var_sub:
        tok = pnode.GetChild(0).tok

        if tok.id == Id.VSub_DollarName:  # $foo is disallowed
            # Drop the leading '$' so the message can suggest the bare name.
            bare = lexer.TokenSliceLeft(tok, 1)
            p_die(
                'In expressions, remove $ and use `%s`, or sometimes "$%s"'
                % (bare, bare), tok)

        # $? is allowed
        return SimpleVarSub(tok)

    #
    # Terminals
    #

    tok = pnode.tok
    if typ == Id.Expr_Name:
        return expr.Var(tok, lexer.TokenVal(tok))

    # Everything else is an expr.Const
    tok_str = lexer.TokenVal(tok)
    # Remove underscores from 1_000_000.  The lexer is responsible for
    # validation.
    c_under = tok_str.replace('_', '')

    if typ == Id.Expr_DecInt:
        try:
            cval = value.Int(mops.FromStr(c_under))  # type: value_t
        except ValueError:
            p_die('Decimal int constant is too large', tok)

    elif typ == Id.Expr_BinInt:
        # Strip the 2-character base prefix before converting.
        assert c_under[:2] in ('0b', '0B'), c_under
        try:
            cval = value.Int(mops.FromStr(c_under[2:], 2))
        except ValueError:
            p_die('Binary int constant is too large', tok)

    elif typ == Id.Expr_OctInt:
        assert c_under[:2] in ('0o', '0O'), c_under
        try:
            cval = value.Int(mops.FromStr(c_under[2:], 8))
        except ValueError:
            p_die('Octal int constant is too large', tok)

    elif typ == Id.Expr_HexInt:
        assert c_under[:2] in ('0x', '0X'), c_under
        try:
            cval = value.Int(mops.FromStr(c_under[2:], 16))
        except ValueError:
            p_die('Hex int constant is too large', tok)

    elif typ == Id.Expr_Float:
        # Note: float() in mycpp/gc_builtins.cc currently uses strtod
        # I think this never raises ValueError, because the lexer
        # should only accept strings that strtod() does?
        cval = value.Float(float(c_under))

    elif typ == Id.Expr_Null:
        cval = value.Null

    elif typ == Id.Expr_True:
        cval = value.Bool(True)

    elif typ == Id.Expr_False:
        cval = value.Bool(False)

    elif typ == Id.Char_OneChar:  # \n
        assert len(tok_str) == 2, tok_str
        # Look up the character after the backslash.
        s = consts.LookupCharC(lexer.TokenSliceLeft(tok, 1))
        cval = value.Str(s)

    elif typ == Id.Char_YHex:  # \yff
        assert len(tok_str) == 4, tok_str
        # Skip the 2-character '\y' prefix; the rest is two hex digits.
        hex_str = lexer.TokenSliceLeft(tok, 2)
        s = chr(int(hex_str, 16))
        cval = value.Str(s)

    elif typ == Id.Char_UBraced:  # \u{123}
        # Slice off the leading '\u{' and the trailing '}'.
        hex_str = lexer.TokenSlice(tok, 3, -1)
        code_point = int(hex_str, 16)
        s = j8.Utf8Encode(code_point)
        cval = value.Str(s)

    else:
        raise AssertionError(typ)

    return expr.Const(tok, cval)
780
def _CheckLhs(self, lhs):
    # type: (expr_t) -> None
    """Check that a subscript/attribute chain is rooted at a variable.

    Walks down the `obj` field of Subscript and Attribute nodes.  Reaching
    an expr.Var is accepted; reaching any other kind of expression is
    reported with p_die.
    """

    # Keep the un-narrowed reference: each case casts from it.
    UP_lhs = lhs
    with tagswitch(lhs) as case:
        if case(expr_e.Var):
            # OK - e.g. setvar a.b.c[i] = 42
            pass

        elif case(expr_e.Subscript):
            lhs = cast(Subscript, UP_lhs)
            self._CheckLhs(lhs.obj)  # recurse on LHS

        elif case(expr_e.Attribute):
            lhs = cast(Attribute, UP_lhs)
            self._CheckLhs(lhs.obj)  # recurse on LHS

        else:
            # Illegal - e.g. setglobal {}["key"] = 42
            p_die("Subscript/Attribute not allowed on this LHS expression",
                  location.TokenForExpr(lhs))
802
def _LhsExprList(self, p_node):
    # type: (PNode) -> List[y_lhs_t]
    """Transform the targets of a mutation into assignable locations.

    lhs_list: expr (',' expr)*

    Each target must transform to a Var, Subscript, or '.' Attribute;
    anything else is rejected with p_die.
    """
    assert p_node.typ == grammar_nt.lhs_list

    lhs_list = []  # type: List[y_lhs_t]
    n = p_node.NumChildren()
    for i in xrange(0, n, 2):  # step over the commas
        p = p_node.GetChild(i)
        #self.p_printer.Print(p)

        e = self.Expr(p)
        # Keep the un-narrowed reference: each case casts from it.
        UP_e = e
        with tagswitch(e) as case:
            if case(expr_e.Var):
                e = cast(expr.Var, UP_e)
                # For a plain variable, the `left` field is what gets
                # stored, not the Var node itself.
                lhs_list.append(e.left)

            elif case(expr_e.Subscript):
                e = cast(Subscript, UP_e)
                self._CheckLhs(e)
                lhs_list.append(e)

            elif case(expr_e.Attribute):
                e = cast(Attribute, UP_e)
                self._CheckLhs(e)
                if e.op.id != Id.Expr_Dot:
                    # e.g. setvar obj->method is not valid
                    p_die("Can't assign to this attribute expr", e.op)
                lhs_list.append(e)

            else:
                pass  # work around mycpp bug

                # TODO: could blame arbitrary expr_t, but this works most
                # of the time
                if p.tok:
                    blame = p.tok  # type: loc_t
                else:
                    blame = loc.Missing
                p_die("Can't assign to this expression", blame)

    return lhs_list
846
def MakeVarDecl(self, p_node):
    # type: (PNode) -> command.VarDecl
    """Transform a variable declaration.

    ysh_var_decl: name_type_list ['=' testlist] end_stmt

    The keyword field of the result is left as None for the caller to fill
    in; the right-hand side is None when no '= value' part is present.
    """
    assert p_node.typ == grammar_nt.ysh_var_decl

    # could be a tuple
    names = self._NameTypeList(p_node.GetChild(0))

    # This syntax is confusing, and different than JavaScript
    #   var x, y = 1, 2
    # But this is useful:
    #   var flag, i = parseArgs(spec, argv)

    initializer = None  # type: Optional[expr_t]
    if p_node.NumChildren() >= 3:
        initializer = self.Expr(p_node.GetChild(2))

    # The caller should fill in the keyword token.
    return command.VarDecl(None, names, initializer)
869
def MakeMutation(self, p_node):
    # type: (PNode) -> command.Mutation
    """Transform an assignment or augmented assignment to existing names.

    ysh_mutation: lhs_list (augassign | '=') testlist end_stmt

    The first field of the result is left as None.  Assigning to several
    targets at once is only accepted with plain '='.
    """
    assert p_node.typ == grammar_nt.ysh_mutation

    # could be a tuple
    targets = self._LhsExprList(p_node.GetChild(0))
    assign_tok = p_node.GetChild(1).tok

    # Checked before the right-hand side is transformed.
    is_plain_assign = assign_tok.id == Id.Arith_Equal
    if len(targets) > 1 and not is_plain_assign:
        p_die('Multiple assignment must use =', assign_tok)

    new_value = self.Expr(p_node.GetChild(2))
    return command.Mutation(None, targets, assign_tok, new_value)
884
def _EggexFlag(self, p_node):
    # type: (PNode) -> EggexFlag
    """Transform one re_flag node, which has either 1 or 2 children.

    The flag's token is always the last child; the boolean passed to
    EggexFlag is True exactly when a leading extra child is present.
    """
    num_children = p_node.NumChildren()
    if num_children != 1 and num_children != 2:
        raise AssertionError()

    has_prefix = (num_children == 2)
    flag_tok = p_node.GetChild(num_children - 1).tok
    return EggexFlag(has_prefix, flag_tok)
894
def _Eggex(self, p_node):
    # type: (PNode) -> Eggex
    """Transform an Eggex literal, including flags and translation pref.

    eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
    """
    left_tok = p_node.GetChild(0).tok
    pattern = self._Regex(p_node.GetChild(1))

    flags = []  # type: List[EggexFlag]
    trans_pref = None  # type: Optional[Token]

    pos = 2
    node = p_node.GetChild(pos)
    if node.typ == Id.Op_Semi:
        # Collect the run of flags that follows the first ';'.
        pos += 1
        node = p_node.GetChild(pos)
        while node.typ == grammar_nt.re_flag:
            flags.append(self._EggexFlag(node))
            pos += 1
            node = p_node.GetChild(pos)

        # A second ';' introduces the translation preference name.
        if node.typ == Id.Op_Semi:
            pos += 1
            trans_pref = p_node.GetChild(pos).tok

    # Canonicalize and validate flags for ERE only.  Default is ERE.
    canonical_flags = None  # type: Optional[str]
    if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
        canonical_flags = regex_translate.CanonicalFlags(flags)

    return Eggex(left_tok, pattern, flags, trans_pref, canonical_flags)
928
def YshCasePattern(self, pnode):
    # type: (PNode) -> pat_t
    """Transform the pattern of one YSH case arm.

    Returns pat.Else, a pat.YshExprs holding every transformed expression,
    or an Eggex.  Raises AssertionError for any other pattern node.
    """
    assert pnode.typ == grammar_nt.ysh_case_pat, pnode

    p_pattern = pnode.GetChild(0)
    if p_pattern.typ == Id.Op_LParen:
        # pat_expr or pat_else: the real pattern follows the paren
        p_pattern = pnode.GetChild(1)
    pattern_typ = p_pattern.typ

    if pattern_typ == grammar_nt.pat_else:
        return pat.Else

    if pattern_typ == grammar_nt.pat_exprs:
        alternatives = []  # type: List[expr_t]
        num_children = p_pattern.NumChildren()
        idx = 0
        while idx < num_children:
            p_child = p_pattern.GetChild(idx)
            # Only expr children are kept; other children are skipped.
            if p_child.typ == grammar_nt.expr:
                alternatives.append(self.Expr(p_child))
            idx += 1
        return pat.YshExprs(alternatives)

    if pattern_typ == grammar_nt.eggex:
        return self._Eggex(p_pattern)

    raise AssertionError()
956
def _BlockArg(self, p_node):
    # type: (PNode) -> expr_t
    """Transform the block expression argument of a proc call.

    Only a node with exactly one child is accepted; every other shape is
    rejected with p_die.
    """
    if p_node.NumChildren() != 1:
        # It can only be an expression, not a=42, or ...expr
        p_die('Invalid block expression argument', p_node.tok)

    return self.Expr(p_node.GetChild(0))
967
def _Argument(self, p_node, after_semi, arglist):
    # type: (PNode, bool, ArgList) -> None
    """Transform one argument and append it to the right list of `arglist`.

    argument: (
      test [comp_for]
    | test '=' test  # named arg
    | '...' test  # var args
    )

    `after_semi` says whether the argument appears in the group after the
    semicolon, where plain positional arguments are rejected.
    """
    positional = arglist.pos_args
    named = arglist.named_args

    assert p_node.typ == grammar_nt.argument, p_node
    num_children = p_node.NumChildren()

    if num_children == 1:
        p_value = p_node.GetChild(0)
        if after_semi:
            p_die(POS_ARG_MISPLACED, p_value.tok)
        positional.append(self.Expr(p_value))
        return

    if num_children == 2:
        # Note: We allow multiple spreads, just like Julia.  They are
        # concatenated as in lists and dicts.
        first_tok = p_node.GetChild(0).tok
        if first_tok.id == Id.Expr_Ellipsis:
            spread = expr.Spread(first_tok, self.Expr(p_node.GetChild(1)))
            if after_semi:  # f(; ... named)
                named.append(NamedArg(None, spread))
            else:  # f(...named)
                positional.append(spread)
            return

        # Note: generator expression not implemented
        if p_node.GetChild(1).typ == grammar_nt.comp_for:
            p_value = p_node.GetChild(0)
            if after_semi:
                p_die(POS_ARG_MISPLACED, p_value.tok)

            element = self.Expr(p_value)
            comprehension = self._CompFor(p_node.GetChild(1))
            positional.append(expr.GeneratorExp(element, [comprehension]))
            return

        raise AssertionError()

    if num_children == 3:
        # named args can come before or after the semicolon
        name_tok = p_node.GetChild(0).tok
        named.append(NamedArg(name_tok, self.Expr(p_node.GetChild(2))))
        return

    raise AssertionError()
1023
def _ArgGroup(self, p_node, after_semi, arglist):
    # type: (PNode, bool, ArgList) -> None
    """Transform every argument in a group; other children are skipped.

    arg_group: argument (',' argument)* [',']
    """
    num_children = p_node.NumChildren()
    idx = 0
    while idx < num_children:
        p_arg = p_node.GetChild(idx)
        if p_arg.typ == grammar_nt.argument:
            self._Argument(p_arg, after_semi, arglist)
        idx += 1
1033
def _ArgList(self, p_node, arglist):
    # type: (PNode, ArgList) -> None
    """Fill in `arglist` from an argument list node.  For both funcs and
    procs.

    arglist: (
           [arg_group]
      [';' [arg_group]]
    )

    arglist3: ...

    Children are consumed left to right; each part is optional, and the
    method returns as soon as the children run out.
    """
    num_children = p_node.NumChildren()
    if num_children == 0:
        return

    pos = 0

    # First group: arguments before any semicolon.
    p_child = p_node.GetChild(pos)
    if p_child.typ == grammar_nt.arg_group:
        self._ArgGroup(p_child, False, arglist)
        pos += 1
    if pos >= num_children:
        return

    p_child = p_node.GetChild(pos)
    if p_child.typ == Id.Op_Semi:
        arglist.semi_tok = p_child.tok
        pos += 1
    if pos >= num_children:
        return

    # Named args after first semi-colon
    p_child = p_node.GetChild(pos)
    if p_child.typ == grammar_nt.arg_group:
        self._ArgGroup(p_child, True, arglist)
        pos += 1
    if pos >= num_children:
        return

    #
    # Special third group may have block expression - only for arglist3,
    # used for procs!
    #
    assert p_node.typ == grammar_nt.arglist3, p_node

    p_child = p_node.GetChild(pos)
    if p_child.typ == Id.Op_Semi:
        arglist.semi_tok2 = p_child.tok
        pos += 1
    if pos >= num_children:
        return

    p_child = p_node.GetChild(pos)
    if p_child.typ == grammar_nt.argument:
        arglist.block_expr = self._BlockArg(p_child)
1093
def ProcCallArgs(self, pnode, arglist):
    # type: (PNode, ArgList) -> None
    """Fill in `arglist` from the bracketed arguments of a proc call.

    ysh_eager_arglist: '(' [arglist3] ')'
    ysh_lazy_arglist: '[' [arglist] ']'

    Raises AssertionError unless the node has 2 or 3 children.
    """
    num_children = pnode.NumChildren()
    if num_children == 2:  # f() -- nothing between the brackets
        return

    if num_children != 3:
        raise AssertionError()

    # the X in '( X )'
    self._ArgList(pnode.GetChild(1), arglist)
1111
def _TypeExpr(self, pnode):
    # type: (PNode) -> TypeExpr
    """Transform a type expression, recursing into its type parameters.

    type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]

    `params` is only set when the bracketed part is present.
    """
    assert pnode.typ == grammar_nt.type_expr, pnode.typ

    result = TypeExpr.CreateNull()  # don't allocate children

    result.tok = pnode.GetChild(0).tok
    result.name = lexer.TokenVal(result.tok)

    num_children = pnode.NumChildren()
    if num_children == 1:
        return result

    # Parameters start at index 2 (after the name and '[') and are
    # separated by commas.
    result.params = []
    for idx in xrange(2, num_children, 2):
        result.params.append(self._TypeExpr(pnode.GetChild(idx)))

    return result
1136
def _Param(self, pnode):
    # type: (PNode) -> Param
    """Transform one parameter of a signature.

    param: Expr_Name [type_expr] ['=' expr]

    The number of children tells which optional parts are present:
      1: proc p(a)
      2: proc p(a Int)
      3: proc p(a = 3)
      4: proc p(a Int = 3)
    """
    assert pnode.typ == grammar_nt.param

    name_tok = pnode.GetChild(0).tok
    num_children = pnode.NumChildren()

    assert name_tok.id == Id.Expr_Name, name_tok

    default_val = None  # type: expr_t
    type_ = None  # type: TypeExpr

    # A type is child 1 when there are 2 or 4 children.
    if num_children == 2 or num_children == 4:
        type_ = self._TypeExpr(pnode.GetChild(1))

    # A default value is the last child when there are 3 or 4 children.
    if num_children == 3 or num_children == 4:
        default_val = self.Expr(pnode.GetChild(num_children - 1))

    return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1170
def _ParamGroup(self, p_node):
    # type: (PNode) -> ParamGroup
    """Transform a group of parameters, with an optional rest parameter.

    param_group:
      (param ',')*
      [ (param | '...' Expr_Name) [,] ]
    """
    assert p_node.typ == grammar_nt.param_group, p_node

    params = []  # type: List[Param]
    rest_of = None  # type: Optional[RestParam]

    num_children = p_node.NumChildren()
    # Entries are visited two children at a time.
    for idx in xrange(0, num_children, 2):
        p_entry = p_node.GetChild(idx)
        if p_entry.typ == grammar_nt.param:
            params.append(self._Param(p_entry))

        elif p_entry.typ == Id.Expr_Ellipsis:
            # '...' is followed by the name of the rest parameter.
            rest_tok = p_node.GetChild(idx + 1).tok
            rest_of = RestParam(rest_tok, lexer.TokenVal(rest_tok))

    return ParamGroup(params, rest_of)
1197
def Proc(self, p_node):
    # type: (PNode) -> proc_sig_t
    """Build a proc signature from a ysh_proc parse node.

    Grammar:
        ysh_proc: (
          [ '('
                  [ param_group ]    # word params, with defaults
            [ ';' [ param_group ] ]  # positional typed params, with defaults
            [ ';' [ param_group ] ]  # named params, with defaults
            [ ';' Expr_Name ]        # optional block param
            ')'
          ]
          '{'                        # opening { for pgen2
        )

    NOTE(review): the grammar comment above shows a bare Expr_Name for the
    block param, but this code looks for a param_group in that slot and
    allows an optional type, which must be Command.  Confirm against the
    grammar file.

    Returns:
        proc_sig.Open for `proc f {` (a single child), otherwise a
        proc_sig.Closed with whichever of word / positional / named /
        block_param were present filled in.

    Parse errors are reported through p_die() when:
      - a word param has a type other than Str or Ref, or type parameters
      - the block group has more than one param, or a rest param
      - the block param has a type other than Command, or type parameters
    """
    assert p_node.typ == grammar_nt.ysh_proc

    n = p_node.NumChildren()
    if n == 1:  # proc f {
        return proc_sig.Open

    # Allocate exactly once; the groups below fill it in.
    sig = proc_sig.Closed.CreateNull(alloc_lists=True)  # no params

    if n == 3:  # proc f () {
        # Children are '(' ')' '{', so there is nothing to fill in.
        return sig

    # proc f( three param groups, and block group )
    #
    # For each slot: if a param_group is present, step over it AND the
    # delimiter after it; otherwise step over the delimiter only.

    # Word params
    i = 1
    child = p_node.GetChild(i)
    if child.typ == grammar_nt.param_group:
        sig.word = self._ParamGroup(child)

        # Validate word params
        for word in sig.word.params:
            if word.type:
                if word.type.name not in ('Str', 'Ref'):
                    p_die('Word params may only have type Str or Ref',
                          word.type.tok)
                if word.type.params is not None:
                    p_die('Unexpected type parameters', word.type.tok)

        i += 2
    else:
        i += 1

    if i >= n:
        return sig

    # Positional params
    child = p_node.GetChild(i)
    if child.typ == grammar_nt.param_group:
        sig.positional = self._ParamGroup(child)
        i += 2
    else:
        i += 1

    if i >= n:
        return sig

    # Named params
    child = p_node.GetChild(i)
    if child.typ == grammar_nt.param_group:
        sig.named = self._ParamGroup(child)
        i += 2
    else:
        i += 1

    if i >= n:
        return sig

    # Block param
    child = p_node.GetChild(i)
    if child.typ == grammar_nt.param_group:
        group = self._ParamGroup(child)
        params = group.params
        if len(params) > 1:
            p_die('Only 1 block param is allowed', params[1].blame_tok)
        if group.rest_of:
            p_die("Rest param isn't allowed for blocks",
                  group.rest_of.blame_tok)

        if len(params) == 1:
            if params[0].type:
                if params[0].type.name != 'Command':
                    p_die('Block param must have type Command',
                          params[0].type.tok)
                if params[0].type.params is not None:
                    p_die('Unexpected type parameters', params[0].type.tok)

            sig.block_param = params[0]

    return sig
1293
def YshFunc(self, p_node, out):
    # type: (PNode, Func) -> None
    """Fill in `out` from a ysh_func parse node.

    Grammar:
        ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'

    Sets out.name to the first child's token.  out.positional and out.named
    are assigned only when the corresponding param_group is present.
    """
    assert p_node.typ == grammar_nt.ysh_func

    out.name = p_node.GetChild(0).tok

    num_children = p_node.NumChildren()

    pos = 2  # child 0 is the name, child 1 is '('
    first = p_node.GetChild(pos)
    if first.typ == grammar_nt.param_group:
        out.positional = self._ParamGroup(first)
        pos += 2  # step over the group and the token after it
    else:
        pos += 1  # no positional group; step over this token only

    if pos < num_children:
        second = p_node.GetChild(pos)
        if second.typ == grammar_nt.param_group:
            out.named = self._ParamGroup(second)
1321
1322 #
1323 # Eggex Language
1324 #
1325
def _RangeCharSingleQuoted(self, p_node):
    # type: (PNode) -> Optional[CharCode]
    """Convert a single-quoted range endpoint, like 'a' in 'a'-'b'.

    Returns:
        A CharCode when the range_char is an sq_string holding exactly one
        character; None when the range_char is not an sq_string at all.

    An empty or multi-character quoted string is reported through p_die().
    """
    assert p_node.typ == grammar_nt.range_char, p_node

    child0 = p_node.GetChild(0)
    if child0.typ != grammar_nt.sq_string:
        return None  # caller falls back to another kind of endpoint

    # The sq_string's second child carries the SingleQuoted part
    sq_part = cast(SingleQuoted, child0.GetChild(1).tok)
    num_chars = len(sq_part.sval)

    if num_chars == 1:
        return CharCode(sq_part.left, ord(sq_part.sval[0]), False)

    if num_chars == 0:
        p_die("Quoted range char can't be empty", loc.WordPart(sq_part))

    p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
1345
def _OtherRangeToken(self, p_node):
    # type: (PNode) -> Token
    """Return the token for an endpoint of a range (single char).

    Grammar:
        range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
                    a-z         0-9           'a'-'z'     \\x00-\\xff

    For a char_literal, the first token of that node is returned as-is.
    Otherwise the range_char's own token must be an Expr_Name or Expr_DecInt
    of length 1; a longer token is reported through p_die().
    """
    assert p_node.typ == grammar_nt.range_char, p_node

    child0 = p_node.GetChild(0)
    if child0.typ != grammar_nt.char_literal:
        tok = p_node.tok
        # a in a-z is Expr_Name
        # 0 in 0-9 is Expr_DecInt
        assert tok.id in (Id.Expr_Name, Id.Expr_DecInt), tok

        if tok.length != 1:
            p_die(RANGE_POINT_TOO_LONG, tok)
        return tok

    # e.g. the first endpoint in /[\x00 - \x20]/
    return child0.GetChild(0).tok
1369
def _NonRangeChars(self, p_node):
    # type: (PNode) -> class_literal_term_t
    """Convert a range_char that is NOT part of a range.

    Handles three kinds of first child:
        sq_string     the SingleQuoted part is returned directly
        char_literal  evaluated by word_compile.EvalCharLiteralForRegex()
        Expr_Name     resolved by _NameInClass(), without negation

    Anything else raises AssertionError.
    """
    assert p_node.typ == grammar_nt.range_char, p_node

    first = p_node.GetChild(0)
    first_typ = first.typ

    if first_typ == grammar_nt.sq_string:
        return cast(SingleQuoted, first.GetChild(1).tok)

    elif first_typ == grammar_nt.char_literal:
        return word_compile.EvalCharLiteralForRegex(first.tok)

    elif first_typ == Id.Expr_Name:
        # Look up PerlClass and PosixClass
        return self._NameInClass(None, first.tok)

    else:
        raise AssertionError()
1391
def _ClassLiteralTerm(self, p_node):
    # type: (PNode) -> class_literal_term_t
    """Convert one term inside a [...] class literal.

    Grammar:
        class_literal_term:
          range_char ['-' range_char ]
        | '@' Expr_Name  # splice
        | '!' Expr_Name  # negate char class
          ...

    A term starting with anything else is reported through p_die().
    """
    assert p_node.typ == grammar_nt.class_literal_term, p_node

    first = p_node.GetChild(0)
    typ0 = first.typ

    if typ0 == Id.Expr_At:
        name_tok = p_node.GetChild(1).tok
        return class_literal_term.Splice(name_tok, lexer.TokenVal(name_tok))

    if typ0 == Id.Expr_Bang:
        return self._NameInClass(first.tok, p_node.GetChild(1).tok)

    if typ0 != grammar_nt.range_char:
        p_die("This kind of class literal term isn't implemented", first.tok)

    # From here on the term starts with a range_char
    n = p_node.NumChildren()
    if n == 1:
        return self._NonRangeChars(first)

    if n != 3:
        raise AssertionError()

    # 'a'-'z' etc.
    assert p_node.GetChild(1).typ == Id.Arith_Minus, p_node

    left = first
    right = p_node.GetChild(2)

    # Each endpoint: try the single-quoted form first, then fall back to a
    # bare name / integer / char literal token.
    code1 = self._RangeCharSingleQuoted(left)
    if code1 is None:
        tok1 = self._OtherRangeToken(left)
        code1 = word_compile.EvalCharLiteralForRegex(tok1)

    code2 = self._RangeCharSingleQuoted(right)
    if code2 is None:
        tok2 = self._OtherRangeToken(right)
        code2 = word_compile.EvalCharLiteralForRegex(tok2)

    return CharRange(code1, code2)
1442
def _ClassLiteral(self, p_node):
    # type: (PNode) -> List[class_literal_term_t]
    """Convert a class literal into a list of terms.

    Grammar:
        class_literal: '[' class_literal_term+ ']'
    """
    assert p_node.typ == grammar_nt.class_literal

    terms = []  # type: List[class_literal_term_t]

    # The first and last children are [ and ], so skip them
    last = p_node.NumChildren() - 1
    i = 1
    while i < last:
        terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
        i += 1

    return terms
1453
def _NameInRegex(self, negated_tok, tok):
    # type: (Token, Token) -> re_t
    """Resolve a bare name that appears as a regex atom.

    Tried in this order:
        'dot'                       -> re.Primitive (can't be negated)
        a name in POSIX_CLASSES     -> PosixClass
        a key of PERL_CLASSES       -> PerlClass
        a name starting uppercase   -> re.Splice, e.g. HexDigit

    Any other name is reported through p_die().

    Args:
        negated_tok: the negation token, or None when the name isn't negated
        tok: the name token
    """
    name = lexer.TokenVal(tok)

    if name == 'dot':
        if negated_tok:
            p_die("Can't negate this symbol", tok)
        return re.Primitive(tok, Id.Eggex_Dot)

    if name in POSIX_CLASSES:
        return PosixClass(negated_tok, name)

    perl_name = PERL_CLASSES.get(name)
    if perl_name is not None:
        return PerlClass(negated_tok, perl_name)

    if not name[0].isupper():
        p_die("%r isn't a character class" % name, tok)

    # NOTE(review): negated_tok is not recorded on the Splice -- confirm
    # whether a negated splice should be rejected here.
    return re.Splice(tok, name)
1473
def _NameInClass(self, negated_tok, tok):
    # type: (Token, Token) -> class_literal_term_t
    """Resolve a bare name that appears inside a [...] class literal.

    Like _NameInRegex(), but 'dot' and 'd' don't mean anything within [].

    A one-character name is a literal character: d is NOT 'digit', it's a
    literal 'd'.  In the grammar, that is a range_char without an ending.
    Longer names must be in POSIX_CLASSES or PERL_CLASSES, e.g. digit or
    word; anything else is reported through p_die().

    Args:
        negated_tok: the negation token, or None when the name isn't negated
        tok: the name token
    """
    name = lexer.TokenVal(tok)

    if len(name) != 1:
        # digit, word, but not d, w, etc.
        if name in POSIX_CLASSES:
            return PosixClass(negated_tok, name)

        perl_name = PERL_CLASSES.get(name)
        if perl_name is not None:
            return PerlClass(negated_tok, perl_name)

        p_die("%r isn't a character class" % name, tok)

    # A bare, unquoted character literal.
    # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
    assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)

    if negated_tok:  # [!d] is not allowed, only [!digit]
        p_die("Can't negate this symbol", tok)

    return word_compile.EvalCharLiteralForRegex(tok)
1499
def _ReAtom(self, p_atom):
    # type: (PNode) -> re_t
    """Convert a re_atom parse node, dispatching on its first child.

    Grammar:
        re_atom: ( char_literal | ...

    Forms handled here:
        class_literal, sq_string, char_literal   (non-terminals)
        .  ^  $                                  (punctuation)
        Expr_Name                                (dot, digit, HexDigit, ...)
        %start  %end                             (symbols)
        '@' Expr_Name                            (splice)
        '!' (Expr_Name | class_literal)          (negation)
        '(' regex ')'                            (group)
        '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'

    Unsupported symbols and the !! form are reported through p_die(); an
    unrecognized first child raises AssertionError.
    """
    assert p_atom.typ == grammar_nt.re_atom, p_atom.typ

    child0 = p_atom.GetChild(0)
    typ0 = child0.typ
    tok0 = child0.tok

    #
    # Non-terminals
    #

    if typ0 == grammar_nt.class_literal:
        return re.CharClassLiteral(False, self._ClassLiteral(child0))

    if typ0 == grammar_nt.sq_string:
        return cast(SingleQuoted, child0.GetChild(1).tok)

    if typ0 == grammar_nt.char_literal:
        # Note: ERE doesn't seem to support escapes like Python
        #    https://docs.python.org/3/library/re.html
        # We might want to do a translation like this;
        #
        # \u{03bc}   -> \u03bc
        # \x00       -> \x00
        # \n         -> \n

        # Must be Id.Char_{OneChar,Hex,UBraced}
        assert consts.GetKind(tok0.id) == Kind.Char
        literal = word_compile.EvalCStringToken(tok0.id,
                                                lexer.TokenVal(tok0))
        return re.LiteralChars(tok0, literal)

    #
    # Special punctuation
    #

    if typ0 == Id.Expr_Dot:  # .
        return re.Primitive(tok0, Id.Eggex_Dot)

    if typ0 == Id.Arith_Caret:  # ^
        return re.Primitive(tok0, Id.Eggex_Start)

    if typ0 == Id.Expr_Dollar:  # $
        return re.Primitive(tok0, Id.Eggex_End)

    #
    # Names and symbols
    #

    if typ0 == Id.Expr_Name:
        # d digit -> PosixClass PerlClass etc.
        return self._NameInRegex(None, tok0)

    if typ0 == Id.Expr_Symbol:
        # Validate symbols here, like we validate PerlClass, etc.
        symbol = lexer.TokenVal(tok0)
        if symbol == '%start':
            return re.Primitive(tok0, Id.Eggex_Start)
        if symbol == '%end':
            return re.Primitive(tok0, Id.Eggex_End)
        p_die("Unexpected token %r in regex" % symbol, tok0)

    if typ0 == Id.Expr_At:
        # | '@' Expr_Name
        name_tok = p_atom.GetChild(1).tok
        return re.Splice(tok0, lexer.TokenVal(name_tok))

    if typ0 == Id.Expr_Bang:
        # | '!' (Expr_Name | class_literal)
        # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
        if p_atom.NumChildren() != 2:
            # Note: !! conflicts with shell history
            p_die(
                "Backtracking with !! isn't implemented (requires Python/PCRE)",
                p_atom.GetChild(1).tok)

        negated = p_atom.GetChild(1)
        if negated.typ == grammar_nt.class_literal:
            return re.CharClassLiteral(True, self._ClassLiteral(negated))
        return self._NameInRegex(tok0, negated.tok)

    if typ0 == Id.Op_LParen:
        # | '(' regex ')'

        # Note: in ERE (d+) is the same as <d+>.  That is, Group becomes
        # Capture.
        return re.Group(self._Regex(p_atom.GetChild(1)))

    if typ0 == Id.Arith_Less:
        # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'

        n = p_atom.NumChildren()
        assert n == 4 or n == 6 or n == 8, n

        # < capture d+ >
        regex = self._Regex(p_atom.GetChild(2))

        as_name = None  # type: Optional[Token]
        func_name = None  # type: Optional[Token]

        i = 3  # points at any of   >   as   :

        if p_atom.GetChild(i).typ == Id.Expr_As:
            as_name = p_atom.GetChild(i + 1).tok
            i += 2  # now points at   >   or   :

        if p_atom.GetChild(i).typ == Id.Arith_Colon:
            func_name = p_atom.GetChild(i + 1).tok

        return re.Capture(regex, as_name, func_name)

    raise AssertionError(typ0)
1612
def _RepeatOp(self, p_repeat):
    # type: (PNode) -> re_repeat_t
    """Convert a repeat_op parse node.

    Grammar:
        repeat_op: '+' | '*' | '?'
                 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'

        repeat_range: (
            Expr_DecInt [',']
          | ',' Expr_DecInt
          | Expr_DecInt ',' Expr_DecInt
        )

    Returns:
        The operator token itself for + * ? and for an exact count like {3};
        a re_repeat.Range for the forms that contain a comma.

    A brace form whose second child isn't a repeat_range is reported through
    p_die().
    """
    assert p_repeat.typ == grammar_nt.repeat_op, p_repeat

    tok = p_repeat.GetChild(0).tok
    id_ = tok.id

    if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
        return tok  # a+  a*  a?

    if id_ != Id.Op_LBrace:
        raise AssertionError(id_)

    range_node = p_repeat.GetChild(1)
    if range_node.typ != grammar_nt.repeat_range:
        # e.g. dot{N *} is .*?
        p_die("Perl-style repetition isn't implemented with libc",
              range_node.tok)

    n = range_node.NumChildren()

    if n == 1:  # {3}
        return range_node.GetChild(0).tok  # different operator than + * ?

    if n == 2:
        first = range_node.GetChild(0)
        if first.typ == Id.Expr_DecInt:  # {1,}  lower bound only
            left = first.tok
            return re_repeat.Range(left, lexer.TokenVal(left), '', None)
        else:  # {,3}  upper bound only
            right = range_node.GetChild(1).tok
            return re_repeat.Range(None, '', lexer.TokenVal(right), right)

    if n == 3:  # {1,3}
        left = range_node.GetChild(0).tok
        right = range_node.GetChild(2).tok
        return re_repeat.Range(left, lexer.TokenVal(left),
                               lexer.TokenVal(right), right)

    raise AssertionError(n)
1664
def _ReAlt(self, p_node):
    # type: (PNode) -> re_t
    """Convert a re_alt parse node: a sequence of atoms.

    Grammar:
        re_alt: (re_atom [repeat_op])+

    An atom directly followed by a repeat_op is wrapped in re.Repeat.  A
    single resulting item is returned as-is; otherwise the items are wrapped
    in re.Seq.
    """
    assert p_node.typ == grammar_nt.re_alt

    items = []  # type: List[re_t]
    n = p_node.NumChildren()
    i = 0
    while i < n:
        item = self._ReAtom(p_node.GetChild(i))
        i += 1

        # Consume the optional repeat_op that may follow the atom
        if i < n:
            following = p_node.GetChild(i)
            if following.typ == grammar_nt.repeat_op:
                item = re.Repeat(item, self._RepeatOp(following))
                i += 1

        items.append(item)

    if len(items) == 1:
        return items[0]
    return re.Seq(items)
1688
def _Regex(self, p_node):
    # type: (PNode) -> re_t
    """Convert a regex parse node: alternatives separated by | or 'or'.

    Grammar:
        regex: [re_alt] (('|'|'or') re_alt)*

    Every even-indexed child is converted with _ReAlt().  A single
    alternative is returned as-is; otherwise they are wrapped in re.Alt.
    """
    assert p_node.typ == grammar_nt.regex

    alts = []  # type: List[re_t]
    n = p_node.NumChildren()
    i = 0
    while i < n:
        alts.append(self._ReAlt(p_node.GetChild(i)))
        i += 2  # step over the separator

    if len(alts) == 1:
        return alts[0]
    return re.Alt(alts)
1706
1707
1708# vim: sw=4