OILS / ysh / expr_to_ast.py View on Github | oilshell.org

1708 lines, 1022 significant
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 SimpleVarSub,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46 CharCode,
47 CharRange,
48)
49from _devbuild.gen.value_asdl import value, value_t
50from _devbuild.gen import grammar_nt
51from core.error import p_die
52from core import num
53from frontend import consts
54from frontend import lexer
55from frontend import location
56from mycpp import mops
57from mycpp import mylib
58from mycpp.mylib import log, tagswitch
59from osh import word_compile
60from ysh import expr_parse
61from ysh import regex_translate
62
63from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
64if TYPE_CHECKING:
65 from pgen2.grammar import Grammar
66 from pgen2.pnode import PNode
67
68_ = log
69
70PERL_CLASSES = {
71 'd': 'd',
72 'w': 'w',
73 'word': 'w',
74 's': 's',
75}
76# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
77POSIX_CLASSES = [
78 'alnum',
79 'cntrl',
80 'lower',
81 'space',
82 'alpha',
83 'digit',
84 'print',
85 'upper',
86 'blank',
87 'graph',
88 'punct',
89 'xdigit',
90]
91# NOTE: There are also things like \p{Greek} that we could put in the
92# "non-sigil" namespace.
93
94RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"
95
96POS_ARG_MISPLACED = "Positional arg can't appear in group of named args"
97
98# Copied from pgen2/token.py to avoid dependency.
99NT_OFFSET = 256
100
101if mylib.PYTHON:
102
103 def MakeGrammarNames(ysh_grammar):
104 # type: (Grammar) -> Dict[int, str]
105
106 # TODO: Break this dependency
107 from frontend import lexer_def
108
109 names = {}
110
111 for id_name, k in lexer_def.ID_SPEC.id_str2int.items():
112 # Hm some are out of range
113 #assert k < 256, (k, id_name)
114
115 # TODO: Some tokens have values greater than NT_OFFSET
116 if k < NT_OFFSET:
117 names[k] = id_name
118
119 for k, v in ysh_grammar.number2symbol.items():
120 assert k >= NT_OFFSET, (k, v)
121 names[k] = v
122
123 return names
124
125
126class Transformer(object):
127 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
128
129 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
130 so this "transformer" is the equivalent.
131
132 Files to refer to when modifying this function:
133
134 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
135 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
136
137 Related examples:
138
139 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
140 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
141
142 Other:
143 frontend/parse_lib.py (turn on print_parse_tree)
144
145 Public methods:
146 Expr, VarDecl
147 atom, trailer, etc. are private, named after productions in grammar.pgen2.
148 """
149
150 def __init__(self, gr):
151 # type: (Grammar) -> None
152 self.number2symbol = gr.number2symbol
153 if mylib.PYTHON:
154 names = MakeGrammarNames(gr)
155 # print raw nodes
156 self.p_printer = expr_parse.ParseTreePrinter(names)
157
158 def _LeftAssoc(self, p_node):
159 # type: (PNode) -> expr_t
160 """For an associative binary operation.
161
162 Examples:
163 xor_expr: and_expr ('xor' and_expr)*
164 term: factor (('*'|'/'|'%'|'div') factor)*
165
166 3 - 1 - 2 must be grouped as ((3 - 1) - 2).
167 """
168 # Note: Compare the iteractive com_binary() method in
169 # opy/compiler2/transformer.py.
170
171 # Examples:
172 # - The PNode for '3 - 1' will have 3 children
173 # - The PNode for '3 - 1 - 2' will have 5 children
174
175 #self.p_printer.Print(p_node)
176
177 i = 1 # index of the operator
178 n = p_node.NumChildren()
179
180 left = self.Expr(p_node.GetChild(0))
181 while i < n:
182 op = p_node.GetChild(i)
183 right = self.Expr(p_node.GetChild(i + 1))
184
185 # create a new left node
186 left = expr.Binary(op.tok, left, right)
187 i += 2
188
189 return left
190
191 def _Trailer(self, base, p_trailer):
192 # type: (expr_t, PNode) -> expr_t
193 """
194 trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
195 | '.' NAME | '->' NAME | '::' NAME
196 )
197 """
198 tok0 = p_trailer.GetChild(0).tok
199 typ0 = p_trailer.GetChild(0).typ
200
201 if typ0 == Id.Op_LParen:
202 lparen = tok0
203 rparen = p_trailer.GetChild(-1).tok
204 arglist = ArgList(lparen, [], None, [], None, None, rparen)
205 if p_trailer.NumChildren() == 2: # ()
206 return expr.FuncCall(base, arglist)
207
208 p = p_trailer.GetChild(1) # the X in ( X )
209 assert p.typ == grammar_nt.arglist # f(x, y)
210 self._ArgList(p, arglist)
211 return expr.FuncCall(base, arglist)
212
213 if typ0 == Id.Op_LBracket:
214 p_args = p_trailer.GetChild(1)
215 assert p_args.typ == grammar_nt.subscriptlist
216 n = p_args.NumChildren()
217 if n > 1:
218 p_die("Only 1 subscript is accepted", p_args.GetChild(1).tok)
219
220 a = p_args.GetChild(0)
221 return Subscript(tok0, base, self._Subscript(a))
222
223 if typ0 in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
224 attr = p_trailer.GetChild(1).tok # will be Id.Expr_Name
225 return Attribute(base, tok0, attr, lexer.TokenVal(attr),
226 expr_context_e.Store)
227
228 raise AssertionError(typ0)
229
230 def _DictPair(self, p_node):
231 # type: (PNode) -> Tuple[expr_t, expr_t]
232 """
233 dict_pair: ( Expr_Name [':' test]
234 | '[' testlist ']' ':' test )
235 | sq_string ':' test
236 | dq_string ':' test )
237 """
238 assert p_node.typ == grammar_nt.dict_pair
239
240 typ = p_node.GetChild(0).typ
241
242 if typ in (grammar_nt.sq_string, grammar_nt.dq_string):
243 key = self.Expr(p_node.GetChild(0)) # type: expr_t
244 val = self.Expr(p_node.GetChild(2))
245 return key, val
246
247 tok0 = p_node.GetChild(0).tok
248 id_ = tok0.id
249
250 if id_ == Id.Expr_Name:
251 key_str = value.Str(lexer.TokenVal(tok0))
252 key = expr.Const(tok0, key_str)
253 if p_node.NumChildren() >= 3:
254 val = self.Expr(p_node.GetChild(2))
255 else:
256 val = expr.Implicit
257
258 if id_ == Id.Op_LBracket: # {[x+y]: 'val'}
259 key = self.Expr(p_node.GetChild(1))
260 val = self.Expr(p_node.GetChild(4))
261 return key, val
262
263 return key, val
264
265 def _Dict(self, parent, p_node):
266 # type: (PNode, PNode) -> expr.Dict
267 """
268 dict: dict_pair (comma_newline dict_pair)* [comma_newline]
269 """
270 if p_node.typ == Id.Op_RBrace: # {}
271 return expr.Dict(parent.tok, [], [])
272
273 assert p_node.typ == grammar_nt.dict
274
275 keys = [] # type: List[expr_t]
276 values = [] # type: List[expr_t]
277
278 n = p_node.NumChildren()
279 for i in xrange(0, n, 2):
280 key, val = self._DictPair(p_node.GetChild(i))
281 keys.append(key)
282 values.append(val)
283
284 return expr.Dict(parent.tok, keys, values)
285
286 def _Tuple(self, parent):
287 # type: (PNode) -> expr_t
288
289 n = parent.NumChildren()
290
291 # (x) -- not a tuple
292 if n == 1:
293 return self.Expr(parent.GetChild(0))
294
295 # x, and (x,) aren't allowed
296 if n == 2:
297 p_die('Invalid trailing comma', parent.GetChild(1).tok)
298
299 elts = [] # type: List[expr_t]
300 for i in xrange(0, n, 2): # skip commas
301 p_node = parent.GetChild(i)
302 elts.append(self.Expr(p_node))
303
304 return expr.Tuple(parent.tok, elts,
305 expr_context_e.Store) # unused expr_context_e
306
307 def _TestlistComp(self, parent, p_node, id0):
308 # type: (PNode, PNode, Id_t) -> expr_t
309 """
310 testlist_comp:
311 (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
312 """
313 assert p_node.typ == grammar_nt.testlist_comp
314
315 n = p_node.NumChildren()
316 if n > 1 and p_node.GetChild(1).typ == grammar_nt.comp_for:
317 elt = self.Expr(p_node.GetChild(0))
318 comp = self._CompFor(p_node.GetChild(1))
319 if id0 == Id.Op_LParen: # (x+1 for x in y)
320 return expr.GeneratorExp(elt, [comp])
321 if id0 == Id.Op_LBracket: # [x+1 for x in y]
322 return expr.ListComp(parent.tok, elt, [comp])
323 raise AssertionError()
324
325 if id0 == Id.Op_LParen:
326 # Parenthesized expression like (x+1) or (x)
327 if n == 1:
328 return self.Expr(p_node.GetChild(0))
329
330 # Tuples (1,) (1, 2) etc. - TODO: should be a list literal?
331 if p_node.GetChild(1).typ == Id.Arith_Comma:
332 return self._Tuple(p_node)
333
334 raise AssertionError()
335
336 if id0 == Id.Op_LBracket: # List [1,2,3]
337 elts = [] # type: List[expr_t]
338 for i in xrange(0, n, 2): # skip commas
339 elts.append(self.Expr(p_node.GetChild(i)))
340
341 return expr.List(parent.tok, elts,
342 expr_context_e.Store) # unused expr_context_e
343
344 raise AssertionError(Id_str(id0))
345
346 def _Atom(self, parent):
347 # type: (PNode) -> expr_t
348 """Handle alternatives of 'atom' where there's more than one child."""
349
350 tok = parent.GetChild(0).tok
351 id_ = tok.id
352 n = parent.NumChildren()
353
354 if id_ == Id.Op_LParen:
355 # atom: '(' [yield_expr|testlist_comp] ')' | ...
356 if n == 2: # () is a tuple
357 assert (
358 parent.GetChild(1).typ == Id.Op_RParen), parent.GetChild(1)
359 return expr.Tuple(tok, [], expr_context_e.Store)
360
361 return self._TestlistComp(parent, parent.GetChild(1), id_)
362
363 if id_ == Id.Op_LBracket:
364 # atom: ... | '[' [testlist_comp] ']' | ...
365
366 if n == 2: # []
367 assert (parent.GetChild(1).typ == Id.Op_RBracket
368 ), parent.GetChild(1)
369 return expr.List(tok, [],
370 expr_context_e.Store) # unused expr_context_e
371
372 return self._TestlistComp(parent, parent.GetChild(1), id_)
373
374 if id_ == Id.Left_CaretBracket: # ^[42 + x]
375 child = self.Expr(parent.GetChild(1))
376 return expr.Literal(child)
377
378 if id_ == Id.Op_LBrace:
379 # atom: ... | '{' [Op_Newline] [dict] '}'
380 i = 1
381 if parent.GetChild(i).typ == Id.Op_Newline:
382 i += 1
383 return self._Dict(parent, parent.GetChild(i))
384
385 if id_ == Id.Arith_Amp:
386 n = parent.NumChildren()
387 if n >= 3:
388 p_die("Places in containers not implemented yet",
389 parent.GetChild(2).tok)
390
391 name_tok = parent.GetChild(1).tok
392 return expr.Place(name_tok, lexer.TokenVal(name_tok), [])
393
394 if id_ == Id.Expr_Func:
395 # STUB. This should really be a Func, not Lambda.
396 return expr.Lambda([], expr.Implicit)
397
398 # 100 M
399 # Ignoring the suffix for now
400 if id_ == Id.Expr_DecInt:
401 assert n > 1
402 p_die("Units suffix not implemented", parent.GetChild(1).tok)
403 #return self.Expr(parent.GetChild(0))
404
405 # 100.5 M
406 # Ignoring the suffix for now
407 if id_ == Id.Expr_Float:
408 assert n > 1
409 p_die("unix suffix implemented", parent.GetChild(1).tok)
410 #return self.Expr(parent.GetChild(0))
411
412 raise AssertionError(Id_str(id_))
413
414 def _NameType(self, p_node):
415 # type: (PNode) -> NameType
416 """ name_type: Expr_Name [':'] [type_expr] """
417 name_tok = p_node.GetChild(0).tok
418 typ = None # type: Optional[TypeExpr]
419
420 n = p_node.NumChildren()
421 if n == 2:
422 typ = self._TypeExpr(p_node.GetChild(1))
423 if n == 3:
424 typ = self._TypeExpr(p_node.GetChild(2))
425
426 return NameType(name_tok, lexer.TokenVal(name_tok), typ)
427
428 def _NameTypeList(self, p_node):
429 # type: (PNode) -> List[NameType]
430 """ name_type_list: name_type (',' name_type)* """
431 assert p_node.typ == grammar_nt.name_type_list
432 results = [] # type: List[NameType]
433
434 n = p_node.NumChildren()
435 for i in xrange(0, n, 2): # was children[::2]
436 results.append(self._NameType(p_node.GetChild(i)))
437 return results
438
439 def _CompFor(self, p_node):
440 # type: (PNode) -> Comprehension
441 """comp_for: 'for' exprlist 'in' or_test ['if' or_test]"""
442 lhs = self._NameTypeList(p_node.GetChild(1))
443 iterable = self.Expr(p_node.GetChild(3))
444
445 if p_node.NumChildren() >= 6:
446 cond = self.Expr(p_node.GetChild(5))
447 else:
448 cond = None
449
450 return Comprehension(lhs, iterable, cond)
451
452 def _CompareChain(self, parent):
453 # type: (PNode) -> expr_t
454 """comparison: expr (comp_op expr)*"""
455 cmp_ops = [] # type: List[Token]
456 comparators = [] # type: List[expr_t]
457 left = self.Expr(parent.GetChild(0))
458
459 i = 1
460 n = parent.NumChildren()
461 while i < n:
462 p = parent.GetChild(i)
463 op = p.GetChild(0).tok
464 if p.NumChildren() == 2:
465 # Blame the first token, and change its type
466 if op.id == Id.Expr_Not: # not in
467 op.id = Id.Node_NotIn
468 elif op.id == Id.Expr_Is: # is not
469 op.id = Id.Node_IsNot
470 else:
471 raise AssertionError()
472 else:
473 # is, <, ==, etc.
474 pass
475
476 cmp_ops.append(op)
477 i += 1
478 comparators.append(self.Expr(parent.GetChild(i)))
479 i += 1
480 return expr.Compare(left, cmp_ops, comparators)
481
482 def _Subscript(self, parent):
483 # type: (PNode) -> expr_t
484 """subscript: expr | [expr] ':' [expr]"""
485 typ0 = parent.GetChild(0).typ
486
487 n = parent.NumChildren()
488
489 if typ0 == grammar_nt.expr:
490 if n == 3: # a[1:2]
491 lower = self.Expr(parent.GetChild(0))
492 upper = self.Expr(parent.GetChild(2))
493 elif n == 2: # a[1:]
494 lower = self.Expr(parent.GetChild(0))
495 upper = None
496 else: # a[1]
497 return self.Expr(parent.GetChild(0))
498 else:
499 assert typ0 == Id.Arith_Colon
500 lower = None
501 if n == 1: # a[:]
502 upper = None
503 else: # a[:3]
504 upper = self.Expr(parent.GetChild(1))
505
506 return expr.Slice(lower, parent.GetChild(0).tok, upper)
507
508 def Expr(self, pnode):
509 # type: (PNode) -> expr_t
510 """Transform expressions (as opposed to statements)"""
511 typ = pnode.typ
512
513 #
514 # YSH Entry Points / Additions
515 #
516
517 if typ == grammar_nt.ysh_expr: # for if/while
518 # ysh_expr: '(' testlist ')'
519 return self.Expr(pnode.GetChild(1))
520
521 if typ == grammar_nt.command_expr:
522 # return_expr: testlist end_stmt
523 return self.Expr(pnode.GetChild(0))
524
525 #
526 # Python-like Expressions / Operators
527 #
528
529 if typ == grammar_nt.atom:
530 if pnode.NumChildren() == 1:
531 return self.Expr(pnode.GetChild(0))
532 return self._Atom(pnode)
533
534 if typ == grammar_nt.testlist:
535 # testlist: test (',' test)* [',']
536 return self._Tuple(pnode)
537
538 if typ == grammar_nt.test:
539 # test: or_test ['if' or_test 'else' test] | lambdef
540 if pnode.NumChildren() == 1:
541 return self.Expr(pnode.GetChild(0))
542
543 # TODO: Handle lambdef
544
545 test = self.Expr(pnode.GetChild(2))
546 body = self.Expr(pnode.GetChild(0))
547 orelse = self.Expr(pnode.GetChild(4))
548 return expr.IfExp(test, body, orelse)
549
550 if typ == grammar_nt.lambdef:
551 # lambdef: '|' [name_type_list] '|' test
552
553 n = pnode.NumChildren()
554 if n == 4:
555 params = self._NameTypeList(pnode.GetChild(1))
556 else:
557 params = []
558
559 body = self.Expr(pnode.GetChild(n - 1))
560 return expr.Lambda(params, body)
561
562 #
563 # Operators with Precedence
564 #
565
566 if typ == grammar_nt.or_test:
567 # or_test: and_test ('or' and_test)*
568 return self._LeftAssoc(pnode)
569
570 if typ == grammar_nt.and_test:
571 # and_test: not_test ('and' not_test)*
572 return self._LeftAssoc(pnode)
573
574 if typ == grammar_nt.not_test:
575 # not_test: 'not' not_test | comparison
576 if pnode.NumChildren() == 1:
577 return self.Expr(pnode.GetChild(0))
578
579 op_tok = pnode.GetChild(0).tok # not
580 return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))
581
582 elif typ == grammar_nt.comparison:
583 if pnode.NumChildren() == 1:
584 return self.Expr(pnode.GetChild(0))
585
586 return self._CompareChain(pnode)
587
588 elif typ == grammar_nt.range_expr:
589 n = pnode.NumChildren()
590 if n == 1:
591 return self.Expr(pnode.GetChild(0))
592
593 if n == 3:
594 return expr.Range(self.Expr(pnode.GetChild(0)),
595 pnode.GetChild(1).tok,
596 self.Expr(pnode.GetChild(2)))
597
598 raise AssertionError(n)
599
600 elif typ == grammar_nt.expr:
601 # expr: xor_expr ('|' xor_expr)*
602 return self._LeftAssoc(pnode)
603
604 if typ == grammar_nt.xor_expr:
605 # xor_expr: and_expr ('xor' and_expr)*
606 return self._LeftAssoc(pnode)
607
608 if typ == grammar_nt.and_expr: # a & b
609 # and_expr: shift_expr ('&' shift_expr)*
610 return self._LeftAssoc(pnode)
611
612 elif typ == grammar_nt.shift_expr:
613 # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
614 return self._LeftAssoc(pnode)
615
616 elif typ == grammar_nt.arith_expr:
617 # arith_expr: term (('+'|'-') term)*
618 return self._LeftAssoc(pnode)
619
620 elif typ == grammar_nt.term:
621 # term: factor (('*'|'/'|'div'|'mod') factor)*
622 return self._LeftAssoc(pnode)
623
624 elif typ == grammar_nt.factor:
625 # factor: ('+'|'-'|'~') factor | power
626 # the power would have already been reduced
627 if pnode.NumChildren() == 1:
628 return self.Expr(pnode.GetChild(0))
629
630 assert pnode.NumChildren() == 2
631 op = pnode.GetChild(0)
632 e = pnode.GetChild(1)
633
634 assert isinstance(op.tok, Token)
635 return expr.Unary(op.tok, self.Expr(e))
636
637 elif typ == grammar_nt.power:
638 # power: atom trailer* ['**' factor]
639
640 node = self.Expr(pnode.GetChild(0))
641 if pnode.NumChildren() == 1: # No trailers
642 return node
643
644 # Support a->startswith(b) and mydict.key
645 n = pnode.NumChildren()
646 i = 1
647 while i < n and pnode.GetChild(i).typ == grammar_nt.trailer:
648 node = self._Trailer(node, pnode.GetChild(i))
649 i += 1
650
651 if i != n: # ['**' factor]
652 op_tok = pnode.GetChild(i).tok
653 assert op_tok.id == Id.Arith_DStar, op_tok
654 factor = self.Expr(pnode.GetChild(i + 1))
655 node = expr.Binary(op_tok, node, factor)
656
657 return node
658
659 elif typ == grammar_nt.eggex:
660 return self._Eggex(pnode)
661
662 elif typ == grammar_nt.ysh_expr_sub:
663 return self.Expr(pnode.GetChild(0))
664
665 #
666 # YSH Lexer Modes
667 #
668
669 elif typ == grammar_nt.sh_array_literal:
670 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
671
672 elif typ == grammar_nt.old_sh_array_literal:
673 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
674
675 elif typ == grammar_nt.sh_command_sub:
676 return cast(CommandSub, pnode.GetChild(1).tok)
677
678 elif typ == grammar_nt.braced_var_sub:
679 return cast(BracedVarSub, pnode.GetChild(1).tok)
680
681 elif typ == grammar_nt.dq_string:
682 s = cast(DoubleQuoted, pnode.GetChild(1).tok)
683 # sugar: ^"..." is short for ^["..."]
684 if pnode.GetChild(0).typ == Id.Left_CaretDoubleQuote:
685 return expr.Literal(s)
686 return s
687
688 elif typ == grammar_nt.sq_string:
689 return cast(SingleQuoted, pnode.GetChild(1).tok)
690
691 elif typ == grammar_nt.simple_var_sub:
692 tok = pnode.GetChild(0).tok
693
694 if tok.id == Id.VSub_DollarName: # $foo is disallowed
695 bare = lexer.TokenSliceLeft(tok, 1)
696 p_die(
697 'In expressions, remove $ and use `%s`, or sometimes "$%s"'
698 % (bare, bare), tok)
699
700 # $? is allowed
701 return SimpleVarSub(tok)
702
703 #
704 # Terminals
705 #
706
707 tok = pnode.tok
708 if typ == Id.Expr_Name:
709 return expr.Var(tok, lexer.TokenVal(tok))
710
711 # Everything else is an expr.Const
712 tok_str = lexer.TokenVal(tok)
713 # Remove underscores from 1_000_000. The lexer is responsible for
714 # validation.
715 c_under = tok_str.replace('_', '')
716
717 if typ == Id.Expr_DecInt:
718 try:
719 cval = value.Int(mops.FromStr(c_under)) # type: value_t
720 except ValueError:
721 p_die('Decimal int constant is too large', tok)
722
723 elif typ == Id.Expr_BinInt:
724 assert c_under[:2] in ('0b', '0B'), c_under
725 try:
726 cval = value.Int(mops.FromStr(c_under[2:], 2))
727 except ValueError:
728 p_die('Binary int constant is too large', tok)
729
730 elif typ == Id.Expr_OctInt:
731 assert c_under[:2] in ('0o', '0O'), c_under
732 try:
733 cval = value.Int(mops.FromStr(c_under[2:], 8))
734 except ValueError:
735 p_die('Octal int constant is too large', tok)
736
737 elif typ == Id.Expr_HexInt:
738 assert c_under[:2] in ('0x', '0X'), c_under
739 try:
740 cval = value.Int(mops.FromStr(c_under[2:], 16))
741 except ValueError:
742 p_die('Hex int constant is too large', tok)
743
744 elif typ == Id.Expr_Float:
745 # Note: float() in mycpp/gc_builtins.cc currently uses strtod
746 # I think this never raises ValueError, because the lexer
747 # should only accept strings that strtod() does?
748 cval = value.Float(float(c_under))
749
750 elif typ == Id.Expr_Null:
751 cval = value.Null
752
753 elif typ == Id.Expr_True:
754 cval = value.Bool(True)
755
756 elif typ == Id.Expr_False:
757 cval = value.Bool(False)
758
759 # What to do with the char constants?
760 # \n \u{3bc} #'a'
761 # Are they integers or strings?
762 #
763 # Integers could be ord(\n), or strings could chr(\n)
764 # Or just remove them, with ord(u'\n') and chr(u'\n')
765 #
766 # I think this relies on small string optimization. If we have it,
767 # then 1-4 byte characters are efficient, and don't require heap
768 # allocation.
769
770 elif typ == Id.Char_OneChar:
771 # TODO: look up integer directly?
772 cval = num.ToBig(ord(consts.LookupCharC(tok_str[1])))
773 elif typ == Id.Char_UBraced:
774 hex_str = tok_str[3:-1] # \u{123}
775 # ValueError shouldn't happen because lexer validates
776 cval = value.Int(mops.FromStr(hex_str, 16))
777
778 # This could be a char integer? Not sure
779 elif typ == Id.Char_Pound:
780 # TODO: accept UTF-8 code point instead of single byte
781 byte = tok_str[2] # the a in #'a'
782 cval = num.ToBig(ord(byte)) # It's an integer
783
784 else:
785 raise AssertionError(typ)
786
787 return expr.Const(tok, cval)
788
789 def _CheckLhs(self, lhs):
790 # type: (expr_t) -> None
791
792 UP_lhs = lhs
793 with tagswitch(lhs) as case:
794 if case(expr_e.Var):
795 # OK - e.g. setvar a.b.c[i] = 42
796 pass
797
798 elif case(expr_e.Subscript):
799 lhs = cast(Subscript, UP_lhs)
800 self._CheckLhs(lhs.obj) # recurse on LHS
801
802 elif case(expr_e.Attribute):
803 lhs = cast(Attribute, UP_lhs)
804 self._CheckLhs(lhs.obj) # recurse on LHS
805
806 else:
807 # Illegal - e.g. setglobal {}["key"] = 42
808 p_die("Subscript/Attribute not allowed on this LHS expression",
809 location.TokenForExpr(lhs))
810
811 def _LhsExprList(self, p_node):
812 # type: (PNode) -> List[y_lhs_t]
813 """lhs_list: expr (',' expr)*"""
814 assert p_node.typ == grammar_nt.lhs_list
815
816 lhs_list = [] # type: List[y_lhs_t]
817 n = p_node.NumChildren()
818 for i in xrange(0, n, 2):
819 p = p_node.GetChild(i)
820 #self.p_printer.Print(p)
821
822 e = self.Expr(p)
823 UP_e = e
824 with tagswitch(e) as case:
825 if case(expr_e.Var):
826 e = cast(expr.Var, UP_e)
827 lhs_list.append(e.left)
828
829 elif case(expr_e.Subscript):
830 e = cast(Subscript, UP_e)
831 self._CheckLhs(e)
832 lhs_list.append(e)
833
834 elif case(expr_e.Attribute):
835 e = cast(Attribute, UP_e)
836 self._CheckLhs(e)
837 if e.op.id != Id.Expr_Dot:
838 # e.g. setvar obj->method is not valid
839 p_die("Can't assign to this attribute expr", e.op)
840 lhs_list.append(e)
841
842 else:
843 pass # work around mycpp bug
844
845 # TODO: could blame arbitary expr_t, bu this works most of
846 # the time
847 if p.tok:
848 blame = p.tok # type: loc_t
849 else:
850 blame = loc.Missing
851 p_die("Can't assign to this expression", blame)
852
853 return lhs_list
854
855 def MakeVarDecl(self, p_node):
856 # type: (PNode) -> command.VarDecl
857 """
858 ysh_var_decl: name_type_list ['=' testlist] end_stmt
859 """
860 assert p_node.typ == grammar_nt.ysh_var_decl
861
862 lhs = self._NameTypeList(p_node.GetChild(0)) # could be a tuple
863
864 # This syntax is confusing, and different than JavaScript
865 # var x, y = 1, 2
866 # But this is useful:
867 # var flag, i = parseArgs(spec, argv)
868
869 n = p_node.NumChildren()
870 if n >= 3:
871 rhs = self.Expr(p_node.GetChild(2))
872 else:
873 rhs = None
874
875 # The caller should fill in the keyword token.
876 return command.VarDecl(None, lhs, rhs)
877
878 def MakeMutation(self, p_node):
879 # type: (PNode) -> command.Mutation
880 """
881 ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
882 """
883 typ = p_node.typ
884 assert typ == grammar_nt.ysh_mutation
885
886 lhs_list = self._LhsExprList(p_node.GetChild(0)) # could be a tuple
887 op_tok = p_node.GetChild(1).tok
888 if len(lhs_list) > 1 and op_tok.id != Id.Arith_Equal:
889 p_die('Multiple assignment must use =', op_tok)
890 rhs = self.Expr(p_node.GetChild(2))
891 return command.Mutation(None, lhs_list, op_tok, rhs)
892
893 def _EggexFlag(self, p_node):
894 # type: (PNode) -> EggexFlag
895 n = p_node.NumChildren()
896 if n == 1:
897 return EggexFlag(False, p_node.GetChild(0).tok)
898 elif n == 2:
899 return EggexFlag(True, p_node.GetChild(1).tok)
900 else:
901 raise AssertionError()
902
903 def _Eggex(self, p_node):
904 # type: (PNode) -> Eggex
905 """
906 eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
907 """
908 left = p_node.GetChild(0).tok
909 regex = self._Regex(p_node.GetChild(1))
910
911 flags = [] # type: List[EggexFlag]
912 trans_pref = None # type: Optional[Token]
913
914 i = 2
915 current = p_node.GetChild(i)
916 if current.typ == Id.Op_Semi:
917 i += 1
918 while True:
919 current = p_node.GetChild(i)
920 if current.typ != grammar_nt.re_flag:
921 break
922 flags.append(self._EggexFlag(current))
923 i += 1
924
925 if current.typ == Id.Op_Semi:
926 i += 1
927 trans_pref = p_node.GetChild(i).tok
928
929 # Canonicalize and validate flags for ERE only. Default is ERE.
930 if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
931 canonical_flags = regex_translate.CanonicalFlags(flags)
932 else:
933 canonical_flags = None
934
935 return Eggex(left, regex, flags, trans_pref, canonical_flags)
936
937 def YshCasePattern(self, pnode):
938 # type: (PNode) -> pat_t
939 assert pnode.typ == grammar_nt.ysh_case_pat, pnode
940
941 pattern = pnode.GetChild(0)
942 typ = pattern.typ
943 if typ == Id.Op_LParen:
944 # pat_expr or pat_else
945 pattern = pnode.GetChild(1)
946 typ = pattern.typ
947
948 if typ == grammar_nt.pat_else:
949 return pat.Else
950
951 if typ == grammar_nt.pat_exprs:
952 exprs = [] # type: List[expr_t]
953 for i in xrange(pattern.NumChildren()):
954 child = pattern.GetChild(i)
955 if child.typ == grammar_nt.expr:
956 expr = self.Expr(child)
957 exprs.append(expr)
958 return pat.YshExprs(exprs)
959
960 if typ == grammar_nt.eggex:
961 return self._Eggex(pattern)
962
963 raise AssertionError()
964
965 def _BlockArg(self, p_node):
966 # type: (PNode) -> expr_t
967
968 n = p_node.NumChildren()
969 if n == 1:
970 child = p_node.GetChild(0)
971 return self.Expr(child)
972
973 # It can only be an expression, not a=42, or ...expr
974 p_die('Invalid block expression argument', p_node.tok)
975
976 def _Argument(self, p_node, after_semi, arglist):
977 # type: (PNode, bool, ArgList) -> None
978 """
979 argument: (
980 test [comp_for]
981 | test '=' test # named arg
982 | '...' test # var args
983 )
984 """
985 pos_args = arglist.pos_args
986 named_args = arglist.named_args
987
988 assert p_node.typ == grammar_nt.argument, p_node
989 n = p_node.NumChildren()
990 if n == 1:
991 child = p_node.GetChild(0)
992 if after_semi:
993 p_die(POS_ARG_MISPLACED, child.tok)
994 arg = self.Expr(child)
995 pos_args.append(arg)
996 return
997
998 if n == 2:
999 # Note: We allow multiple spreads, just like Julia. They are
1000 # concatenated as in lists and dicts.
1001 tok0 = p_node.GetChild(0).tok
1002 if tok0.id == Id.Expr_Ellipsis:
1003 spread_expr = expr.Spread(tok0, self.Expr(p_node.GetChild(1)))
1004 if after_semi: # f(; ... named)
1005 named_args.append(NamedArg(None, spread_expr))
1006 else: # f(...named)
1007 pos_args.append(spread_expr)
1008 return
1009
1010 # Note: generator expression not implemented
1011 if p_node.GetChild(1).typ == grammar_nt.comp_for:
1012 child = p_node.GetChild(0)
1013 if after_semi:
1014 p_die(POS_ARG_MISPLACED, child.tok)
1015
1016 elt = self.Expr(child)
1017 comp = self._CompFor(p_node.GetChild(1))
1018 arg = expr.GeneratorExp(elt, [comp])
1019 pos_args.append(arg)
1020 return
1021
1022 raise AssertionError()
1023
1024 if n == 3: # named args can come before or after the semicolon
1025 n1 = NamedArg(
1026 p_node.GetChild(0).tok, self.Expr(p_node.GetChild(2)))
1027 named_args.append(n1)
1028 return
1029
1030 raise AssertionError()
1031
1032 def _ArgGroup(self, p_node, after_semi, arglist):
1033 # type: (PNode, bool, ArgList) -> None
1034 """
1035 arg_group: argument (',' argument)* [',']
1036 """
1037 for i in xrange(p_node.NumChildren()):
1038 p_child = p_node.GetChild(i)
1039 if p_child.typ == grammar_nt.argument:
1040 self._Argument(p_child, after_semi, arglist)
1041
1042 def _ArgList(self, p_node, arglist):
1043 # type: (PNode, ArgList) -> None
1044 """For both funcs and procs
1045
1046 arglist: (
1047 [arg_group]
1048 [';' [arg_group]]
1049 )
1050
1051 arglist3: ...
1052 """
1053 n = p_node.NumChildren()
1054 if n == 0:
1055 return
1056
1057 i = 0
1058
1059 if i >= n:
1060 return
1061 child = p_node.GetChild(i)
1062 if child.typ == grammar_nt.arg_group:
1063 self._ArgGroup(child, False, arglist)
1064 i += 1
1065
1066 if i >= n:
1067 return
1068 child = p_node.GetChild(i)
1069 if child.typ == Id.Op_Semi:
1070 arglist.semi_tok = child.tok
1071 i += 1
1072
1073 # Named args after first semi-colon
1074 if i >= n:
1075 return
1076 child = p_node.GetChild(i)
1077 if child.typ == grammar_nt.arg_group:
1078 self._ArgGroup(child, True, arglist)
1079 i += 1
1080
1081 #
1082 # Special third group may have block expression - only for arglist3,
1083 # used for procs!
1084 #
1085
1086 if i >= n:
1087 return
1088 assert p_node.typ == grammar_nt.arglist3, p_node
1089
1090 child = p_node.GetChild(i)
1091 if child.typ == Id.Op_Semi:
1092 arglist.semi_tok2 = child.tok
1093 i += 1
1094
1095 if i >= n:
1096 return
1097 child = p_node.GetChild(i)
1098 if child.typ == grammar_nt.argument:
1099 arglist.block_expr = self._BlockArg(child)
1100 i += 1
1101
1102 def ProcCallArgs(self, pnode, arglist):
1103 # type: (PNode, ArgList) -> None
1104 """
1105 ysh_eager_arglist: '(' [arglist3] ')'
1106 ysh_lazy_arglist: '[' [arglist] ']'
1107 """
1108 n = pnode.NumChildren()
1109 if n == 2: # f()
1110 return
1111
1112 if n == 3:
1113 child1 = pnode.GetChild(1) # the X in '( X )'
1114
1115 self._ArgList(child1, arglist)
1116 return
1117
1118 raise AssertionError()
1119
1120 def _TypeExpr(self, pnode):
1121 # type: (PNode) -> TypeExpr
1122 """
1123 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
1124 """
1125 assert pnode.typ == grammar_nt.type_expr, pnode.typ
1126
1127 ty = TypeExpr.CreateNull() # don't allocate children
1128
1129 ty.tok = pnode.GetChild(0).tok
1130 ty.name = lexer.TokenVal(ty.tok)
1131
1132 n = pnode.NumChildren()
1133 if n == 1:
1134 return ty
1135
1136 ty.params = []
1137 i = 2
1138 while i < n:
1139 p = self._TypeExpr(pnode.GetChild(i))
1140 ty.params.append(p)
1141 i += 2 # skip comma
1142
1143 return ty
1144
1145 def _Param(self, pnode):
1146 # type: (PNode) -> Param
1147 """
1148 param: Expr_Name [type_expr] ['=' expr]
1149 """
1150 assert pnode.typ == grammar_nt.param
1151
1152 name_tok = pnode.GetChild(0).tok
1153 n = pnode.NumChildren()
1154
1155 assert name_tok.id == Id.Expr_Name, name_tok
1156
1157 default_val = None # type: expr_t
1158 type_ = None # type: TypeExpr
1159
1160 if n == 1:
1161 # proc p(a)
1162 pass
1163
1164 elif n == 2:
1165 # proc p(a Int)
1166 type_ = self._TypeExpr(pnode.GetChild(1))
1167
1168 elif n == 3:
1169 # proc p(a = 3)
1170 default_val = self.Expr(pnode.GetChild(2))
1171
1172 elif n == 4:
1173 # proc p(a Int = 3)
1174 type_ = self._TypeExpr(pnode.GetChild(1))
1175 default_val = self.Expr(pnode.GetChild(3))
1176
1177 return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1178
1179 def _ParamGroup(self, p_node):
1180 # type: (PNode) -> ParamGroup
1181 """
1182 param_group:
1183 (param ',')*
1184 [ (param | '...' Expr_Name) [,] ]
1185 """
1186 assert p_node.typ == grammar_nt.param_group, p_node
1187
1188 params = [] # type: List[Param]
1189 rest_of = None # type: Optional[RestParam]
1190
1191 n = p_node.NumChildren()
1192 i = 0
1193 while i < n:
1194 child = p_node.GetChild(i)
1195 if child.typ == grammar_nt.param:
1196 params.append(self._Param(child))
1197
1198 elif child.typ == Id.Expr_Ellipsis:
1199 tok = p_node.GetChild(i + 1).tok
1200 rest_of = RestParam(tok, lexer.TokenVal(tok))
1201
1202 i += 2
1203
1204 return ParamGroup(params, rest_of)
1205
1206 def Proc(self, p_node):
1207 # type: (PNode) -> proc_sig_t
1208 """
1209 ysh_proc: (
1210 [ '('
1211 [ param_group ] # word params, with defaults
1212 [ ';' [ param_group ] ] # positional typed params, with defaults
1213 [ ';' [ param_group ] ] # named params, with defaults
1214 [ ';' Expr_Name ] # optional block param, with no type or default
1215 ')'
1216 ]
1217 '{' # opening { for pgen2
1218 )
1219 """
1220 typ = p_node.typ
1221 assert typ == grammar_nt.ysh_proc
1222
1223 n = p_node.NumChildren()
1224 if n == 1: # proc f {
1225 return proc_sig.Open
1226
1227 if n == 3: # proc f () {
1228 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1229
1230 # proc f( three param groups, and block group )
1231 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1232
1233 # Word args
1234 i = 1
1235 child = p_node.GetChild(i)
1236 if child.typ == grammar_nt.param_group:
1237 sig.word = self._ParamGroup(p_node.GetChild(i))
1238
1239 # Validate word args
1240 for word in sig.word.params:
1241 if word.type:
1242 if word.type.name not in ('Str', 'Ref'):
1243 p_die('Word params may only have type Str or Ref',
1244 word.type.tok)
1245 if word.type.params is not None:
1246 p_die('Unexpected type parameters', word.type.tok)
1247
1248 i += 2
1249 else:
1250 i += 1
1251
1252 #log('i %d n %d', i, n)
1253 if i >= n:
1254 return sig
1255
1256 # Positional args
1257 child = p_node.GetChild(i)
1258 if child.typ == grammar_nt.param_group:
1259 sig.positional = self._ParamGroup(p_node.GetChild(i))
1260 i += 2
1261 else:
1262 i += 1
1263
1264 #log('i %d n %d', i, n)
1265 if i >= n:
1266 return sig
1267
1268 # Keyword args
1269 child = p_node.GetChild(i)
1270 if child.typ == grammar_nt.param_group:
1271 sig.named = self._ParamGroup(p_node.GetChild(i))
1272 i += 2
1273 else:
1274 i += 1
1275
1276 #log('i %d n %d', i, n)
1277 if i >= n:
1278 return sig
1279
1280 child = p_node.GetChild(i)
1281 if child.typ == grammar_nt.param_group:
1282 group = self._ParamGroup(p_node.GetChild(i))
1283 params = group.params
1284 if len(params) > 1:
1285 p_die('Only 1 block param is allowed', params[1].blame_tok)
1286 if group.rest_of:
1287 p_die("Rest param isn't allowed for blocks",
1288 group.rest_of.blame_tok)
1289
1290 if len(params) == 1:
1291 if params[0].type:
1292 if params[0].type.name != 'Command':
1293 p_die('Block param must have type Command',
1294 params[0].type.tok)
1295 if params[0].type.params is not None:
1296 p_die('Unexpected type parameters', params[0].type.tok)
1297
1298 sig.block_param = params[0]
1299
1300 return sig
1301
1302 def YshFunc(self, p_node, out):
1303 # type: (PNode, Func) -> None
1304 """
1305 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'
1306 """
1307 assert p_node.typ == grammar_nt.ysh_func
1308
1309 #self.p_printer.Print(p_node)
1310
1311 out.name = p_node.GetChild(0).tok
1312
1313 n = p_node.NumChildren()
1314 i = 2 # after (
1315
1316 child = p_node.GetChild(i)
1317 if child.typ == grammar_nt.param_group:
1318 out.positional = self._ParamGroup(child)
1319 i += 2 # skip past ;
1320 else:
1321 i += 1
1322
1323 if i >= n:
1324 return
1325
1326 child = p_node.GetChild(i)
1327 if child.typ == grammar_nt.param_group:
1328 out.named = self._ParamGroup(child)
1329
1330 #
1331 # Eggex Language
1332 #
1333
1334 def _RangeCharSingleQuoted(self, p_node):
1335 # type: (PNode) -> Optional[CharCode]
1336
1337 assert p_node.typ == grammar_nt.range_char, p_node
1338
1339 # 'a' in 'a'-'b'
1340
1341 child0 = p_node.GetChild(0)
1342 if child0.typ == grammar_nt.sq_string:
1343 sq_part = cast(SingleQuoted, child0.GetChild(1).tok)
1344 n = len(sq_part.sval)
1345 if n == 0:
1346 p_die("Quoted range char can't be empty",
1347 loc.WordPart(sq_part))
1348 elif n == 1:
1349 return CharCode(sq_part.left, ord(sq_part.sval[0]), False)
1350 else:
1351 p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
1352 return None
1353
1354 def _OtherRangeToken(self, p_node):
1355 # type: (PNode) -> Token
1356 """An endpoint of a range (single char)
1357
1358 range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
1359 a-z 0-9 'a'-'z' \x00-\xff
1360 """
1361 assert p_node.typ == grammar_nt.range_char, p_node
1362
1363 child0 = p_node.GetChild(0)
1364 if child0.typ == grammar_nt.char_literal:
1365 # \x00 in /[\x00 - \x20]/
1366 tok = child0.GetChild(0).tok
1367 return tok
1368
1369 tok = p_node.tok
1370 # a in a-z is Expr_Name
1371 # 0 in 0-9 is Expr_DecInt
1372 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt), tok
1373
1374 if tok.length != 1:
1375 p_die(RANGE_POINT_TOO_LONG, tok)
1376 return tok
1377
1378 def _NonRangeChars(self, p_node):
1379 # type: (PNode) -> class_literal_term_t
1380 """
1381 \" \u1234 '#'
1382 """
1383 assert p_node.typ == grammar_nt.range_char, p_node
1384
1385 child0 = p_node.GetChild(0)
1386 typ0 = p_node.GetChild(0).typ
1387
1388 if typ0 == grammar_nt.sq_string:
1389 return cast(SingleQuoted, child0.GetChild(1).tok)
1390
1391 if typ0 == grammar_nt.char_literal:
1392 return word_compile.EvalCharLiteralForRegex(child0.tok)
1393
1394 if typ0 == Id.Expr_Name:
1395 # Look up PerlClass and PosixClass
1396 return self._NameInClass(None, child0.tok)
1397
1398 raise AssertionError()
1399
1400 def _ClassLiteralTerm(self, p_node):
1401 # type: (PNode) -> class_literal_term_t
1402 """
1403 class_literal_term:
1404 range_char ['-' range_char ]
1405 | '@' Expr_Name # splice
1406 | '!' Expr_Name # negate char class
1407 ...
1408 """
1409 assert p_node.typ == grammar_nt.class_literal_term, p_node
1410
1411 typ0 = p_node.GetChild(0).typ
1412
1413 if typ0 == grammar_nt.range_char:
1414 n = p_node.NumChildren()
1415
1416 if n == 1:
1417 return self._NonRangeChars(p_node.GetChild(0))
1418
1419 # 'a'-'z' etc.
1420 if n == 3:
1421 assert p_node.GetChild(1).typ == Id.Arith_Minus, p_node
1422
1423 left = p_node.GetChild(0)
1424 right = p_node.GetChild(2)
1425
1426 code1 = self._RangeCharSingleQuoted(left)
1427 if code1 is None:
1428 tok1 = self._OtherRangeToken(left)
1429 code1 = word_compile.EvalCharLiteralForRegex(tok1)
1430
1431 code2 = self._RangeCharSingleQuoted(right)
1432 if code2 is None:
1433 tok2 = self._OtherRangeToken(right)
1434 code2 = word_compile.EvalCharLiteralForRegex(tok2)
1435 return CharRange(code1, code2)
1436
1437 raise AssertionError()
1438
1439 if typ0 == Id.Expr_At:
1440 tok1 = p_node.GetChild(1).tok
1441 return class_literal_term.Splice(tok1, lexer.TokenVal(tok1))
1442
1443 if typ0 == Id.Expr_Bang:
1444 return self._NameInClass(
1445 p_node.GetChild(0).tok,
1446 p_node.GetChild(1).tok)
1447
1448 p_die("This kind of class literal term isn't implemented",
1449 p_node.GetChild(0).tok)
1450
1451 def _ClassLiteral(self, p_node):
1452 # type: (PNode) -> List[class_literal_term_t]
1453 """class_literal: '[' class_literal_term+ ']'."""
1454 assert p_node.typ == grammar_nt.class_literal
1455 # skip [ and ]
1456 terms = [] # type: List[class_literal_term_t]
1457 for i in xrange(1, p_node.NumChildren() - 1):
1458 terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
1459
1460 return terms
1461
1462 def _NameInRegex(self, negated_tok, tok):
1463 # type: (Token, Token) -> re_t
1464 tok_str = lexer.TokenVal(tok)
1465 if tok_str == 'dot':
1466 if negated_tok:
1467 p_die("Can't negate this symbol", tok)
1468 return re.Primitive(tok, Id.Eggex_Dot)
1469
1470 if tok_str in POSIX_CLASSES:
1471 return PosixClass(negated_tok, tok_str)
1472
1473 perl = PERL_CLASSES.get(tok_str)
1474 if perl is not None:
1475 return PerlClass(negated_tok, perl)
1476
1477 if tok_str[0].isupper(): # e.g. HexDigit
1478 return re.Splice(tok, lexer.TokenVal(tok))
1479
1480 p_die("%r isn't a character class" % tok_str, tok)
1481
1482 def _NameInClass(self, negated_tok, tok):
1483 # type: (Token, Token) -> class_literal_term_t
1484 """Like the above, but 'dot' and 'd' don't mean anything within []"""
1485 tok_str = lexer.TokenVal(tok)
1486
1487 # A bare, unquoted character literal. In the grammar, this is expressed as
1488 # range_char without an ending.
1489
1490 # d is NOT 'digit', it's a literal 'd'!
1491 if len(tok_str) == 1:
1492 # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
1493 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)
1494
1495 if negated_tok: # [~d] is not allowed, only [~digit]
1496 p_die("Can't negate this symbol", tok)
1497 return word_compile.EvalCharLiteralForRegex(tok)
1498
1499 # digit, word, but not d, w, etc.
1500 if tok_str in POSIX_CLASSES:
1501 return PosixClass(negated_tok, tok_str)
1502
1503 perl = PERL_CLASSES.get(tok_str)
1504 if perl is not None:
1505 return PerlClass(negated_tok, perl)
1506 p_die("%r isn't a character class" % tok_str, tok)
1507
1508 def _ReAtom(self, p_atom):
1509 # type: (PNode) -> re_t
1510 """
1511 re_atom: ( char_literal | ...
1512 """
1513 assert p_atom.typ == grammar_nt.re_atom, p_atom.typ
1514
1515 child0 = p_atom.GetChild(0)
1516
1517 typ0 = p_atom.GetChild(0).typ
1518 tok0 = p_atom.GetChild(0).tok
1519
1520 # Non-terminals
1521
1522 if typ0 == grammar_nt.class_literal:
1523 return re.CharClassLiteral(False, self._ClassLiteral(child0))
1524
1525 if typ0 == grammar_nt.sq_string:
1526 return cast(SingleQuoted, child0.GetChild(1).tok)
1527
1528 if typ0 == grammar_nt.char_literal:
1529 # Must be Id.Char_{OneChar,Hex,UBraced}
1530 assert consts.GetKind(tok0.id) == Kind.Char
1531 s = word_compile.EvalCStringToken(tok0.id, lexer.TokenVal(tok0))
1532 return re.LiteralChars(tok0, s)
1533
1534 # Special punctuation
1535 if typ0 == Id.Expr_Dot: # .
1536 return re.Primitive(tok0, Id.Eggex_Dot)
1537
1538 if typ0 == Id.Arith_Caret: # ^
1539 return re.Primitive(tok0, Id.Eggex_Start)
1540
1541 if typ0 == Id.Expr_Dollar: # $
1542 return re.Primitive(tok0, Id.Eggex_End)
1543
1544 if typ0 == Id.Expr_Name:
1545 # d digit -> PosixClass PerlClass etc.
1546 return self._NameInRegex(None, tok0)
1547
1548 if typ0 == Id.Expr_Symbol:
1549 # Validate symbols here, like we validate PerlClass, etc.
1550 tok_str = lexer.TokenVal(tok0)
1551 if tok_str == '%start':
1552 return re.Primitive(tok0, Id.Eggex_Start)
1553 if tok_str == '%end':
1554 return re.Primitive(tok0, Id.Eggex_End)
1555 p_die("Unexpected token %r in regex" % tok_str, tok0)
1556
1557 if typ0 == Id.Expr_At:
1558 # | '@' Expr_Name
1559 tok1 = p_atom.GetChild(1).tok
1560 return re.Splice(tok0, lexer.TokenVal(tok1))
1561
1562 if typ0 == Id.Expr_Bang:
1563 # | '!' (Expr_Name | class_literal)
1564 # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
1565 n = p_atom.NumChildren()
1566 if n == 2:
1567 child1 = p_atom.GetChild(1)
1568 if child1.typ == grammar_nt.class_literal:
1569 return re.CharClassLiteral(True,
1570 self._ClassLiteral(child1))
1571 else:
1572 return self._NameInRegex(tok0, p_atom.GetChild(1).tok)
1573 else:
1574 # Note: !! conflicts with shell history
1575 p_die(
1576 "Backtracking with !! isn't implemented (requires Python/PCRE)",
1577 p_atom.GetChild(1).tok)
1578
1579 if typ0 == Id.Op_LParen:
1580 # | '(' regex ')'
1581
1582 # Note: in ERE (d+) is the same as <d+>. That is, Group becomes
1583 # Capture.
1584 return re.Group(self._Regex(p_atom.GetChild(1)))
1585
1586 if typ0 == Id.Arith_Less:
1587 # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'
1588
1589 n = p_atom.NumChildren()
1590 assert n == 4 or n == 6 or n == 8, n
1591
1592 # < capture d+ >
1593 regex = self._Regex(p_atom.GetChild(2))
1594
1595 as_name = None # type: Optional[Token]
1596 func_name = None # type: Optional[Token]
1597
1598 i = 3 # points at any of > as :
1599
1600 typ = p_atom.GetChild(i).typ
1601 if typ == Id.Expr_As:
1602 as_name = p_atom.GetChild(i + 1).tok
1603 i += 2
1604
1605 typ = p_atom.GetChild(i).typ
1606 if typ == Id.Arith_Colon:
1607 func_name = p_atom.GetChild(i + 1).tok
1608
1609 return re.Capture(regex, as_name, func_name)
1610
1611 raise AssertionError(typ0)
1612
1613 def _RepeatOp(self, p_repeat):
1614 # type: (PNode) -> re_repeat_t
1615 """
1616 repeat_op: '+' | '*' | '?'
1617 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
1618 """
1619 assert p_repeat.typ == grammar_nt.repeat_op, p_repeat
1620
1621 tok = p_repeat.GetChild(0).tok
1622 id_ = tok.id
1623
1624 if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
1625 return tok # a+ a* a?
1626
1627 if id_ == Id.Op_LBrace:
1628 child1 = p_repeat.GetChild(1)
1629 if child1.typ != grammar_nt.repeat_range:
1630 # e.g. dot{N *} is .*?
1631 p_die("Perl-style repetition isn't implemented with libc",
1632 child1.tok)
1633
1634 # repeat_range: (
1635 # Expr_DecInt [',']
1636 # | ',' Expr_DecInt
1637 # | Expr_DecInt ',' Expr_DecInt
1638 # )
1639
1640 n = child1.NumChildren()
1641 if n == 1: # {3}
1642 tok = child1.GetChild(0).tok
1643 return tok # different operator than + * ?
1644
1645 if n == 2:
1646 if child1.GetChild(0).typ == Id.Expr_DecInt: # {,3}
1647 left = child1.GetChild(0).tok
1648 return re_repeat.Range(left, lexer.TokenVal(left), '',
1649 None)
1650 else: # {1,}
1651 right = child1.GetChild(1).tok
1652 return re_repeat.Range(None, '', lexer.TokenVal(right),
1653 right)
1654
1655 if n == 3: # {1,3}
1656 left = child1.GetChild(0).tok
1657 right = child1.GetChild(2).tok
1658 return re_repeat.Range(left, lexer.TokenVal(left),
1659 lexer.TokenVal(right), right)
1660
1661 raise AssertionError(n)
1662
1663 raise AssertionError(id_)
1664
1665 def _ReAlt(self, p_node):
1666 # type: (PNode) -> re_t
1667 """
1668 re_alt: (re_atom [repeat_op])+
1669 """
1670 assert p_node.typ == grammar_nt.re_alt
1671
1672 i = 0
1673 n = p_node.NumChildren()
1674 seq = [] # type: List[re_t]
1675 while i < n:
1676 r = self._ReAtom(p_node.GetChild(i))
1677 i += 1
1678 if i < n and p_node.GetChild(i).typ == grammar_nt.repeat_op:
1679 repeat_op = self._RepeatOp(p_node.GetChild(i))
1680 r = re.Repeat(r, repeat_op)
1681 i += 1
1682 seq.append(r)
1683
1684 if len(seq) == 1:
1685 return seq[0]
1686 else:
1687 return re.Seq(seq)
1688
1689 def _Regex(self, p_node):
1690 # type: (PNode) -> re_t
1691 """
1692 regex: [re_alt] (('|'|'or') re_alt)*
1693 """
1694 assert p_node.typ == grammar_nt.regex
1695
1696 n = p_node.NumChildren()
1697 alts = [] # type: List[re_t]
1698 for i in xrange(0, n, 2): # was children[::2]
1699 c = p_node.GetChild(i)
1700 alts.append(self._ReAlt(c))
1701
1702 if len(alts) == 1:
1703 return alts[0]
1704 else:
1705 return re.Alt(alts)
1706
1707
1708# vim: sw=4