OILS / ysh / expr_to_ast.py View on Github | oilshell.org

1701 lines, 1020 significant
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 SimpleVarSub,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46 CharCode,
47 CharRange,
48)
49from _devbuild.gen.value_asdl import value, value_t
50from _devbuild.gen import grammar_nt
51from core.error import p_die
52from data_lang import j8
53from frontend import consts
54from frontend import lexer
55from frontend import location
56from mycpp import mops
57from mycpp import mylib
58from mycpp.mylib import log, tagswitch
59from osh import word_compile
60from ysh import expr_parse
61from ysh import regex_translate
62
63from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
64if TYPE_CHECKING:
65 from pgen2.grammar import Grammar
66 from pgen2.pnode import PNode
67
_ = log

# Map from eggex Perl-class spellings to their canonical one-letter form
# (both 'w' and 'word' mean the same class).
PERL_CLASSES = {
    'd': 'd',
    'w': 'w',
    'word': 'w',
    's': 's',
}
# POSIX bracket-expression class names, e.g. [[:alnum:]]
# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
POSIX_CLASSES = [
    'alnum',
    'cntrl',
    'lower',
    'space',
    'alpha',
    'digit',
    'print',
    'upper',
    'blank',
    'graph',
    'punct',
    'xdigit',
]
# NOTE: There are also things like \p{Greek} that we could put in the
# "non-sigil" namespace.

# Error messages shared by multiple call sites below.
RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"

POS_ARG_MISPLACED = "Positional arg can't appear in group of named args"

# Copied from pgen2/token.py to avoid dependency.
# Token Ids are below this value; grammar nonterminal numbers are >= it.
NT_OFFSET = 256
100
if mylib.PYTHON:

    def MakeGrammarNames(ysh_grammar):
        # type: (Grammar) -> Dict[int, str]
        """Build a number -> name mapping for both tokens and nonterminals.

        Used only for debug-printing raw pgen2 parse trees.
        """
        # TODO: Break this dependency
        from frontend import lexer_def

        names = {}

        # Token Ids: some Id values exceed NT_OFFSET, so they're skipped here.
        for id_name, id_num in lexer_def.ID_SPEC.id_str2int.items():
            if id_num < NT_OFFSET:
                names[id_num] = id_name

        # Nonterminal numbers from the grammar are all >= NT_OFFSET.
        for num, symbol in ysh_grammar.number2symbol.items():
            assert num >= NT_OFFSET, (num, symbol)
            names[num] = symbol

        return names
124
125
126class Transformer(object):
127 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
128
129 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
130 so this "transformer" is the equivalent.
131
132 Files to refer to when modifying this function:
133
134 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
135 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
136
137 Related examples:
138
139 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
140 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
141
142 Other:
143 frontend/parse_lib.py (turn on print_parse_tree)
144
145 Public methods:
146 Expr, VarDecl
147 atom, trailer, etc. are private, named after productions in grammar.pgen2.
148 """
149
150 def __init__(self, gr):
151 # type: (Grammar) -> None
152 self.number2symbol = gr.number2symbol
153 if mylib.PYTHON:
154 names = MakeGrammarNames(gr)
155 # print raw nodes
156 self.p_printer = expr_parse.ParseTreePrinter(names)
157
158 def _LeftAssoc(self, p_node):
159 # type: (PNode) -> expr_t
160 """For an associative binary operation.
161
162 Examples:
163 xor_expr: and_expr ('xor' and_expr)*
164 term: factor (('*'|'/'|'%'|'div') factor)*
165
166 3 - 1 - 2 must be grouped as ((3 - 1) - 2).
167 """
168 # Note: Compare the iteractive com_binary() method in
169 # opy/compiler2/transformer.py.
170
171 # Examples:
172 # - The PNode for '3 - 1' will have 3 children
173 # - The PNode for '3 - 1 - 2' will have 5 children
174
175 #self.p_printer.Print(p_node)
176
177 i = 1 # index of the operator
178 n = p_node.NumChildren()
179
180 left = self.Expr(p_node.GetChild(0))
181 while i < n:
182 op = p_node.GetChild(i)
183 right = self.Expr(p_node.GetChild(i + 1))
184
185 # create a new left node
186 left = expr.Binary(op.tok, left, right)
187 i += 2
188
189 return left
190
191 def _Trailer(self, base, p_trailer):
192 # type: (expr_t, PNode) -> expr_t
193 """
194 trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
195 | '.' NAME | '->' NAME | '::' NAME
196 )
197 """
198 tok0 = p_trailer.GetChild(0).tok
199 typ0 = p_trailer.GetChild(0).typ
200
201 if typ0 == Id.Op_LParen:
202 lparen = tok0
203 rparen = p_trailer.GetChild(-1).tok
204 arglist = ArgList(lparen, [], None, [], None, None, rparen)
205 if p_trailer.NumChildren() == 2: # ()
206 return expr.FuncCall(base, arglist)
207
208 p = p_trailer.GetChild(1) # the X in ( X )
209 assert p.typ == grammar_nt.arglist # f(x, y)
210 self._ArgList(p, arglist)
211 return expr.FuncCall(base, arglist)
212
213 if typ0 == Id.Op_LBracket:
214 p_args = p_trailer.GetChild(1)
215 assert p_args.typ == grammar_nt.subscriptlist
216 n = p_args.NumChildren()
217 if n > 1:
218 p_die("Only 1 subscript is accepted", p_args.GetChild(1).tok)
219
220 a = p_args.GetChild(0)
221 return Subscript(tok0, base, self._Subscript(a))
222
223 if typ0 in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
224 attr = p_trailer.GetChild(1).tok # will be Id.Expr_Name
225 return Attribute(base, tok0, attr, lexer.TokenVal(attr),
226 expr_context_e.Store)
227
228 raise AssertionError(typ0)
229
230 def _DictPair(self, p_node):
231 # type: (PNode) -> Tuple[expr_t, expr_t]
232 """
233 dict_pair: ( Expr_Name [':' test]
234 | '[' testlist ']' ':' test )
235 | sq_string ':' test
236 | dq_string ':' test )
237 """
238 assert p_node.typ == grammar_nt.dict_pair
239
240 typ = p_node.GetChild(0).typ
241
242 if typ in (grammar_nt.sq_string, grammar_nt.dq_string):
243 key = self.Expr(p_node.GetChild(0)) # type: expr_t
244 val = self.Expr(p_node.GetChild(2))
245 return key, val
246
247 tok0 = p_node.GetChild(0).tok
248 id_ = tok0.id
249
250 if id_ == Id.Expr_Name:
251 key_str = value.Str(lexer.TokenVal(tok0))
252 key = expr.Const(tok0, key_str)
253 if p_node.NumChildren() >= 3:
254 val = self.Expr(p_node.GetChild(2))
255 else:
256 val = expr.Implicit
257
258 if id_ == Id.Op_LBracket: # {[x+y]: 'val'}
259 key = self.Expr(p_node.GetChild(1))
260 val = self.Expr(p_node.GetChild(4))
261 return key, val
262
263 return key, val
264
265 def _Dict(self, parent, p_node):
266 # type: (PNode, PNode) -> expr.Dict
267 """
268 dict: dict_pair (comma_newline dict_pair)* [comma_newline]
269 """
270 if p_node.typ == Id.Op_RBrace: # {}
271 return expr.Dict(parent.tok, [], [])
272
273 assert p_node.typ == grammar_nt.dict
274
275 keys = [] # type: List[expr_t]
276 values = [] # type: List[expr_t]
277
278 n = p_node.NumChildren()
279 for i in xrange(0, n, 2):
280 key, val = self._DictPair(p_node.GetChild(i))
281 keys.append(key)
282 values.append(val)
283
284 return expr.Dict(parent.tok, keys, values)
285
286 def _Tuple(self, parent):
287 # type: (PNode) -> expr_t
288
289 n = parent.NumChildren()
290
291 # (x) -- not a tuple
292 if n == 1:
293 return self.Expr(parent.GetChild(0))
294
295 # x, and (x,) aren't allowed
296 if n == 2:
297 p_die('Invalid trailing comma', parent.GetChild(1).tok)
298
299 elts = [] # type: List[expr_t]
300 for i in xrange(0, n, 2): # skip commas
301 p_node = parent.GetChild(i)
302 elts.append(self.Expr(p_node))
303
304 return expr.Tuple(parent.tok, elts,
305 expr_context_e.Store) # unused expr_context_e
306
307 def _TestlistComp(self, parent, p_node, id0):
308 # type: (PNode, PNode, Id_t) -> expr_t
309 """
310 testlist_comp:
311 (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
312 """
313 assert p_node.typ == grammar_nt.testlist_comp
314
315 n = p_node.NumChildren()
316 if n > 1 and p_node.GetChild(1).typ == grammar_nt.comp_for:
317 elt = self.Expr(p_node.GetChild(0))
318 comp = self._CompFor(p_node.GetChild(1))
319 if id0 == Id.Op_LParen: # (x+1 for x in y)
320 return expr.GeneratorExp(elt, [comp])
321 if id0 == Id.Op_LBracket: # [x+1 for x in y]
322 return expr.ListComp(parent.tok, elt, [comp])
323 raise AssertionError()
324
325 if id0 == Id.Op_LParen:
326 # Parenthesized expression like (x+1) or (x)
327 if n == 1:
328 return self.Expr(p_node.GetChild(0))
329
330 # Tuples (1,) (1, 2) etc. - TODO: should be a list literal?
331 if p_node.GetChild(1).typ == Id.Arith_Comma:
332 return self._Tuple(p_node)
333
334 raise AssertionError()
335
336 if id0 == Id.Op_LBracket: # List [1,2,3]
337 elts = [] # type: List[expr_t]
338 for i in xrange(0, n, 2): # skip commas
339 elts.append(self.Expr(p_node.GetChild(i)))
340
341 return expr.List(parent.tok, elts,
342 expr_context_e.Store) # unused expr_context_e
343
344 raise AssertionError(Id_str(id0))
345
346 def _Atom(self, parent):
347 # type: (PNode) -> expr_t
348 """Handle alternatives of 'atom' where there's more than one child."""
349
350 tok = parent.GetChild(0).tok
351 id_ = tok.id
352 n = parent.NumChildren()
353
354 if id_ == Id.Op_LParen:
355 # atom: '(' [yield_expr|testlist_comp] ')' | ...
356 if n == 2: # () is a tuple
357 assert (
358 parent.GetChild(1).typ == Id.Op_RParen), parent.GetChild(1)
359 return expr.Tuple(tok, [], expr_context_e.Store)
360
361 return self._TestlistComp(parent, parent.GetChild(1), id_)
362
363 if id_ == Id.Op_LBracket:
364 # atom: ... | '[' [testlist_comp] ']' | ...
365
366 if n == 2: # []
367 assert (parent.GetChild(1).typ == Id.Op_RBracket
368 ), parent.GetChild(1)
369 return expr.List(tok, [],
370 expr_context_e.Store) # unused expr_context_e
371
372 return self._TestlistComp(parent, parent.GetChild(1), id_)
373
374 if id_ == Id.Left_CaretBracket: # ^[42 + x]
375 child = self.Expr(parent.GetChild(1))
376 return expr.Literal(child)
377
378 if id_ == Id.Op_LBrace:
379 # atom: ... | '{' [Op_Newline] [dict] '}'
380 i = 1
381 if parent.GetChild(i).typ == Id.Op_Newline:
382 i += 1
383 return self._Dict(parent, parent.GetChild(i))
384
385 if id_ == Id.Arith_Amp:
386 n = parent.NumChildren()
387 if n >= 3:
388 p_die("Places in containers not implemented yet",
389 parent.GetChild(2).tok)
390
391 name_tok = parent.GetChild(1).tok
392 return expr.Place(name_tok, lexer.TokenVal(name_tok), [])
393
394 if id_ == Id.Expr_Func:
395 # STUB. This should really be a Func, not Lambda.
396 return expr.Lambda([], expr.Implicit)
397
398 # 100 M
399 # Ignoring the suffix for now
400 if id_ == Id.Expr_DecInt:
401 assert n > 1
402 p_die("Units suffix not implemented", parent.GetChild(1).tok)
403 #return self.Expr(parent.GetChild(0))
404
405 # 100.5 M
406 # Ignoring the suffix for now
407 if id_ == Id.Expr_Float:
408 assert n > 1
409 p_die("unix suffix implemented", parent.GetChild(1).tok)
410 #return self.Expr(parent.GetChild(0))
411
412 raise AssertionError(Id_str(id_))
413
414 def _NameType(self, p_node):
415 # type: (PNode) -> NameType
416 """ name_type: Expr_Name [':'] [type_expr] """
417 name_tok = p_node.GetChild(0).tok
418 typ = None # type: Optional[TypeExpr]
419
420 n = p_node.NumChildren()
421 if n == 2:
422 typ = self._TypeExpr(p_node.GetChild(1))
423 if n == 3:
424 typ = self._TypeExpr(p_node.GetChild(2))
425
426 return NameType(name_tok, lexer.TokenVal(name_tok), typ)
427
428 def _NameTypeList(self, p_node):
429 # type: (PNode) -> List[NameType]
430 """ name_type_list: name_type (',' name_type)* """
431 assert p_node.typ == grammar_nt.name_type_list
432 results = [] # type: List[NameType]
433
434 n = p_node.NumChildren()
435 for i in xrange(0, n, 2): # was children[::2]
436 results.append(self._NameType(p_node.GetChild(i)))
437 return results
438
439 def _CompFor(self, p_node):
440 # type: (PNode) -> Comprehension
441 """comp_for: 'for' exprlist 'in' or_test ['if' or_test]"""
442 lhs = self._NameTypeList(p_node.GetChild(1))
443 iterable = self.Expr(p_node.GetChild(3))
444
445 if p_node.NumChildren() >= 6:
446 cond = self.Expr(p_node.GetChild(5))
447 else:
448 cond = None
449
450 return Comprehension(lhs, iterable, cond)
451
452 def _CompareChain(self, parent):
453 # type: (PNode) -> expr_t
454 """comparison: expr (comp_op expr)*"""
455 cmp_ops = [] # type: List[Token]
456 comparators = [] # type: List[expr_t]
457 left = self.Expr(parent.GetChild(0))
458
459 i = 1
460 n = parent.NumChildren()
461 while i < n:
462 p = parent.GetChild(i)
463 op = p.GetChild(0).tok
464 if p.NumChildren() == 2:
465 # Blame the first token, and change its type
466 if op.id == Id.Expr_Not: # not in
467 op.id = Id.Node_NotIn
468 elif op.id == Id.Expr_Is: # is not
469 op.id = Id.Node_IsNot
470 else:
471 raise AssertionError()
472 else:
473 # is, <, ==, etc.
474 pass
475
476 cmp_ops.append(op)
477 i += 1
478 comparators.append(self.Expr(parent.GetChild(i)))
479 i += 1
480 return expr.Compare(left, cmp_ops, comparators)
481
482 def _Subscript(self, parent):
483 # type: (PNode) -> expr_t
484 """subscript: expr | [expr] ':' [expr]"""
485 typ0 = parent.GetChild(0).typ
486
487 n = parent.NumChildren()
488
489 if typ0 == grammar_nt.expr:
490 if n == 3: # a[1:2]
491 lower = self.Expr(parent.GetChild(0))
492 upper = self.Expr(parent.GetChild(2))
493 elif n == 2: # a[1:]
494 lower = self.Expr(parent.GetChild(0))
495 upper = None
496 else: # a[1]
497 return self.Expr(parent.GetChild(0))
498 else:
499 assert typ0 == Id.Arith_Colon
500 lower = None
501 if n == 1: # a[:]
502 upper = None
503 else: # a[:3]
504 upper = self.Expr(parent.GetChild(1))
505
506 return expr.Slice(lower, parent.GetChild(0).tok, upper)
507
    def Expr(self, pnode):
        # type: (PNode) -> expr_t
        """Transform expressions (as opposed to statements).

        Dispatches on pnode.typ: grammar nonterminals first (recursing or
        delegating to private helpers), then terminal token Ids, which are
        turned into expr.Var or expr.Const leaves.
        """
        typ = pnode.typ

        #
        # YSH Entry Points / Additions
        #

        if typ == grammar_nt.ysh_expr:  # for if/while
            # ysh_expr: '(' testlist ')'
            return self.Expr(pnode.GetChild(1))

        if typ == grammar_nt.command_expr:
            # return_expr: testlist end_stmt
            return self.Expr(pnode.GetChild(0))

        #
        # Python-like Expressions / Operators
        #

        if typ == grammar_nt.atom:
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))
            return self._Atom(pnode)

        if typ == grammar_nt.testlist:
            # testlist: test (',' test)* [',']
            return self._Tuple(pnode)

        if typ == grammar_nt.test:
            # test: or_test ['if' or_test 'else' test] | lambdef
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            # TODO: Handle lambdef

            # Ternary: child 0 is the body, 2 the condition, 4 the else arm
            test = self.Expr(pnode.GetChild(2))
            body = self.Expr(pnode.GetChild(0))
            orelse = self.Expr(pnode.GetChild(4))
            return expr.IfExp(test, body, orelse)

        if typ == grammar_nt.lambdef:
            # lambdef: '|' [name_type_list] '|' test

            n = pnode.NumChildren()
            if n == 4:
                params = self._NameTypeList(pnode.GetChild(1))
            else:
                params = []

            body = self.Expr(pnode.GetChild(n - 1))
            return expr.Lambda(params, body)

        #
        # Operators with Precedence
        #

        if typ == grammar_nt.or_test:
            # or_test: and_test ('or' and_test)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.and_test:
            # and_test: not_test ('and' not_test)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.not_test:
            # not_test: 'not' not_test | comparison
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            op_tok = pnode.GetChild(0).tok  # not
            return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))

        elif typ == grammar_nt.comparison:
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            return self._CompareChain(pnode)

        elif typ == grammar_nt.range_expr:
            n = pnode.NumChildren()
            if n == 1:
                return self.Expr(pnode.GetChild(0))

            if n == 3:
                return expr.Range(self.Expr(pnode.GetChild(0)),
                                  pnode.GetChild(1).tok,
                                  self.Expr(pnode.GetChild(2)))

            raise AssertionError(n)

        elif typ == grammar_nt.expr:
            # expr: xor_expr ('|' xor_expr)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.xor_expr:
            # xor_expr: and_expr ('xor' and_expr)*
            return self._LeftAssoc(pnode)

        if typ == grammar_nt.and_expr:  # a & b
            # and_expr: shift_expr ('&' shift_expr)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.shift_expr:
            # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.arith_expr:
            # arith_expr: term (('+'|'-') term)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.term:
            # term: factor (('*'|'/'|'div'|'mod') factor)*
            return self._LeftAssoc(pnode)

        elif typ == grammar_nt.factor:
            # factor: ('+'|'-'|'~') factor | power
            # the power would have already been reduced
            if pnode.NumChildren() == 1:
                return self.Expr(pnode.GetChild(0))

            assert pnode.NumChildren() == 2
            op = pnode.GetChild(0)
            e = pnode.GetChild(1)

            assert isinstance(op.tok, Token)
            return expr.Unary(op.tok, self.Expr(e))

        elif typ == grammar_nt.power:
            # power: atom trailer* ['**' factor]

            node = self.Expr(pnode.GetChild(0))
            if pnode.NumChildren() == 1:  # No trailers
                return node

            # Support a->startswith(b) and mydict.key
            n = pnode.NumChildren()
            i = 1
            while i < n and pnode.GetChild(i).typ == grammar_nt.trailer:
                node = self._Trailer(node, pnode.GetChild(i))
                i += 1

            if i != n:  # ['**' factor]
                op_tok = pnode.GetChild(i).tok
                assert op_tok.id == Id.Arith_DStar, op_tok
                factor = self.Expr(pnode.GetChild(i + 1))
                node = expr.Binary(op_tok, node, factor)

            return node

        elif typ == grammar_nt.eggex:
            return self._Eggex(pnode)

        elif typ == grammar_nt.ysh_expr_sub:
            return self.Expr(pnode.GetChild(0))

        #
        # YSH Lexer Modes
        #
        # For these, the sub-parser already produced a syntax_asdl node and
        # stored it in the PNode's tok slot; we just cast it to its type.

        elif typ == grammar_nt.sh_array_literal:
            return cast(ShArrayLiteral, pnode.GetChild(1).tok)

        elif typ == grammar_nt.old_sh_array_literal:
            return cast(ShArrayLiteral, pnode.GetChild(1).tok)

        elif typ == grammar_nt.sh_command_sub:
            return cast(CommandSub, pnode.GetChild(1).tok)

        elif typ == grammar_nt.braced_var_sub:
            return cast(BracedVarSub, pnode.GetChild(1).tok)

        elif typ == grammar_nt.dq_string:
            s = cast(DoubleQuoted, pnode.GetChild(1).tok)
            # sugar: ^"..." is short for ^["..."]
            if pnode.GetChild(0).typ == Id.Left_CaretDoubleQuote:
                return expr.Literal(s)
            return s

        elif typ == grammar_nt.sq_string:
            return cast(SingleQuoted, pnode.GetChild(1).tok)

        elif typ == grammar_nt.simple_var_sub:
            tok = pnode.GetChild(0).tok

            if tok.id == Id.VSub_DollarName:  # $foo is disallowed
                bare = lexer.TokenSliceLeft(tok, 1)
                p_die(
                    'In expressions, remove $ and use `%s`, or sometimes "$%s"'
                    % (bare, bare), tok)

            # $? is allowed
            return SimpleVarSub(tok)

        #
        # Terminals
        #

        tok = pnode.tok
        if typ == Id.Expr_Name:
            return expr.Var(tok, lexer.TokenVal(tok))

        # Everything else is an expr.Const
        tok_str = lexer.TokenVal(tok)
        # Remove underscores from 1_000_000. The lexer is responsible for
        # validation.
        c_under = tok_str.replace('_', '')

        if typ == Id.Expr_DecInt:
            try:
                cval = value.Int(mops.FromStr(c_under))  # type: value_t
            except ValueError:
                p_die('Decimal int constant is too large', tok)

        elif typ == Id.Expr_BinInt:
            assert c_under[:2] in ('0b', '0B'), c_under
            try:
                cval = value.Int(mops.FromStr(c_under[2:], 2))
            except ValueError:
                p_die('Binary int constant is too large', tok)

        elif typ == Id.Expr_OctInt:
            assert c_under[:2] in ('0o', '0O'), c_under
            try:
                cval = value.Int(mops.FromStr(c_under[2:], 8))
            except ValueError:
                p_die('Octal int constant is too large', tok)

        elif typ == Id.Expr_HexInt:
            assert c_under[:2] in ('0x', '0X'), c_under
            try:
                cval = value.Int(mops.FromStr(c_under[2:], 16))
            except ValueError:
                p_die('Hex int constant is too large', tok)

        elif typ == Id.Expr_Float:
            # Note: float() in mycpp/gc_builtins.cc currently uses strtod
            # I think this never raises ValueError, because the lexer
            # should only accept strings that strtod() does?
            cval = value.Float(float(c_under))

        elif typ == Id.Expr_Null:
            cval = value.Null

        elif typ == Id.Expr_True:
            cval = value.Bool(True)

        elif typ == Id.Expr_False:
            cval = value.Bool(False)

        # What to do with the char constants?
        # \n \u{3bc} #'a'
        # Are they integers or strings?
        #
        # Integers could be ord(\n), or strings could chr(\n)
        # Or just remove them, with ord(u'\n') and chr(u'\n')
        #
        # I think this relies on small string optimization. If we have it,
        # then 1-4 byte characters are efficient, and don't require heap
        # allocation.

        elif typ == Id.Char_OneChar:
            cval = value.Str(consts.LookupCharC(tok_str[1]))

        elif typ == Id.Char_UBraced:
            hex_str = tok_str[3:-1]  # \u{123}
            code_point = int(hex_str, 16)
            cval = value.Str(j8.Utf8Encode(code_point))
        else:
            raise AssertionError(typ)

        return expr.Const(tok, cval)
781
    def _CheckLhs(self, lhs):
        # type: (expr_t) -> None
        """Validate an assignment LHS: only Var, Subscript, and Attribute.

        Recurses into the base object of Subscript/Attribute so that e.g.
        setvar a.b.c[i] = 42 is accepted but setglobal {}["key"] = 42 is
        rejected with a parse error.
        """
        UP_lhs = lhs
        with tagswitch(lhs) as case:
            if case(expr_e.Var):
                # OK - e.g. setvar a.b.c[i] = 42
                pass

            elif case(expr_e.Subscript):
                lhs = cast(Subscript, UP_lhs)
                self._CheckLhs(lhs.obj)  # recurse on LHS

            elif case(expr_e.Attribute):
                lhs = cast(Attribute, UP_lhs)
                self._CheckLhs(lhs.obj)  # recurse on LHS

            else:
                # Illegal - e.g. setglobal {}["key"] = 42
                p_die("Subscript/Attribute not allowed on this LHS expression",
                      location.TokenForExpr(lhs))
803
    def _LhsExprList(self, p_node):
        # type: (PNode) -> List[y_lhs_t]
        """lhs_list: expr (',' expr)*

        Transforms each expr and validates it as an assignment target:
        a Var, a Subscript, or an Attribute using the '.' operator.
        """
        assert p_node.typ == grammar_nt.lhs_list

        lhs_list = []  # type: List[y_lhs_t]
        n = p_node.NumChildren()
        for i in xrange(0, n, 2):  # skip commas at odd indices
            p = p_node.GetChild(i)
            #self.p_printer.Print(p)

            e = self.Expr(p)
            UP_e = e
            with tagswitch(e) as case:
                if case(expr_e.Var):
                    e = cast(expr.Var, UP_e)
                    lhs_list.append(e.left)

                elif case(expr_e.Subscript):
                    e = cast(Subscript, UP_e)
                    self._CheckLhs(e)
                    lhs_list.append(e)

                elif case(expr_e.Attribute):
                    e = cast(Attribute, UP_e)
                    self._CheckLhs(e)
                    if e.op.id != Id.Expr_Dot:
                        # e.g. setvar obj->method is not valid
                        p_die("Can't assign to this attribute expr", e.op)
                    lhs_list.append(e)

                else:
                    pass  # work around mycpp bug

                    # TODO: could blame arbitrary expr_t, but this works most
                    # of the time
                    if p.tok:
                        blame = p.tok  # type: loc_t
                    else:
                        blame = loc.Missing
                    p_die("Can't assign to this expression", blame)

        return lhs_list
847
848 def MakeVarDecl(self, p_node):
849 # type: (PNode) -> command.VarDecl
850 """
851 ysh_var_decl: name_type_list ['=' testlist] end_stmt
852 """
853 assert p_node.typ == grammar_nt.ysh_var_decl
854
855 lhs = self._NameTypeList(p_node.GetChild(0)) # could be a tuple
856
857 # This syntax is confusing, and different than JavaScript
858 # var x, y = 1, 2
859 # But this is useful:
860 # var flag, i = parseArgs(spec, argv)
861
862 n = p_node.NumChildren()
863 if n >= 3:
864 rhs = self.Expr(p_node.GetChild(2))
865 else:
866 rhs = None
867
868 # The caller should fill in the keyword token.
869 return command.VarDecl(None, lhs, rhs)
870
871 def MakeMutation(self, p_node):
872 # type: (PNode) -> command.Mutation
873 """
874 ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
875 """
876 typ = p_node.typ
877 assert typ == grammar_nt.ysh_mutation
878
879 lhs_list = self._LhsExprList(p_node.GetChild(0)) # could be a tuple
880 op_tok = p_node.GetChild(1).tok
881 if len(lhs_list) > 1 and op_tok.id != Id.Arith_Equal:
882 p_die('Multiple assignment must use =', op_tok)
883 rhs = self.Expr(p_node.GetChild(2))
884 return command.Mutation(None, lhs_list, op_tok, rhs)
885
886 def _EggexFlag(self, p_node):
887 # type: (PNode) -> EggexFlag
888 n = p_node.NumChildren()
889 if n == 1:
890 return EggexFlag(False, p_node.GetChild(0).tok)
891 elif n == 2:
892 return EggexFlag(True, p_node.GetChild(1).tok)
893 else:
894 raise AssertionError()
895
    def _Eggex(self, p_node):
        # type: (PNode) -> Eggex
        """
        eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
        """
        left = p_node.GetChild(0).tok
        regex = self._Regex(p_node.GetChild(1))

        flags = []  # type: List[EggexFlag]
        trans_pref = None  # type: Optional[Token]

        # Walk the optional flag section, which starts after the first ';'
        i = 2
        current = p_node.GetChild(i)
        if current.typ == Id.Op_Semi:
            i += 1
            while True:
                current = p_node.GetChild(i)
                if current.typ != grammar_nt.re_flag:
                    break
                flags.append(self._EggexFlag(current))
                i += 1

            # A second ';' introduces the translation preference, e.g. ; ERE
            if current.typ == Id.Op_Semi:
                i += 1
                trans_pref = p_node.GetChild(i).tok

        # Canonicalize and validate flags for ERE only. Default is ERE.
        if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
            canonical_flags = regex_translate.CanonicalFlags(flags)
        else:
            canonical_flags = None

        return Eggex(left, regex, flags, trans_pref, canonical_flags)
929
930 def YshCasePattern(self, pnode):
931 # type: (PNode) -> pat_t
932 assert pnode.typ == grammar_nt.ysh_case_pat, pnode
933
934 pattern = pnode.GetChild(0)
935 typ = pattern.typ
936 if typ == Id.Op_LParen:
937 # pat_expr or pat_else
938 pattern = pnode.GetChild(1)
939 typ = pattern.typ
940
941 if typ == grammar_nt.pat_else:
942 return pat.Else
943
944 if typ == grammar_nt.pat_exprs:
945 exprs = [] # type: List[expr_t]
946 for i in xrange(pattern.NumChildren()):
947 child = pattern.GetChild(i)
948 if child.typ == grammar_nt.expr:
949 expr = self.Expr(child)
950 exprs.append(expr)
951 return pat.YshExprs(exprs)
952
953 if typ == grammar_nt.eggex:
954 return self._Eggex(pattern)
955
956 raise AssertionError()
957
958 def _BlockArg(self, p_node):
959 # type: (PNode) -> expr_t
960
961 n = p_node.NumChildren()
962 if n == 1:
963 child = p_node.GetChild(0)
964 return self.Expr(child)
965
966 # It can only be an expression, not a=42, or ...expr
967 p_die('Invalid block expression argument', p_node.tok)
968
    def _Argument(self, p_node, after_semi, arglist):
        # type: (PNode, bool, ArgList) -> None
        """Transform one call argument and append it to arglist.

        argument: (
          test [comp_for]
        | test '=' test  # named arg
        | '...' test  # var args
        )

        after_semi is True inside the named-arg group (after ';'), where
        positional args are rejected.
        """
        pos_args = arglist.pos_args
        named_args = arglist.named_args

        assert p_node.typ == grammar_nt.argument, p_node
        n = p_node.NumChildren()
        if n == 1:
            # Plain positional argument
            child = p_node.GetChild(0)
            if after_semi:
                p_die(POS_ARG_MISPLACED, child.tok)
            arg = self.Expr(child)
            pos_args.append(arg)
            return

        if n == 2:
            # Note: We allow multiple spreads, just like Julia. They are
            # concatenated as in lists and dicts.
            tok0 = p_node.GetChild(0).tok
            if tok0.id == Id.Expr_Ellipsis:
                spread_expr = expr.Spread(tok0, self.Expr(p_node.GetChild(1)))
                if after_semi:  # f(; ... named)
                    named_args.append(NamedArg(None, spread_expr))
                else:  # f(...named)
                    pos_args.append(spread_expr)
                return

            # Note: generator expression not implemented
            if p_node.GetChild(1).typ == grammar_nt.comp_for:
                child = p_node.GetChild(0)
                if after_semi:
                    p_die(POS_ARG_MISPLACED, child.tok)

                elt = self.Expr(child)
                comp = self._CompFor(p_node.GetChild(1))
                arg = expr.GeneratorExp(elt, [comp])
                pos_args.append(arg)
                return

            raise AssertionError()

        if n == 3:  # named args can come before or after the semicolon
            n1 = NamedArg(
                p_node.GetChild(0).tok, self.Expr(p_node.GetChild(2)))
            named_args.append(n1)
            return

        raise AssertionError()
1024
1025 def _ArgGroup(self, p_node, after_semi, arglist):
1026 # type: (PNode, bool, ArgList) -> None
1027 """
1028 arg_group: argument (',' argument)* [',']
1029 """
1030 for i in xrange(p_node.NumChildren()):
1031 p_child = p_node.GetChild(i)
1032 if p_child.typ == grammar_nt.argument:
1033 self._Argument(p_child, after_semi, arglist)
1034
1035 def _ArgList(self, p_node, arglist):
1036 # type: (PNode, ArgList) -> None
1037 """For both funcs and procs
1038
1039 arglist: (
1040 [arg_group]
1041 [';' [arg_group]]
1042 )
1043
1044 arglist3: ...
1045 """
1046 n = p_node.NumChildren()
1047 if n == 0:
1048 return
1049
1050 i = 0
1051
1052 if i >= n:
1053 return
1054 child = p_node.GetChild(i)
1055 if child.typ == grammar_nt.arg_group:
1056 self._ArgGroup(child, False, arglist)
1057 i += 1
1058
1059 if i >= n:
1060 return
1061 child = p_node.GetChild(i)
1062 if child.typ == Id.Op_Semi:
1063 arglist.semi_tok = child.tok
1064 i += 1
1065
1066 # Named args after first semi-colon
1067 if i >= n:
1068 return
1069 child = p_node.GetChild(i)
1070 if child.typ == grammar_nt.arg_group:
1071 self._ArgGroup(child, True, arglist)
1072 i += 1
1073
1074 #
1075 # Special third group may have block expression - only for arglist3,
1076 # used for procs!
1077 #
1078
1079 if i >= n:
1080 return
1081 assert p_node.typ == grammar_nt.arglist3, p_node
1082
1083 child = p_node.GetChild(i)
1084 if child.typ == Id.Op_Semi:
1085 arglist.semi_tok2 = child.tok
1086 i += 1
1087
1088 if i >= n:
1089 return
1090 child = p_node.GetChild(i)
1091 if child.typ == grammar_nt.argument:
1092 arglist.block_expr = self._BlockArg(child)
1093 i += 1
1094
1095 def ProcCallArgs(self, pnode, arglist):
1096 # type: (PNode, ArgList) -> None
1097 """
1098 ysh_eager_arglist: '(' [arglist3] ')'
1099 ysh_lazy_arglist: '[' [arglist] ']'
1100 """
1101 n = pnode.NumChildren()
1102 if n == 2: # f()
1103 return
1104
1105 if n == 3:
1106 child1 = pnode.GetChild(1) # the X in '( X )'
1107
1108 self._ArgList(child1, arglist)
1109 return
1110
1111 raise AssertionError()
1112
1113 def _TypeExpr(self, pnode):
1114 # type: (PNode) -> TypeExpr
1115 """
1116 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
1117 """
1118 assert pnode.typ == grammar_nt.type_expr, pnode.typ
1119
1120 ty = TypeExpr.CreateNull() # don't allocate children
1121
1122 ty.tok = pnode.GetChild(0).tok
1123 ty.name = lexer.TokenVal(ty.tok)
1124
1125 n = pnode.NumChildren()
1126 if n == 1:
1127 return ty
1128
1129 ty.params = []
1130 i = 2
1131 while i < n:
1132 p = self._TypeExpr(pnode.GetChild(i))
1133 ty.params.append(p)
1134 i += 2 # skip comma
1135
1136 return ty
1137
1138 def _Param(self, pnode):
1139 # type: (PNode) -> Param
1140 """
1141 param: Expr_Name [type_expr] ['=' expr]
1142 """
1143 assert pnode.typ == grammar_nt.param
1144
1145 name_tok = pnode.GetChild(0).tok
1146 n = pnode.NumChildren()
1147
1148 assert name_tok.id == Id.Expr_Name, name_tok
1149
1150 default_val = None # type: expr_t
1151 type_ = None # type: TypeExpr
1152
1153 if n == 1:
1154 # proc p(a)
1155 pass
1156
1157 elif n == 2:
1158 # proc p(a Int)
1159 type_ = self._TypeExpr(pnode.GetChild(1))
1160
1161 elif n == 3:
1162 # proc p(a = 3)
1163 default_val = self.Expr(pnode.GetChild(2))
1164
1165 elif n == 4:
1166 # proc p(a Int = 3)
1167 type_ = self._TypeExpr(pnode.GetChild(1))
1168 default_val = self.Expr(pnode.GetChild(3))
1169
1170 return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1171
1172 def _ParamGroup(self, p_node):
1173 # type: (PNode) -> ParamGroup
1174 """
1175 param_group:
1176 (param ',')*
1177 [ (param | '...' Expr_Name) [,] ]
1178 """
1179 assert p_node.typ == grammar_nt.param_group, p_node
1180
1181 params = [] # type: List[Param]
1182 rest_of = None # type: Optional[RestParam]
1183
1184 n = p_node.NumChildren()
1185 i = 0
1186 while i < n:
1187 child = p_node.GetChild(i)
1188 if child.typ == grammar_nt.param:
1189 params.append(self._Param(child))
1190
1191 elif child.typ == Id.Expr_Ellipsis:
1192 tok = p_node.GetChild(i + 1).tok
1193 rest_of = RestParam(tok, lexer.TokenVal(tok))
1194
1195 i += 2
1196
1197 return ParamGroup(params, rest_of)
1198
1199 def Proc(self, p_node):
1200 # type: (PNode) -> proc_sig_t
1201 """
1202 ysh_proc: (
1203 [ '('
1204 [ param_group ] # word params, with defaults
1205 [ ';' [ param_group ] ] # positional typed params, with defaults
1206 [ ';' [ param_group ] ] # named params, with defaults
1207 [ ';' Expr_Name ] # optional block param, with no type or default
1208 ')'
1209 ]
1210 '{' # opening { for pgen2
1211 )
1212 """
1213 typ = p_node.typ
1214 assert typ == grammar_nt.ysh_proc
1215
1216 n = p_node.NumChildren()
1217 if n == 1: # proc f {
1218 return proc_sig.Open
1219
1220 if n == 3: # proc f () {
1221 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1222
1223 # proc f( three param groups, and block group )
1224 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1225
1226 # Word args
1227 i = 1
1228 child = p_node.GetChild(i)
1229 if child.typ == grammar_nt.param_group:
1230 sig.word = self._ParamGroup(p_node.GetChild(i))
1231
1232 # Validate word args
1233 for word in sig.word.params:
1234 if word.type:
1235 if word.type.name not in ('Str', 'Ref'):
1236 p_die('Word params may only have type Str or Ref',
1237 word.type.tok)
1238 if word.type.params is not None:
1239 p_die('Unexpected type parameters', word.type.tok)
1240
1241 i += 2
1242 else:
1243 i += 1
1244
1245 #log('i %d n %d', i, n)
1246 if i >= n:
1247 return sig
1248
1249 # Positional args
1250 child = p_node.GetChild(i)
1251 if child.typ == grammar_nt.param_group:
1252 sig.positional = self._ParamGroup(p_node.GetChild(i))
1253 i += 2
1254 else:
1255 i += 1
1256
1257 #log('i %d n %d', i, n)
1258 if i >= n:
1259 return sig
1260
1261 # Keyword args
1262 child = p_node.GetChild(i)
1263 if child.typ == grammar_nt.param_group:
1264 sig.named = self._ParamGroup(p_node.GetChild(i))
1265 i += 2
1266 else:
1267 i += 1
1268
1269 #log('i %d n %d', i, n)
1270 if i >= n:
1271 return sig
1272
1273 child = p_node.GetChild(i)
1274 if child.typ == grammar_nt.param_group:
1275 group = self._ParamGroup(p_node.GetChild(i))
1276 params = group.params
1277 if len(params) > 1:
1278 p_die('Only 1 block param is allowed', params[1].blame_tok)
1279 if group.rest_of:
1280 p_die("Rest param isn't allowed for blocks",
1281 group.rest_of.blame_tok)
1282
1283 if len(params) == 1:
1284 if params[0].type:
1285 if params[0].type.name != 'Command':
1286 p_die('Block param must have type Command',
1287 params[0].type.tok)
1288 if params[0].type.params is not None:
1289 p_die('Unexpected type parameters', params[0].type.tok)
1290
1291 sig.block_param = params[0]
1292
1293 return sig
1294
1295 def YshFunc(self, p_node, out):
1296 # type: (PNode, Func) -> None
1297 """
1298 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'
1299 """
1300 assert p_node.typ == grammar_nt.ysh_func
1301
1302 #self.p_printer.Print(p_node)
1303
1304 out.name = p_node.GetChild(0).tok
1305
1306 n = p_node.NumChildren()
1307 i = 2 # after (
1308
1309 child = p_node.GetChild(i)
1310 if child.typ == grammar_nt.param_group:
1311 out.positional = self._ParamGroup(child)
1312 i += 2 # skip past ;
1313 else:
1314 i += 1
1315
1316 if i >= n:
1317 return
1318
1319 child = p_node.GetChild(i)
1320 if child.typ == grammar_nt.param_group:
1321 out.named = self._ParamGroup(child)
1322
1323 #
1324 # Eggex Language
1325 #
1326
1327 def _RangeCharSingleQuoted(self, p_node):
1328 # type: (PNode) -> Optional[CharCode]
1329
1330 assert p_node.typ == grammar_nt.range_char, p_node
1331
1332 # 'a' in 'a'-'b'
1333
1334 child0 = p_node.GetChild(0)
1335 if child0.typ == grammar_nt.sq_string:
1336 sq_part = cast(SingleQuoted, child0.GetChild(1).tok)
1337 n = len(sq_part.sval)
1338 if n == 0:
1339 p_die("Quoted range char can't be empty",
1340 loc.WordPart(sq_part))
1341 elif n == 1:
1342 return CharCode(sq_part.left, ord(sq_part.sval[0]), False)
1343 else:
1344 p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
1345 return None
1346
1347 def _OtherRangeToken(self, p_node):
1348 # type: (PNode) -> Token
1349 """An endpoint of a range (single char)
1350
1351 range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
1352 a-z 0-9 'a'-'z' \x00-\xff
1353 """
1354 assert p_node.typ == grammar_nt.range_char, p_node
1355
1356 child0 = p_node.GetChild(0)
1357 if child0.typ == grammar_nt.char_literal:
1358 # \x00 in /[\x00 - \x20]/
1359 tok = child0.GetChild(0).tok
1360 return tok
1361
1362 tok = p_node.tok
1363 # a in a-z is Expr_Name
1364 # 0 in 0-9 is Expr_DecInt
1365 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt), tok
1366
1367 if tok.length != 1:
1368 p_die(RANGE_POINT_TOO_LONG, tok)
1369 return tok
1370
1371 def _NonRangeChars(self, p_node):
1372 # type: (PNode) -> class_literal_term_t
1373 """
1374 \" \u1234 '#'
1375 """
1376 assert p_node.typ == grammar_nt.range_char, p_node
1377
1378 child0 = p_node.GetChild(0)
1379 typ0 = p_node.GetChild(0).typ
1380
1381 if typ0 == grammar_nt.sq_string:
1382 return cast(SingleQuoted, child0.GetChild(1).tok)
1383
1384 if typ0 == grammar_nt.char_literal:
1385 return word_compile.EvalCharLiteralForRegex(child0.tok)
1386
1387 if typ0 == Id.Expr_Name:
1388 # Look up PerlClass and PosixClass
1389 return self._NameInClass(None, child0.tok)
1390
1391 raise AssertionError()
1392
    def _ClassLiteralTerm(self, p_node):
        # type: (PNode) -> class_literal_term_t
        """Transform one term inside a [] class literal.

        class_literal_term:
            range_char ['-' range_char ]
          | '@' Expr_Name  # splice
          | '!' Expr_Name  # negate char class
          ...
        """
        assert p_node.typ == grammar_nt.class_literal_term, p_node

        typ0 = p_node.GetChild(0).typ

        if typ0 == grammar_nt.range_char:
            n = p_node.NumChildren()

            if n == 1:
                # A lone char, char literal, or class name -- not a range
                return self._NonRangeChars(p_node.GetChild(0))

            # 'a'-'z' etc.
            if n == 3:
                assert p_node.GetChild(1).typ == Id.Arith_Minus, p_node

                left = p_node.GetChild(0)
                right = p_node.GetChild(2)

                # Each endpoint is either a quoted string ('a') or a bare
                # token (a, 0, \x00); the quoted case is tried first.
                code1 = self._RangeCharSingleQuoted(left)
                if code1 is None:
                    tok1 = self._OtherRangeToken(left)
                    code1 = word_compile.EvalCharLiteralForRegex(tok1)

                code2 = self._RangeCharSingleQuoted(right)
                if code2 is None:
                    tok2 = self._OtherRangeToken(right)
                    code2 = word_compile.EvalCharLiteralForRegex(tok2)
                return CharRange(code1, code2)

            raise AssertionError()

        if typ0 == Id.Expr_At:
            # @name splices another char class into this one
            tok1 = p_node.GetChild(1).tok
            return class_literal_term.Splice(tok1, lexer.TokenVal(tok1))

        if typ0 == Id.Expr_Bang:
            # !name negates a named class, e.g. [!digit]
            return self._NameInClass(
                p_node.GetChild(0).tok,
                p_node.GetChild(1).tok)

        p_die("This kind of class literal term isn't implemented",
              p_node.GetChild(0).tok)
1443
1444 def _ClassLiteral(self, p_node):
1445 # type: (PNode) -> List[class_literal_term_t]
1446 """class_literal: '[' class_literal_term+ ']'."""
1447 assert p_node.typ == grammar_nt.class_literal
1448 # skip [ and ]
1449 terms = [] # type: List[class_literal_term_t]
1450 for i in xrange(1, p_node.NumChildren() - 1):
1451 terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
1452
1453 return terms
1454
1455 def _NameInRegex(self, negated_tok, tok):
1456 # type: (Token, Token) -> re_t
1457 tok_str = lexer.TokenVal(tok)
1458 if tok_str == 'dot':
1459 if negated_tok:
1460 p_die("Can't negate this symbol", tok)
1461 return re.Primitive(tok, Id.Eggex_Dot)
1462
1463 if tok_str in POSIX_CLASSES:
1464 return PosixClass(negated_tok, tok_str)
1465
1466 perl = PERL_CLASSES.get(tok_str)
1467 if perl is not None:
1468 return PerlClass(negated_tok, perl)
1469
1470 if tok_str[0].isupper(): # e.g. HexDigit
1471 return re.Splice(tok, lexer.TokenVal(tok))
1472
1473 p_die("%r isn't a character class" % tok_str, tok)
1474
1475 def _NameInClass(self, negated_tok, tok):
1476 # type: (Token, Token) -> class_literal_term_t
1477 """Like the above, but 'dot' and 'd' don't mean anything within []"""
1478 tok_str = lexer.TokenVal(tok)
1479
1480 # A bare, unquoted character literal. In the grammar, this is expressed as
1481 # range_char without an ending.
1482
1483 # d is NOT 'digit', it's a literal 'd'!
1484 if len(tok_str) == 1:
1485 # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
1486 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)
1487
1488 if negated_tok: # [~d] is not allowed, only [~digit]
1489 p_die("Can't negate this symbol", tok)
1490 return word_compile.EvalCharLiteralForRegex(tok)
1491
1492 # digit, word, but not d, w, etc.
1493 if tok_str in POSIX_CLASSES:
1494 return PosixClass(negated_tok, tok_str)
1495
1496 perl = PERL_CLASSES.get(tok_str)
1497 if perl is not None:
1498 return PerlClass(negated_tok, perl)
1499 p_die("%r isn't a character class" % tok_str, tok)
1500
    def _ReAtom(self, p_atom):
        # type: (PNode) -> re_t
        """Transform one atom of an Eggex into the re_t IR.

        re_atom: ( char_literal | ...
        """
        assert p_atom.typ == grammar_nt.re_atom, p_atom.typ

        child0 = p_atom.GetChild(0)

        typ0 = p_atom.GetChild(0).typ
        tok0 = p_atom.GetChild(0).tok

        # Non-terminals

        if typ0 == grammar_nt.class_literal:
            # [a-z], not negated
            return re.CharClassLiteral(False, self._ClassLiteral(child0))

        if typ0 == grammar_nt.sq_string:
            # 'foo' -- the SingleQuoted part is stored in the tok slot
            return cast(SingleQuoted, child0.GetChild(1).tok)

        if typ0 == grammar_nt.char_literal:
            # Must be Id.Char_{OneChar,Hex,UBraced}
            assert consts.GetKind(tok0.id) == Kind.Char
            s = word_compile.EvalCStringToken(tok0.id, lexer.TokenVal(tok0))
            return re.LiteralChars(tok0, s)

        # Special punctuation
        if typ0 == Id.Expr_Dot:  # .
            return re.Primitive(tok0, Id.Eggex_Dot)

        if typ0 == Id.Arith_Caret:  # ^
            return re.Primitive(tok0, Id.Eggex_Start)

        if typ0 == Id.Expr_Dollar:  # $
            return re.Primitive(tok0, Id.Eggex_End)

        if typ0 == Id.Expr_Name:
            # d digit -> PosixClass PerlClass etc.
            return self._NameInRegex(None, tok0)

        if typ0 == Id.Expr_Symbol:
            # Validate symbols here, like we validate PerlClass, etc.
            tok_str = lexer.TokenVal(tok0)
            if tok_str == '%start':
                return re.Primitive(tok0, Id.Eggex_Start)
            if tok_str == '%end':
                return re.Primitive(tok0, Id.Eggex_End)
            p_die("Unexpected token %r in regex" % tok_str, tok0)

        if typ0 == Id.Expr_At:
            # | '@' Expr_Name -- splice another Eggex
            tok1 = p_atom.GetChild(1).tok
            return re.Splice(tok0, lexer.TokenVal(tok1))

        if typ0 == Id.Expr_Bang:
            # | '!' (Expr_Name | class_literal)
            # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
            n = p_atom.NumChildren()
            if n == 2:
                child1 = p_atom.GetChild(1)
                if child1.typ == grammar_nt.class_literal:
                    # ![a-z] -- negated class literal
                    return re.CharClassLiteral(True,
                                               self._ClassLiteral(child1))
                else:
                    # !digit -- negated named class
                    return self._NameInRegex(tok0, p_atom.GetChild(1).tok)
            else:
                # Note: !! conflicts with shell history
                p_die(
                    "Backtracking with !! isn't implemented (requires Python/PCRE)",
                    p_atom.GetChild(1).tok)

        if typ0 == Id.Op_LParen:
            # | '(' regex ')'

            # Note: in ERE (d+) is the same as <d+>. That is, Group becomes
            # Capture.
            return re.Group(self._Regex(p_atom.GetChild(1)))

        if typ0 == Id.Arith_Less:
            # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'

            n = p_atom.NumChildren()
            assert n == 4 or n == 6 or n == 8, n

            # < capture d+ >
            regex = self._Regex(p_atom.GetChild(2))

            as_name = None  # type: Optional[Token]
            func_name = None  # type: Optional[Token]

            i = 3  # points at any of > as :

            typ = p_atom.GetChild(i).typ
            if typ == Id.Expr_As:
                as_name = p_atom.GetChild(i + 1).tok
                i += 2

            typ = p_atom.GetChild(i).typ
            if typ == Id.Arith_Colon:
                func_name = p_atom.GetChild(i + 1).tok

            return re.Capture(regex, as_name, func_name)

        raise AssertionError(typ0)
1605
    def _RepeatOp(self, p_repeat):
        # type: (PNode) -> re_repeat_t
        """Transform the repetition operator attached to a regex atom.

        repeat_op: '+' | '*' | '?'
                 | '{' [Expr_Name] ('+' | '*' | '?' | repeat_range) '}'
        """
        assert p_repeat.typ == grammar_nt.repeat_op, p_repeat

        tok = p_repeat.GetChild(0).tok
        id_ = tok.id

        if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
            return tok  # a+ a* a?

        if id_ == Id.Op_LBrace:
            child1 = p_repeat.GetChild(1)
            if child1.typ != grammar_nt.repeat_range:
                # e.g. dot{N *} is .*?
                p_die("Perl-style repetition isn't implemented with libc",
                      child1.tok)

            # repeat_range: (
            #     Expr_DecInt [',']
            #   | ',' Expr_DecInt
            #   | Expr_DecInt ',' Expr_DecInt
            # )

            n = child1.NumChildren()
            if n == 1:  # {3}
                tok = child1.GetChild(0).tok
                return tok  # different operator than + * ?

            if n == 2:
                # NOTE: the first child being a DecInt means the lower bound
                # is present, i.e. {3,}; otherwise only the upper bound {,3}.
                if child1.GetChild(0).typ == Id.Expr_DecInt:  # {3,}
                    left = child1.GetChild(0).tok
                    return re_repeat.Range(left, lexer.TokenVal(left), '',
                                           None)
                else:  # {,3}
                    right = child1.GetChild(1).tok
                    return re_repeat.Range(None, '', lexer.TokenVal(right),
                                           right)

            if n == 3:  # {1,3}
                left = child1.GetChild(0).tok
                right = child1.GetChild(2).tok
                return re_repeat.Range(left, lexer.TokenVal(left),
                                       lexer.TokenVal(right), right)

            raise AssertionError(n)

        raise AssertionError(id_)
1657
1658 def _ReAlt(self, p_node):
1659 # type: (PNode) -> re_t
1660 """
1661 re_alt: (re_atom [repeat_op])+
1662 """
1663 assert p_node.typ == grammar_nt.re_alt
1664
1665 i = 0
1666 n = p_node.NumChildren()
1667 seq = [] # type: List[re_t]
1668 while i < n:
1669 r = self._ReAtom(p_node.GetChild(i))
1670 i += 1
1671 if i < n and p_node.GetChild(i).typ == grammar_nt.repeat_op:
1672 repeat_op = self._RepeatOp(p_node.GetChild(i))
1673 r = re.Repeat(r, repeat_op)
1674 i += 1
1675 seq.append(r)
1676
1677 if len(seq) == 1:
1678 return seq[0]
1679 else:
1680 return re.Seq(seq)
1681
1682 def _Regex(self, p_node):
1683 # type: (PNode) -> re_t
1684 """
1685 regex: [re_alt] (('|'|'or') re_alt)*
1686 """
1687 assert p_node.typ == grammar_nt.regex
1688
1689 n = p_node.NumChildren()
1690 alts = [] # type: List[re_t]
1691 for i in xrange(0, n, 2): # was children[::2]
1692 c = p_node.GetChild(i)
1693 alts.append(self._ReAlt(c))
1694
1695 if len(alts) == 1:
1696 return alts[0]
1697 else:
1698 return re.Alt(alts)
1699
1700
1701# vim: sw=4