| 1 | """expr_parse.py."""
 | 
from __future__ import print_function

from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
                                       CommandSub, ShArrayLiteral,
                                       CompoundWord, word_part_t, word_e)
from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
from _devbuild.gen.types_asdl import lex_mode_e

from core import ui
from core.error import p_die
from frontend import consts
from frontend import lexer
from frontend import reader
from mycpp import mylib
from mycpp.mylib import log, tagswitch
from osh import braces
from osh import word_
from osh import word_compile
from pgen2 import parse
from pgen2.pnode import PNodeAllocator

_ = log

from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from pgen2.grammar import Grammar
    from pgen2.pnode import PNode

if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Prints a tree of PNode instances."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None

            ind = '  ' * indent
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some redundancy to get
            #   rid of.
            if pnode.tok:
                if isinstance(pnode.tok, Token):
                    v = lexer.TokenVal(pnode.tok)
                else:
                    # e.g. CommandSub for x = $(echo hi)
                    v = repr(pnode.tok)
            else:
                v = '-'
            self.f.write('%s%d %s %s\n' % (ind, i, self.names[pnode.typ], v))
            if pnode.children is not None:
                for i, child in enumerate(pnode.children):
                    self._Print(child, indent + 1, i)

        def Print(self, pnode):
            # type: (PNode) -> None
            self._Print(pnode, 0, 0)

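    # Usage sketch for ParseTreePrinter (illustrative, not from the original
    # file): 'names' maps grammar symbol numbers to strings, e.g. the
    # number -> name table of the loaded Grammar.
    #
    #     printer = ParseTreePrinter(names)
    #     printer.Print(pnode)
    #
    # Each output line shows indentation (tree depth), the child index, the
    # symbol name, and the token value, or '-' for nodes without a token.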

def _Classify(gr, tok):
    # type: (Grammar, Token) -> int

    # We have to match up what ParserGenerator.make_grammar() did when
    # calling make_label() and make_first().  See classify() in
    # opy/pgen2/driver.py.

    id_ = tok.id  # mycpp fix: we need C++ to do uint16_t -> int conversion

    # TODO: use something more efficient than a Dict
    if id_ in gr.tokens:
        return gr.tokens[id_]

    if id_ == Id.Unknown_DEqual:
        p_die('Use === to be exact, or ~== to convert types', tok)

    if id_ == Id.Unknown_Tok:
        type_str = ''
    else:
        type_str = ' (%s)' % ui.PrettyId(tok.id)
    p_die('Unexpected token in expression mode%s' % type_str, tok)


# Newlines are ignored between these pairs.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
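# Illustrative example (not from the original file): in
#
#     var x = [
#       1,
#       2,
#     ]
#
# the Op_Newline tokens between [ and ] are skipped because the balance
# computed from _OTHER_BALANCE is positive there, while a newline at the top
# level (balance == 0) is passed through to the grammar.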


def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Returns the last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

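    # Protocol (summary, added for clarity): each token is mapped to a pgen2
    # ilabel by _Classify() and pushed with p.addtoken(), which returns True
    # once the start symbol is complete.  Sub-language constructs -- $() and
    # friends, quoted strings, ${} -- are parsed by recursing into the
    # command/word parsers below, and each resulting AST node is pushed as a
    # single Expr_CastedDummy "token".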
    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar simpler, and
            # it's consistent with CPython's lexer and our own WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :|  %(  LEGACY!
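            # Illustrative inputs for this branch (not from the original
            # file):
            #
            #     var a = :| one two |    # YSH array literal
            #     var b = %(one two)      # legacy spelling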
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected | to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):
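            # Illustrative inputs for this branch (not from the original
            # file):
            #
            #     var s = $(echo hi)     # command sub
            #     var parts = @(seq 3)   # split command sub
            #     var cmd = ^(echo hi)   # unevaluated command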

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # "   $"   """   $"""   ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote,
                      Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote,
                      Id.Left_CaretDoubleQuote):
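            # Illustrative inputs for this branch (not from the original
            # file):
            #
            #     var s = "hello $name"    # interpolation
            #     var t = """
            #       multi-line
            #       """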

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
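            # Illustrative input for this branch (not from the original
            # file):
            #
            #     var s = ${PATH}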
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x'  r'''x'''
        # u'x'  u'''x'''
        # b'x'  b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
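            # Illustrative inputs for this branch (not from the original
            # file):
            #
            #     var a = 'raw'            # SQ_Raw
            #     var b = u'unicode'       # J8_Str
            #     var c = $'tab\t'         # SQ_C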
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        raise parse.ParseError("incomplete input", tok.id, tok)


class ExprParser(object):
    """A wrapper around a pgen2 parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # Reused multiple times.
        self.push_parser = parse.Parser(gr)
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]

        # Reuse the parser
        self.push_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr,
                                        self.push_parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            #   - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able to
            # tickle it.  Maybe it's because of the Eof tokens?

            p_die(
                'Syntax error in expression (near %s)' % ui.PrettyId(e.tok.id),
                e.tok)

        return self.push_parser.rootnode, last_token


class ctx_PNodeAllocator(object):
    """Context manager that owns the PNodeAllocator for an ExprParser."""

    def __init__(self, ep):
        # type: (ExprParser) -> None
        self.expr_parser = ep
        self.expr_parser.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.expr_parser.pnode_alloc.Clear()
        self.expr_parser.pnode_alloc = None
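

# Usage sketch (illustrative; not from the original file).  In the real
# interpreter, parse_ctx comes from frontend.parse_lib, lex is a
# frontend.lexer.Lexer, and start_symbol is a nonterminal number from the
# compiled YSH grammar ('ysh_grammar' below is a hypothetical name):
#
#     ep = ExprParser(parse_ctx, ysh_grammar)
#     with ctx_PNodeAllocator(ep):
#         pnode, last_token = ep.Parse(lex, start_symbol)
#         # ... convert pnode to an expression AST (expr_to_ast) while the
#         # PNodes are still live ...
#
# ctx_PNodeAllocator bounds the lifetime of PNode allocations: __exit__ calls
# Clear(), so the parse tree must be consumed inside the 'with' block.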
 |