#!/usr/bin/env python
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.
"""

from asdl import const

from osh.meta import Id, Kind, LookupKind
from core import braces
from core import word
from core import tdop
from core import util

from osh import arith_parse
from osh.meta import ast, types

word_part_e = ast.word_part_e
word_e = ast.word_e
lex_mode_e = types.lex_mode_e

p_die = util.p_die
log = util.log

# Substitutions can be nested, but which inner subs are allowed depends on the
# outer sub.  See _ReadLeftParts vs. _ReadDoubleQuotedLeftParts.

# lex_mode_e.OUTER
#   All subs and quotes are allowed --
#   $v ${v}   $() ``   $(())   '' ""   $'' $""  <()  >()
#
# lex_mode_e.DQ
#   Var, Command, Arith, but no quotes
#   $v ${v}   $() ``   $(())
#   No process substitution.
#
# lex_mode_e.ARITH:
#   Similar to DQ: Var, Command, Arith sub.  No process sub.  bash has no
#   quotes, but we are changing this in oil.  We are adding ALL FOUR kinds of
#   quotes, because we need those for associative array indexing.
#
# lex_mode_e.VS_ARG_UNQ
#   Like OUTER, except we stop at }.  Everything is allowed, even process
#   substitution.
#
#   ${X:-$v}   ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
#   ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}
#
#   But space is SIGNIFICANT.  ${a:-  b   }
#   So you should NOT just read a bunch of words after :-, unless you also
#   preserve the space tokens between them.
#   In other words, like VS_ARG_DQ, except SINGLE quotes are allowed?
#
# lex_mode_e.VS_ARG_DQ
#   Can't be lex_mode_e.DQ because here we respect $' and $" tokens, while the
#   <( token is not respected.
#
#   Like VS_ARG_UNQ, but single quotes are NOT respected (they appear
#   literally), and process substitution is not respected (ditto).
#
#   "" and $'' and $"" are respected, but not ''.  I need a matrix for this.
#
#   Like DQ, except nested "" and $'' and $"" are RESPECTED.
#
#   It's weird that double quotes are allowed.  Not sure why that would be.
#   Unquoted is also allowed, so " a "b" c ", $'' and $"" are lame, because
#   they don't appear in the DQ context.  I think I should parse those but
#   DISALLOW them.  You should always make $'' and $"" a separate var!
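# Summarizing the notes above as the matrix the last comment asks for.  This
# is my reading of the comments, not independently verified against the lexer
# tables ("oil" marks quotes that are an oil extension in ARITH mode):
#
#               $v ${v}   $() ``   $(())   ''    ""    $'' $""   <() >()
#   OUTER       yes       yes      yes     yes   yes   yes       yes
#   DQ          yes       yes      yes     no    no    no        no
#   ARITH       yes       yes      yes     oil   oil   oil       no
#   VS_ARG_UNQ  yes       yes      yes     yes   yes   yes       yes
#   VS_ARG_DQ   yes       yes      yes     no    yes   yes       no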
class WordParser(object):

  def __init__(self, lexer, line_reader, lex_mode=lex_mode_e.OUTER):
    self.lexer = lexer
    self.line_reader = line_reader
    self.Reset(lex_mode=lex_mode)

  def _Peek(self):
    """Read a token from the lexer if _Next() set a pending lex mode."""
    if self.next_lex_mode is not None:
      self.prev_token = self.cur_token  # for completion
      self.cur_token = self.lexer.Read(self.next_lex_mode)
      self.token_kind = LookupKind(self.cur_token.id)
      self.token_type = self.cur_token.id

      self.next_lex_mode = None
    return self.cur_token

  def _Next(self, lex_mode):
    """Set the next lex state, but don't actually read a token.

    We need this for proper interactive parsing.
    """
    self.next_lex_mode = lex_mode
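  # A sketch of the _Next()/_Peek() protocol used throughout this class (not
  # executed; just illustrating the calling convention):
  #
  #   self._Next(lex_mode_e.ARITH)   # declare the lex mode for the NEXT token
  #   self._Peek()                   # lazily read it, setting self.cur_token,
  #                                  # self.token_type and self.token_kind
  #
  # The actual read is deferred until _Peek() so that, in interactive mode,
  # we don't pull a new line from line_reader before the parser needs it.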
  def Reset(self, lex_mode=lex_mode_e.OUTER):
    """Called by interactive loop."""
    # For _Peek()
    self.prev_token = None  # for completion
    self.cur_token = None
    self.token_kind = Kind.Undefined
    self.token_type = Id.Undefined_Tok

    self.next_lex_mode = lex_mode

    # For newline.  TODO: I think we can do this iteratively, without member
    # state.
    self.cursor = None
    self.cursor_was_newline = False

    self.error_stack = []

  def AddErrorContext(self, msg, *args, **kwargs):
    err = util.ParseError(msg, *args, **kwargs)
    self.error_stack.append(err)

  def Error(self):
    return self.error_stack

  def _BadToken(self, msg, token):
    """
    Args:
      msg: format string with a single %s token
      token: Token
    """
    self.AddErrorContext(msg, token, token=token)

  def PrevToken(self):
    """Inspect state.  Used by completion.

    cur_token is usually Id.Op_Newline \n, so we need the previous one.
    """
    return self.prev_token
  def _ReadVarOpArg(self, arg_lex_mode, eof_type=Id.Undefined_Tok,
                    empty_ok=True):
    # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
    # valid, even when unquoted.
    self._Next(arg_lex_mode)
    self._Peek()

    w = self._ReadCompoundWord(
        lex_mode=arg_lex_mode, eof_type=eof_type, empty_ok=empty_ok)
    # This is for "${s:-}", ${s/a//}, etc.  It is analogous to
    # LooksLikeAssignment, where we turn x= into x=''.  It has the same
    # potential problem of not having spids.
    #
    # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
    # return a CompoundWord with no parts, which is explicitly checked with a
    # custom error message.
    if not w.parts and arg_lex_mode == lex_mode_e.VS_ARG_DQ and empty_ok:
      w.parts.append(ast.EmptyPart())
    return w
  def _ReadSliceArg(self):
    """Read an arithmetic expression for either part of ${a : i+1 : i+2}."""
    anode = self._ReadArithExpr(do_next=False)
    return anode

  def _ReadSliceVarOp(self):
    """ VarOf ':' ArithExpr (':' ArithExpr )? """
    self._Next(lex_mode_e.ARITH)
    self._Peek()
    if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
      begin = None  # no beginning specified
    else:
      begin = self._ReadSliceArg()
      if not begin: return None
      #print('BEGIN', begin)
      #print('BVS2', self.cur_token)

    if self.token_type == Id.Arith_RBrace:
      return ast.Slice(begin, None)  # No length specified

    # Id.Arith_Colon is a pun for Id.VOp2_Colon
    elif self.token_type == Id.Arith_Colon:
      self._Next(lex_mode_e.ARITH)
      length = self._ReadSliceArg()
      if not length: return None

      #print('after colon', self.cur_token)
      return ast.Slice(begin, length)

    else:
      self.AddErrorContext("Unexpected token in slice: %s", self.cur_token)
      return None
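  # Illustrative inputs for the slice grammar above (shell on the left, the
  # rough shape of the result on the right; not executed):
  #
  #   ${a:1:2}  ->  ast.Slice(<1>, <2>)
  #   ${a:1}    ->  ast.Slice(<1>, None)    # no length
  #   ${a::2}   ->  ast.Slice(None, <2>)    # no begin
  #
  # Both operands are full arithmetic expressions, e.g. ${a:i+1:j*2}.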
  def _ReadPatSubVarOp(self, lex_mode):
    """
    Match     = ('/' | '#' | '%') WORD
    VarSub    = ...
              | VarOf '/' Match '/' WORD
    """
    do_all = False
    do_prefix = False
    do_suffix = False

    pat = self._ReadVarOpArg(lex_mode, eof_type=Id.Lit_Slash, empty_ok=False)
    if not pat: return None

    if len(pat.parts) == 1:
      ok, s, quoted = word.StaticEval(pat)
      if ok and s == '/' and not quoted:  # Looks like ${a////c}, read again
        self._Next(lex_mode)
        self._Peek()
        p = ast.LiteralPart(self.cur_token)
        pat.parts.append(p)

    if len(pat.parts) == 0:
      self._BadToken("Pattern must not be empty: %r", token=self.cur_token)
      return None
    else:
      first_part = pat.parts[0]
      if first_part.tag == word_part_e.LiteralPart:
        lit_id = first_part.token.id
        if lit_id == Id.Lit_Slash:
          do_all = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Pound:
          do_prefix = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Percent:
          do_suffix = True
          pat.parts.pop(0)

    #self._Peek()
    if self.token_type == Id.Right_VarSub:
      # e.g. ${v/a} is the same as ${v/a/}  -- empty replacement string
      return ast.PatSub(pat, None, do_all, do_prefix, do_suffix)

    elif self.token_type == Id.Lit_Slash:
      replace = self._ReadVarOpArg(lex_mode)  # do not stop at /
      if not replace: return None

      self._Peek()
      if self.token_type == Id.Right_VarSub:
        return ast.PatSub(pat, replace, do_all, do_prefix, do_suffix)

      else:
        self._BadToken("Expected } after pat sub, got %s", self.cur_token)
        return None

    else:
      self._BadToken("Expected } after pat sub, got %s", self.cur_token)
      return None
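  # Illustrative inputs for pattern substitution (not executed):
  #
  #   ${v/x/y}   ->  PatSub(<x>, <y>, ...)
  #   ${v//x/y}  ->  do_all=True     # leading / popped off the pattern
  #   ${v/#x/y}  ->  do_prefix=True  # anchored at the start
  #   ${v/%x/y}  ->  do_suffix=True  # anchored at the end
  #   ${v/x}     ->  replace=None    # same as ${v/x/}
  #
  # ${a////c} is the tricky case handled above: after the leading / meaning
  # "replace all", the pattern is itself a literal /, so we read one more
  # token instead of treating the pattern as empty.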
  def _ReadSubscript(self):
    """ Subscript = '[' ('@' | '*' | ArithExpr) ']'
    """
    # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
    # expression.
    t2 = self.lexer.LookAhead(lex_mode_e.ARITH)
    if t2.id in (Id.Lit_At, Id.Arith_Star):
      op = ast.WholeArray(t2.id)

      self._Next(lex_mode_e.ARITH)  # skip past [
      self._Peek()
      self._Next(lex_mode_e.ARITH)  # skip past @
      self._Peek()
    else:
      anode = self._ReadArithExpr()
      if not anode:
        return None
      op = ast.ArrayIndex(anode)

    #self._Peek()    # Can't do this here.  Should the test go elsewhere?
    if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
      self._BadToken('Expected ] after subscript, got %s', self.cur_token)
      return None

    self._Next(lex_mode_e.VS_2)  # skip past ]
    self._Peek()  # Needed to be in the same spot as no subscript

    return op
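  # Illustrative inputs for subscripts (not executed):
  #
  #   ${a[@]}    ->  bracket_op = WholeArray(Id.Lit_At)
  #   ${a[*]}    ->  bracket_op = WholeArray(Id.Arith_Star)
  #   ${a[i+1]}  ->  bracket_op = ArrayIndex(<arith node for i+1>)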
  def _ParseVarOf(self):
    """
    VarOf     = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
    """
    self._Peek()
    name_token = self.cur_token
    self._Next(lex_mode_e.VS_2)

    self._Peek()  # Check for []
    if self.token_type == Id.VOp2_LBracket:
      bracket_op = self._ReadSubscript()
      if not bracket_op: return None
    else:
      bracket_op = None

    part = ast.BracedVarSub(name_token)
    part.bracket_op = bracket_op
    return part

  def _ParseVarExpr(self, arg_lex_mode):
    """
    Start parsing at the op -- we already skipped past the name.
    """
    part = self._ParseVarOf()
    if not part: return None

    self._Peek()
    if self.token_type == Id.Right_VarSub:
      return part  # no ops

    # Or maybe this is a VarOpKind

    op_kind = self.token_kind

    if op_kind == Kind.VTest:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after test arg: %s', self.cur_token)
        return None

      part.suffix_op = ast.StringUnary(op_id, arg_word)

    elif op_kind == Kind.VOp1:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after unary op: %s', self.cur_token)
        return None

      op = ast.StringUnary(op_id, arg_word)
      part.suffix_op = op

    elif op_kind == Kind.VOp2:
      if self.token_type == Id.VOp2_Slash:
        op = self._ReadPatSubVarOp(arg_lex_mode)
        if not op: return None
        # Checked by the method above
        assert self.token_type == Id.Right_VarSub, self.cur_token

      elif self.token_type == Id.VOp2_Colon:
        op = self._ReadSliceVarOp()
        if not op: return None
        if self.token_type != Id.Arith_RBrace:
          self._BadToken('Unexpected token after slice: %s', self.cur_token)
          return None

      else:
        p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

      part.suffix_op = op

    # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
    # mode.  It's redundantly checked above.
    if self.token_type not in (Id.Right_VarSub, Id.Arith_RBrace):
      self._BadToken('Unexpected token after var sub: %s', self.cur_token)
      return None

    # Now look for ops
    return part
  def _ReadBracedBracedVarSub(self, d_quoted=False):
    """For the ${} expression language.

    NAME        = [a-zA-Z_][a-zA-Z0-9_]*
    NUMBER      = [0-9]+                    # ${10}, ${11}, ...

    Subscript   = '[' ('@' | '*' | ArithExpr) ']'
    VarSymbol   = '!' | '@' | '#' | ...
    VarOf       = NAME Subscript?
                | NUMBER      # no subscript allowed, none of these are arrays
                              # ${@[1]} doesn't work, even though slicing does
                | VarSymbol

    TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
    STRIP_OP    = '#' | '##' | '%' | '%%'
    CASE_OP     = ',' | ',,' | '^' | '^^'

    UnaryOp     = TEST_OP | STRIP_OP | CASE_OP | ...
    Match       = ('/' | '#' | '%') WORD       # match all / prefix / suffix
    VarExpr     = VarOf
                | VarOf UnaryOp WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

    LengthExpr  = '#' VarOf  # can't apply operators after length

    RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                               # ${!ref[0]} vs ${!keys[@]} resolved later

    PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

    VarSub      = LengthExpr
                | RefOrKeys
                | PrefixQuery
                | VarExpr

    NOTES:
    - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
      slicing ${a:x+1:y+2}
    - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
    - @ and * are technically arithmetic expressions in this implementation
    - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note that
      it's also vectorized.

    Strictness over bash:
    - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
      grammar
    - ! and # prefixes can't be composed, even though named refs can be
      composed with other operators
    - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
      a prefix, and it can also be a literal part of WORD.

    From the parser's point of view, the prefix # can't be combined with
    UnaryOp/slicing/matching, and the ! can.  However:

    ${a[@]:1:2} is not allowed
    ${#a[@]:1:2} is allowed, but gives the wrong answer
    """
    left_spid = self.cur_token.span_id

    if d_quoted:
      arg_lex_mode = lex_mode_e.VS_ARG_DQ
    else:
      arg_lex_mode = lex_mode_e.VS_ARG_UNQ

    self._Next(lex_mode_e.VS_1)
    self._Peek()

    ty = self.token_type

    if ty == Id.VSub_Pound:
      # Disambiguate
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t# LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name, '#' is the prefix
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarOf()

        self._Peek()
        if self.token_type != Id.Right_VarSub:
          self._BadToken("Expected } after length expression, got %r",
              self.cur_token)
          return None

        part.prefix_op = Id.VSub_Pound  # length

      else:  # not a prefix, '#' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    elif ty == Id.VSub_Bang:
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t! LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name, '!' is the prefix
        # ${!a} -- this is a ref
        # ${!3} -- this is a ref
        # ${!a[1]} -- this is a ref
        # ${!a[@]} -- this is a keys query
        # No lookahead -- do it in a second step, or at runtime
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

        part.prefix_op = Id.VSub_Bang

      else:  # not a prefix, '!' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    # VS_NAME, VS_NUMBER, symbol that isn't # or !
    elif self.token_kind == Kind.VSub:
      part = self._ParseVarExpr(arg_lex_mode)
      if not part: return None

    else:
      # e.g. ${^}
      p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

    part.spids.append(left_spid)

    # Does this work?
    right_spid = self.cur_token.span_id
    part.spids.append(right_spid)

    return part
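  # Illustrative inputs for the ${} dispatch above (not executed):
  #
  #   ${#s}   ->  '#' then a name: prefix_op = Id.VSub_Pound (length)
  #   ${#}    ->  '#' then }: '#' is itself the variable, i.e. $#
  #   ${!x}   ->  '!' then a name: prefix_op = Id.VSub_Bang (ref or keys)
  #   ${x:-y} ->  plain VarExpr with a Kind.VTest suffix op
  #
  # The one-token LookAhead() in VS_1 mode is what separates the first two
  # cases from each other.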
  def _ReadSingleQuotedPart(self, lex_mode):
    left = self.cur_token
    tokens = []

    done = False
    while not done:
      self._Next(lex_mode)
      self._Peek()

      # Kind.Char emitted in DOLLAR_SQ state
      if self.token_kind in (Kind.Lit, Kind.Char):
        tokens.append(self.cur_token)

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext('Unexpected EOF in single-quoted string')
        return False

      elif self.token_kind == Kind.Right:
        done = True  # assume Id.Right_SingleQuote

      else:
        raise AssertionError(
            'Unhandled token in single-quoted part %s (%d)' %
            (self.cur_token, self.token_kind))

    return ast.SingleQuotedPart(left, tokens)
  def _ReadDoubleQuotedLeftParts(self):
    """Read substitution parts in a double quoted context."""
    if self.token_type in (Id.Left_CommandSub, Id.Left_Backtick):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=True)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError(self.cur_token)

  def _ReadLeftParts(self):
    """Read substitutions and quoted strings."""

    if self.token_type == Id.Left_DoubleQuote:
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_DollarDoubleQuote:
      # NOTE: $"" is treated as "" for now.  Does it make sense to add the
      # token to the part?
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_SingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.SQ)

    if self.token_type == Id.Left_DollarSingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.DOLLAR_SQ)

    if self.token_type in (
        Id.Left_CommandSub, Id.Left_Backtick, Id.Left_ProcSubIn,
        Id.Left_ProcSubOut):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=False)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError('%s not handled' % self.cur_token)
  def _ReadExtGlobPart(self):
    """
    Grammar:
      Item         = CompoundWord | EPSILON  # important: @(foo|) is allowed
      LEFT         = '@(' | '*(' | '+(' | '?(' | '!('
      RIGHT        = ')'
      ExtGlob      = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
      CompoundWord includes ExtGlobPart
    """
    left_token = self.cur_token
    arms = []
    part = ast.ExtGlobPart(left_token, arms)  # return value
    part.spids.append(left_token.span_id)

    self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
    self._Next(lex_mode_e.EXTGLOB)  # advance past LEFT

    read_word = False  # did we just read a word?  To handle @(||).

    while True:
      self._Peek()
      #log('t %r', self.cur_token)

      if self.token_type == Id.Right_ExtGlob:
        if not read_word:
          arms.append(ast.CompoundWord())
        part.spids.append(self.cur_token.span_id)
        break

      elif self.token_type == Id.Op_Pipe:
        if not read_word:
          arms.append(ast.CompoundWord())
        read_word = False
        self._Next(lex_mode_e.EXTGLOB)

      # lex mode EXTGLOB should only produce these 4 kinds of tokens
      elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
        w = self._ReadCompoundWord(lex_mode=lex_mode_e.EXTGLOB)
        arms.append(w)
        read_word = True

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext(
            'Unexpected EOF reading extended glob that began here',
            token=left_token)
        return None

      else:
        raise AssertionError('Unexpected token %r' % self.cur_token)

    return part
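  # Illustrative input for the extended glob grammar (not executed):
  #
  #   @(foo|bar|)  ->  ExtGlobPart with three arms: <foo>, <bar>, and an
  #                    empty CompoundWord for the trailing |, which is what
  #                    the read_word flag above arranges.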
  def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
    """
    Args:
      eof_type: for stopping at }, Id.Lit_RBrace
      here_doc: Whether we are reading in a here doc context

    Also used for ${foo%%a b c}: treat the operator argument as double quoted
    until you hit the stop token.
    """
    quoted_part = ast.DoubleQuotedPart()
    left_spid = const.NO_INTEGER
    right_spid = const.NO_INTEGER  # gets set later

    if self.cur_token is not None:  # None in here doc case
      left_spid = self.cur_token.span_id

    done = False
    while not done:
      self._Next(lex_mode_e.DQ)
      self._Peek()
      #print(self.cur_token)

      if self.token_type == eof_type:  # e.g. stop at }
        done = True
        continue

      elif self.token_kind == Kind.Lit:
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Left:
        part = self._ReadDoubleQuotedLeftParts()
        if not part:
          return None
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Right:
        assert self.token_type == Id.Right_DoubleQuote
        if here_doc:
          # Turn Id.Right_DoubleQuote into a literal part
          quoted_part.parts.append(ast.LiteralPart(self.cur_token))
        else:
          done = True  # assume Id.Right_DoubleQuote
          right_spid = self.cur_token.span_id

      elif self.token_kind == Kind.Eof:
        if here_doc:  # here docs will have an EOF in their token stream
          done = True
        else:
          self.AddErrorContext(
              'Unexpected EOF reading double-quoted string that began here',
              span_id=left_spid)
          return False

      else:
        raise AssertionError(self.cur_token)

    quoted_part.spids.extend((left_spid, right_spid))
    return quoted_part
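  # Illustrative input for double-quoted parsing (not executed):
  #
  #   "a $b $(c)"  ->  DoubleQuotedPart with parts:
  #                    LiteralPart('a '), SimpleVarSub($b), LiteralPart(' '),
  #                    CommandSubPart(<c>)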
  def _ReadCommandSubPart(self, token_type):
    """
    NOTE: This is not in the grammar, because word parts aren't in the grammar!

    command_sub = '$(' command_list ')'
    """
    left_token = self.cur_token
    left_spid = left_token.span_id

    #print('_ReadCommandSubPart', self.cur_token)
    self._Next(lex_mode_e.OUTER)  # advance past $( or `

    # Set the lexer in a state so ) becomes the EOF token.
    #print('_ReadCommandSubPart lexer.PushHint ) -> EOF')
    if token_type in (
        Id.Left_CommandSub, Id.Left_ProcSubIn, Id.Left_ProcSubOut):
      self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
    elif token_type == Id.Left_Backtick:
      self.lexer.PushHint(Id.Left_Backtick, Id.Eof_Backtick)
    else:
      raise AssertionError(self.token_type)

    from osh import parse_lib
    c_parser = parse_lib.MakeParserForCommandSub(self.line_reader, self.lexer)

    node = c_parser.ParseWholeFile()  # `` and $() allowed
    if not node:
      # Example of parse error:
      # echo $(cat |)  OR
      # echo `cat |`
      error_stack = c_parser.Error()
      self.error_stack.extend(error_stack)
      print(self.error_stack)
      self.AddErrorContext('Error parsing command list in command sub')
      return None

    # Hm this creates its own word parser, which is thrown away?
    #print('X', self.cur_token)
    right_spid = c_parser.w_parser.cur_token.span_id

    cs_part = ast.CommandSubPart(node, left_token)
    cs_part.spids.append(left_spid)
    cs_part.spids.append(right_spid)
    return cs_part
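  # A note on the PushHint mechanism used above (my reading of the code):
  # the lexer is told to translate the ) that closes the command sub into
  # Id.Eof_RParen, so the sub-parser sees an ordinary EOF.  Roughly, for
  # $(echo hi), the token stream is:
  #
  #   Left_CommandSub  <tokens for 'echo hi'>  Eof_RParen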
  def _ReadArithExpr(self, do_next=True):
    """Read and parse an arithmetic expression in various contexts.

    $(( 1+2 ))
    (( a=1+2 ))
    ${a[ 1+2 ]}
    ${a : 1+2 : 1+2}

    See tests/arith-context.test.sh for ambiguous cases.

    ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

    ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

    TODO: Instead of having an eof_type, I think we should just run the arith
    parser until it's done.  That will take care of both : and ].  We switch
    the state back.

    See the assertion in ArithParser.Parse() -- unexpected extra input.
    """
    if do_next:
      self._Next(lex_mode_e.ARITH)
    # calls self.ReadWord(lex_mode_e.ARITH)
    a_parser = tdop.TdopParser(arith_parse.SPEC, self)
    anode = a_parser.Parse()
    if not anode:
      error_stack = a_parser.Error()
      self.error_stack.extend(error_stack)
    return anode  # could be None
  def _ReadArithSubPart(self):
    """
    Read an arith substitution, which contains an arith expression, e.g.
    $((a + 1)).
    """
    left_span_id = self.cur_token.span_id

    # The second ) needs to be disambiguated in cases like:
    # $(echo $(( 1+2 )) )
    self.lexer.PushHint(Id.Op_RParen, Id.Right_ArithSub)

    # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
    # could save the lexer/reader state here, and retry if the arithmetic parse
    # fails.  But we can almost always catch this at parse time.  There could
    # be some exceptions like:
    # $((echo * foo))  # looks like multiplication
    # $((echo / foo))  # looks like division

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
          self.cur_token)
      return None

    self._Next(lex_mode_e.OUTER)  # TODO: This could be DQ or ARITH too

    # PROBLEM: $(echo $(( 1 + 2 )) )
    # Two right parens break the Id.Eof_RParen scheme
    self._Peek()
    if self.token_type != Id.Right_ArithSub:
      self._BadToken('Expected second paren to end arith sub, got %s',
          self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node
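  # Illustrative token sequence for $(( 1 + 2 )) (not executed): the arith
  # parser stops at the first ), read as Id.Arith_RParen, and the second )
  # must be Id.Right_ArithSub because of the PushHint above:
  #
  #   Left_ArithSub  <1 + 2>  Arith_RParen  Right_ArithSub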
  def _ReadArithSub2Part(self):
    """Non-standard arith sub $[a + 1]."""
    left_span_id = self.cur_token.span_id

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RBracket:
      self.AddErrorContext("Expected ], got %s", self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node
  def ReadDParen(self):
    """Read ((1 + 2))  -- command context.

    We're using the word parser because it's very similar to _ReadArithExpr
    above.
    """
    # The second ) needs to be disambiguated, as in _ReadArithSubPart above.
    # TODO: Be consistent with ReadForExpression below and use lex_mode_e.ARITH?
    # Then you can get rid of this.
    self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing dparen statement")
      return None

    #print('xx ((', self.cur_token)
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
          self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    # PROBLEM: $(echo $(( 1 + 2 )) )
    self._Peek()
    if self.token_type != Id.Op_DRightParen:
      self._BadToken('Expected second paren to end arith sub, got %s',
          self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return anode
  def ReadForExpression(self):
    """Read ((i=0; i<5; ++i)) -- part of command context."""
    # No PushHint because we're in arith state.
    #self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    self._Next(lex_mode_e.ARITH)  # skip over ((

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty init')
      init_node = None
    else:
      init_node = self._ReadArithExpr(do_next=False)
      if not init_node:
        self.AddErrorContext("Error parsing for init")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('INIT',init_node)

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty condition')
      cond_node = None
    else:
      cond_node = self._ReadArithExpr(do_next=False)
      if not cond_node:
        self.AddErrorContext("Error parsing for cond")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('COND',cond_node)

    self._Peek()
    if self.token_type == Id.Arith_RParen:
      #print('Got empty update')
      update_node = None
    else:
      update_node = self._ReadArithExpr(do_next=False)
      if not update_node:
        self.AddErrorContext("Error parsing for update")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('UPDATE',update_node)

    #print('TT', self.cur_token)
    # Second paren
    self._Peek()
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected right paren to end for loop expression, got %s',
          self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return ast.ForExpr(init_node, cond_node, update_node)
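  # Illustrative inputs for the C-style loop header (not executed):
  #
  #   ((i=0; i<5; ++i))  ->  ForExpr(<i=0>, <i<5>, <++i>)
  #   ((;;))             ->  ForExpr(None, None, None)  # all clauses empty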
  def _ReadArrayLiteralPart(self):
    self._Next(lex_mode_e.OUTER)  # advance onto the (
    self._Peek()
    if self.cur_token.id != Id.Op_LParen:
      self.AddErrorContext('Expected ( after =', token=self.cur_token)
      return None

    # MUST use a new word parser (with same lexer).
    w_parser = WordParser(self.lexer, self.line_reader)
    words = []
    while True:
      w = w_parser.ReadWord(lex_mode_e.OUTER)
      if not w:
        self.error_stack.extend(w_parser.Error())
        return None

      if w.tag == word_e.TokenWord:
        word_id = word.CommandId(w)
        if word_id == Id.Right_ArrayLiteral:
          break
        # Unlike command parsing, array parsing allows embedded \n.
        elif word_id == Id.Op_Newline:
          continue
        else:
          self.AddErrorContext(
              'Unexpected word in array literal: %s', w, word=w)
          return None

      words.append(w)

    words2 = braces.BraceDetectAll(words)
    words3 = word.TildeDetectAll(words2)

    return ast.ArrayLiteralPart(words3)
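  # Illustrative input for array literals (not executed).  Unlike command
  # parsing, newlines are allowed between elements:
  #
  #   a=(1
  #      2 3)   ->  ArrayLiteralPart with three words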
  def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
                        lex_mode=lex_mode_e.OUTER, empty_ok=True):
    """
    Precondition: Looking at the first token of the first word part
    Postcondition: Looking at the token after, e.g. space or operator

    NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
    could be an operator delimiting a compound word.  Can we change lexer modes
    and remove this special case?
    """
    #print('_ReadCompoundWord', lex_mode)
    word = ast.CompoundWord()

    num_parts = 0
    done = False
    while not done:
      allow_done = empty_ok or num_parts != 0
      self._Peek()
      #print('CW',self.cur_token)
      if allow_done and self.token_type == eof_type:
        done = True  # e.g. for ${foo//pat/replace}

      # Keywords like "for" are treated like literals
      elif self.token_kind in (
          Kind.Lit, Kind.KW, Kind.Assign, Kind.ControlFlow, Kind.BoolUnary,
          Kind.BoolBinary):
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
          #part.xspans.append(self.cur_token.span_id)

        word.parts.append(part)

        if self.token_type == Id.Lit_VarLike:
          #print('@', self.cursor)
          #print('@', self.cur_token)

          t = self.lexer.LookAhead(lex_mode_e.OUTER)
          if t.id == Id.Op_LParen:
            self.lexer.PushHint(Id.Op_RParen, Id.Right_ArrayLiteral)
            part2 = self._ReadArrayLiteralPart()
            if not part2:
              self.AddErrorContext('_ReadArrayLiteralPart failed')
              return False
            word.parts.append(part2)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        word.parts.append(part)

      elif self.token_kind == Kind.ExtGlob:
        part = self._ReadExtGlobPart()
        if not part:
          return None
        word.parts.append(part)

      elif self.token_kind == Kind.Left:
        #print('_ReadLeftParts')
        part = self._ReadLeftParts()
        if not part:
          return None
        word.parts.append(part)

      # NOT done yet, will advance below
      elif self.token_kind == Kind.Right:
        # Still part of the word; will be done on the next iter.
        if self.token_type == Id.Right_DoubleQuote:
          pass
        elif self.token_type == Id.Right_CommandSub:
          pass
        elif self.token_type == Id.Right_Subshell:
          # LEXER HACK for (case x in x) ;; esac )
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
            self._Next(lex_mode)
          done = True
        else:
          done = True

      elif self.token_kind == Kind.Ignored:
        done = True

      else:
        # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
        # so to test for ESAC, we can read ) before getting a chance to
        # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
        # token and do it again.

        # We get Id.Op_RParen at top level:      case x in x) ;; esac
        # We get Id.Eof_RParen inside ComSub:  $(case x in x) ;; esac )
        if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            if self.token_type == Id.Eof_RParen:
              # Redo translation
              self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
            self._Next(lex_mode)

        done = True  # anything we don't recognize means we're done

      if not done:
        self._Next(lex_mode)
      num_parts += 1
    return word
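  # Illustrative input for compound words (not executed):
  #
  #   foo$bar"baz"  ->  CompoundWord with parts:
  #                     LiteralPart(foo), SimpleVarSub($bar),
  #                     DoubleQuotedPart(<baz>)
  #
  # Adjacent parts have no separators; the word ends at whitespace, an
  # operator, or eof_type.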
  def _ReadArithWord(self):
    """Helper function for ReadWord()."""
    #assert self.token_type != Id.Undefined_Tok
    self._Peek()
    #print('_ReadArithWord', self.cur_token)

    if self.token_kind == Kind.Unknown:
      self.AddErrorContext("Unknown token in arith context: %s",
          self.cur_token, token=self.cur_token)
      return None, False

    elif self.token_kind == Kind.Eof:
      # Just return EOF token
      w = ast.TokenWord(self.cur_token)
      return w, False
      #self.AddErrorContext("Unexpected EOF in arith context: %s",
      #    self.cur_token, token=self.cur_token)
      #return None, False

    elif self.token_kind == Kind.Ignored:
      # Space should be ignored.  TODO: change this to SPACE_SPACE and
      # SPACE_NEWLINE?  or SPACE_TOK.
      self._Next(lex_mode_e.ARITH)
      return None, True  # Tell wrapper to try again

    elif self.token_kind in (Kind.Arith, Kind.Right):
      # Id.Right_ArithSub IS just a normal token, handled by ArithParser
      self._Next(lex_mode_e.ARITH)
      w = ast.TokenWord(self.cur_token)
      return w, False

    elif self.token_kind in (Kind.Lit, Kind.Left):
      w = self._ReadCompoundWord(lex_mode=lex_mode_e.ARITH)
      if not w:
        return None, True
      return w, False

    elif self.token_kind == Kind.VSub:
      part = ast.SimpleVarSub(self.cur_token)
      self._Next(lex_mode_e.ARITH)
      w = ast.CompoundWord([part])
      return w, False

    else:
      self._BadToken("Unexpected token parsing arith sub: %s", self.cur_token)
      return None, False

    raise AssertionError("Shouldn't get here")
  def _ReadWord(self, lex_mode):
    """Helper function for ReadWord().

    Returns:
      2-tuple (word, need_more)
        word: Word, or None if there was an error, or need_more is set
        need_more: True if the caller should call us again
    """
    #print('_Read', lex_mode, self.cur_token)
    self._Peek()

    if self.token_kind == Kind.Eof:
      # No advance
      return ast.TokenWord(self.cur_token), False

    # Allow Arith for ) at end of for loop?
    elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
      self._Next(lex_mode)
      if self.token_type == Id.Op_Newline:
        if self.cursor_was_newline:
          #print('SKIP(nl)', self.cur_token)
          return None, True

      return ast.TokenWord(self.cur_token), False

    elif self.token_kind == Kind.Right:
      #print('WordParser.Read: Kind.Right', self.cur_token)
      if self.token_type not in (
          Id.Right_Subshell, Id.Right_FuncDef, Id.Right_CasePat,
          Id.Right_ArrayLiteral):
        raise AssertionError(self.cur_token)

      self._Next(lex_mode)
      return ast.TokenWord(self.cur_token), False

    elif self.token_kind in (Kind.Ignored, Kind.WS):
      self._Next(lex_mode)
      return None, True  # tell ReadWord() to try again

    elif self.token_kind in (
        Kind.VSub, Kind.Lit, Kind.Left, Kind.KW, Kind.Assign, Kind.ControlFlow,
        Kind.BoolUnary, Kind.BoolBinary, Kind.ExtGlob):
      # We're beginning a word.  If we see Id.Lit_Pound, change to
      # lex_mode_e.COMMENT and read until end of line.  (TODO: How to add
      # comments to AST?)

      # TODO: Can we do the same thing for Tilde here?  Enter a state where we
      # look for / too.
      if self.token_type == Id.Lit_Pound:
        self._Next(lex_mode_e.COMMENT)
        self._Peek()

        # NOTE: The # could be the last character in the file.  It can't be
        # Eof_{RParen,Backtick} because #) and #` are comments.
        assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
            self.cur_token

        # The next iteration will go into Kind.Ignored and set lex state to
        # lex_mode_e.OUTER/etc.
        return None, True  # tell ReadWord() to try again after comment

      else:
        w = self._ReadCompoundWord(lex_mode=lex_mode)
        if not w:
          self.AddErrorContext(
              'Error reading command word', token=self.cur_token)
          return None, False
        return w, False

    else:
      raise AssertionError(
          'Unhandled: %s (%s)' % (self.cur_token, self.token_kind))

    raise AssertionError("Shouldn't get here")
  def LookAhead(self):
    """Look ahead to the next token.

    For the command parser to recognize func () { } and array=(1 2 3).  And
    probably coprocesses.
    """
    assert self.token_type != Id.Undefined_Tok
    if self.cur_token.id == Id.WS_Space:
      t = self.lexer.LookAhead(lex_mode_e.OUTER)
    else:
      t = self.cur_token
    return t.id

  def ReadWord(self, lex_mode):
    """Read the next Word.

    Returns:
      Word, or None if there was an error
    """
    # Implementation note: This is a stateful/iterative function that calls
    # the stateless _ReadWord() helper.
    while True:
      if lex_mode == lex_mode_e.ARITH:
        # TODO: Can this be unified?
        w, need_more = self._ReadArithWord()
      elif lex_mode in (
          lex_mode_e.OUTER, lex_mode_e.DBRACKET, lex_mode_e.BASH_REGEX):
        w, need_more = self._ReadWord(lex_mode)
      else:
        raise AssertionError('Invalid lex state %s' % lex_mode)
      if not need_more:
        break

    if not w:  # Assumes AddErrorContext was already called
      return None

    self.cursor = w

    # TODO: Do consolidation of newlines in the lexer?
    # Note that there can be an infinite (Id.Ignored_Comment Id.Op_Newline
    # Id.Ignored_Comment Id.Op_Newline) sequence, so we have to keep track of
    # the last non-ignored token.
    self.cursor_was_newline = (word.CommandId(self.cursor) == Id.Op_Newline)
    return self.cursor
  def ReadHereDocBody(self):
    """
    Sort of like ReadWord(), except we're in a double quoted context, but not
    using double quotes.

    Returns:
      CompoundWord.  NOTE: We could also just use a DoubleQuotedPart for both
      cases?
    """
    w = ast.CompoundWord()
    dq = self._ReadDoubleQuotedPart(here_doc=True)
    if not dq:
      self.AddErrorContext('Error parsing here doc body')
      return False
    w.parts.append(dq)
    return w
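
# A minimal usage sketch (hypothetical; not part of this module).  Real
# callers go through osh/parse_lib.py, which owns lexer and line_reader
# construction, so the InitLexer helper below is an assumption for
# illustration, not the actual parse_lib API:
#
#   from osh import parse_lib
#
#   line_reader, lexer = parse_lib.InitLexer('echo ${x:-default}')  # hypothetical
#   w_parser = WordParser(lexer, line_reader)
#
#   while True:
#     w = w_parser.ReadWord(lex_mode_e.OUTER)
#     if not w:
#       break  # parse error; inspect w_parser.Error()
#     if word.CommandId(w) == Id.Eof_Real:
#       break
#     print(w)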