# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
  $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
  $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

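# A word ends when the lexer produces a token of one of these kinds: EOF,
# whitespace, an operator like ';', or a closing token like ')'.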
|
KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
|
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
|
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()

            if self.token_type != Id.Arith_RBrace:
                length = self._ReadArithExpr(Id.Arith_RBrace)
            else:
                # quirky bash behavior:
                #   ${a:1:} or ${a::} means length ZERO
                #   but ${a:1} or ${a:} means length N
                length = arith_expr.EmptyZero

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
|
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        #   echo ${x/#/replace} has an empty pattern
        #   echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now.  However I
                    # think ${x %.3f} should be statically parsed?  It can
                    # enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; it doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                  # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER   # no subscript allowed, none of these are arrays
                               # ${@[1]} doesn't work, even though slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '   # ${x|html} and ${x %.3f}.
                                  # SPACE is operator not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf   # can't apply operators after length

        RefOrKeys   = '!' VarExpr # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub  = '.' WORD+   # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
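            # ${#foo} -- length of foo; '#' is a prefix operator
            # ${#}    -- number of positional parameters; '#' is the variable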
|
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a}    -- this is a ref
                # ${!3}    -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this lists the keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns the last token.

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0
|
| 673 |
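        # A triple-quoted string like ''' ends only at three CONSECUTIVE
        # Right_SingleQuote tokens; a lone ' inside resets the count below
        # and stays part of the string.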
|
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
|
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

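        # PushHint retags the next ')' from the lexer as Id.Right_ExtGlob, so
        # it closes this construct instead of acting as an ordinary operator.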
|
        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars,
            # to allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: we don't catch 'x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add support
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also handles ${foo%%a b c} -- the argument is treated as if it were
        double quoted, until you hit the closing }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)
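            # The backtick body is re-parsed as if it were a brand new
            # source file; source.Reparsed below records where it came from.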
|

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
|
| 1236 | self._SetNext(lex_mode_e.Expr)
|
| 1237 | enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
|
| 1238 | # Hack to move } from what the Expr lexer modes gives to what CommandParser
|
| 1239 | # wants
|
| 1240 | if last_token.id == Id.Op_RBrace:
|
| 1241 | last_token.id = Id.Lit_RBrace
|
| 1242 |
|
| 1243 | # Let the CommandParser see the Op_Semi or Op_Newline.
|
| 1244 | self.buffered_word = last_token
|
| 1245 | self._SetNext(lex_mode_e.ShCommand) # always back to this
|
| 1246 | return enode
|
| 1247 |
|
| 1248 | def ParseMutation(self, kw_token, var_checker):
|
| 1249 | # type: (Token, VarChecker) -> command.Mutation
|
| 1250 | """
|
| 1251 | setvar i = 42
|
| 1252 | setvar i += 1
|
| 1253 | setvar a[i] = 42
|
| 1254 | setvar a[i] += 1
|
| 1255 | setvar d.key = 42
|
| 1256 | setvar d.key += 1
|
| 1257 | """
|
| 1258 | self._SetNext(lex_mode_e.Expr)
|
| 1259 | enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
|
| 1260 | # Hack to move } from what the Expr lexer modes gives to what CommandParser
|
| 1261 | # wants
|
| 1262 | if last_token.id == Id.Op_RBrace:
|
| 1263 | last_token.id = Id.Lit_RBrace
|
| 1264 |
|
| 1265 | for lhs in enode.lhs:
|
| 1266 | UP_lhs = lhs
|
| 1267 | with tagswitch(lhs) as case:
|
| 1268 | if case(y_lhs_e.Var):
|
| 1269 | lhs = cast(Token, UP_lhs)
|
| 1270 | var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)
|
| 1271 |
|
| 1272 | # Note: this does not cover cases like
|
| 1273 | # setvar (a[0])[1] = v
|
| 1274 | # setvar (d.key).other = v
|
| 1275 | # This leaks into catching all typos statically, which may be
|
| 1276 | # possible if 'use' makes all names explicit.
|
| 1277 | elif case(y_lhs_e.Subscript):
|
| 1278 | lhs = cast(Subscript, UP_lhs)
|
| 1279 | if lhs.obj.tag() == expr_e.Var:
|
| 1280 | v = cast(expr.Var, lhs.obj)
|
| 1281 | var_checker.Check(kw_token.id, v.name, v.left)
|
| 1282 |
|
| 1283 | elif case(y_lhs_e.Attribute):
|
| 1284 | lhs = cast(Attribute, UP_lhs)
|
| 1285 | if lhs.obj.tag() == expr_e.Var:
|
| 1286 | v = cast(expr.Var, lhs.obj)
|
| 1287 | var_checker.Check(kw_token.id, v.name, v.left)
|
| 1288 |
|
| 1289 | # Let the CommandParser see the Op_Semi or Op_Newline.
|
| 1290 | self.buffered_word = last_token
|
| 1291 | self._SetNext(lex_mode_e.ShCommand) # always back to this
|
| 1292 | return enode
|
| 1293 |
|
| 1294 | def ParseBareDecl(self):
|
| 1295 | # type: () -> expr_t
|
| 1296 | """
|
| 1297 | x = {name: val}
|
| 1298 | """
|
| 1299 | self._SetNext(lex_mode_e.Expr)
|
| 1300 | self._GetToken()
|
| 1301 | enode, last_token = self.parse_ctx.ParseYshExpr(
|
| 1302 | self.lexer, grammar_nt.command_expr)
|
| 1303 | if last_token.id == Id.Op_RBrace:
|
| 1304 | last_token.id = Id.Lit_RBrace
|
| 1305 | self.buffered_word = last_token
|
| 1306 | self._SetNext(lex_mode_e.ShCommand)
|
| 1307 | return enode
|
| 1308 |
|
| 1309 | def ParseYshExprForCommand(self):
|
| 1310 | # type: () -> expr_t
|
| 1311 |
|
| 1312 | # Fudge for this case
|
| 1313 | # for x in(y) {
|
| 1314 | # versus
|
| 1315 | # for x in (y) {
|
| 1316 | #
|
| 1317 | # In the former case, ReadWord on 'in' puts the lexer past (.
|
| 1318 | # Also see LookPastSpace in CommandParers.
|
| 1319 | # A simpler solution would be nicer.
|
| 1320 |
|
| 1321 | if self.token_type == Id.Op_LParen:
|
| 1322 | self.lexer.MaybeUnreadOne()
|
| 1323 |
|
| 1324 | enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)
|
| 1325 |
|
| 1326 | self._SetNext(lex_mode_e.ShCommand)
|
| 1327 | return enode
|
| 1328 |
|
| 1329 | def ParseCommandExpr(self):
|
| 1330 | # type: () -> expr_t
|
| 1331 | """
|
| 1332 | = 1+2
|
| 1333 | """
|
| 1334 | enode, last_token = self.parse_ctx.ParseYshExpr(
|
| 1335 | self.lexer, grammar_nt.command_expr)
|
| 1336 |
|
| 1337 | # In some cases, such as the case statement, we expect *the lexer* to be
|
| 1338 | # pointing at the token right after the expression. But the expression
|
| 1339 | # parser must have read to the `last_token`. Unreading places the lexer
|
| 1340 | # back in the expected state. Ie:
|
| 1341 | #
|
| 1342 | # case (x) { case (x) {
|
| 1343 | # (else) { = x } (else) { = x }
|
| 1344 | # ^ The lexer is here ^ Unread to here
|
| 1345 | # } }
|
| 1346 | assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
|
| 1347 | Id.Op_RBrace), last_token
|
| 1348 | if last_token.id != Id.Eof_Real:
|
| 1349 | # Eof_Real is the only token we cannot unread
|
| 1350 | self.lexer.MaybeUnreadOne()
|
| 1351 |
|
| 1352 | return enode
|
| 1353 |
|
| 1354 | def ParseProc(self, node):
|
| 1355 | # type: (Proc) -> None
|
| 1356 |
|
| 1357 | # proc name-with-hyphens() must be accepted
|
| 1358 | self._SetNext(lex_mode_e.ShCommand)
|
| 1359 | self._GetToken()
|
| 1360 | # example: 'proc f[' gets you Lit_ArrayLhsOpen
|
| 1361 | if self.token_type != Id.Lit_Chars:
|
| 1362 | p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
|
| 1363 | self.cur_token)
|
| 1364 |
|
| 1365 | # TODO: validate this more. Disallow proc 123 { }, which isn't disallowed
|
| 1366 | # for shell functions. Similar to IsValidVarName().
|
| 1367 | node.name = self.cur_token
|
| 1368 |
|
| 1369 | last_token = self.parse_ctx.ParseProc(self.lexer, node)
|
| 1370 |
|
| 1371 | # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
|
| 1372 | assert last_token.id == Id.Op_LBrace
|
| 1373 | last_token.id = Id.Lit_LBrace
|
| 1374 | self.buffered_word = last_token
|
| 1375 |
|
| 1376 | self._SetNext(lex_mode_e.ShCommand)
|
| 1377 |
|
| 1378 | def ParseFunc(self, node):
|
| 1379 | # type: (Func) -> None
|
| 1380 | last_token = self.parse_ctx.ParseFunc(self.lexer, node)
|
| 1381 |
|
| 1382 | # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
|
| 1383 | assert last_token.id == Id.Op_LBrace
|
| 1384 | last_token.id = Id.Lit_LBrace
|
| 1385 | self.buffered_word = last_token
|
| 1386 |
|
| 1387 | self._SetNext(lex_mode_e.ShCommand)
|
| 1388 |
|
| 1389 | def ParseYshCasePattern(self):
|
| 1390 | # type: () -> Tuple[pat_t, Token]
|
| 1391 | pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
|
| 1392 | self.lexer)
|
| 1393 |
|
| 1394 | if last_token.id == Id.Op_LBrace:
|
| 1395 | last_token.id = Id.Lit_LBrace
|
| 1396 | self.buffered_word = last_token
|
| 1397 |
|
| 1398 | return pat, left_tok
|
| 1399 |
|
| 1400 | def NewlineOkForYshCase(self):
|
| 1401 | # type: () -> Id_t
|
| 1402 | """Check for optional newline and consume it.
|
| 1403 |
|
| 1404 | This is a special case of `_NewlineOk` which fixed some "off-by-one" issues
|
| 1405 | which crop up while parsing Ysh Case Arms. For more details, see
|
| 1406 | #oil-dev > Progress On YSH Case Grammar on zulip.
|
| 1407 |
|
| 1408 | Returns a token id which is filled with the choice of
|
| 1409 |
|
| 1410 | word { echo word }
|
| 1411 | (3) { echo expr }
|
| 1412 | /e/ { echo eggex }
|
| 1413 | } # right brace
|
| 1414 | """
|
| 1415 | while True:
|
| 1416 | next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)
|
| 1417 |
|
| 1418 | # Cannot lookahead past lines
|
| 1419 | if next_id == Id.Unknown_Tok:
|
| 1420 | self.lexer.MoveToNextLine()
|
| 1421 | continue
|
| 1422 |
|
| 1423 | next_kind = consts.GetKind(next_id)
|
| 1424 | if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
|
| 1425 | break
|
| 1426 |
|
| 1427 | self.lexer.Read(lex_mode_e.Expr)
|
| 1428 |
|
| 1429 | if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
|
| 1430 | self._SetNext(lex_mode_e.Expr) # Continue in expression mode
|
| 1431 | else:
|
| 1432 | # Consume the trailing Op_Newline
|
| 1433 | self._SetNext(lex_mode_e.ShCommand)
|
| 1434 | self._GetToken()
|
| 1435 |
|
| 1436 | return next_id
|
| 1437 |
|
| 1438 | def _ReadArithExpr(self, end_id):
|
| 1439 | # type: (Id_t) -> arith_expr_t
|
| 1440 | """Read and parse an arithmetic expression in various contexts.
|
| 1441 |
|
| 1442 | $(( 1+2 ))
|
| 1443 | (( a=1+2 ))
|
| 1444 | ${a[ 1+2 ]}
|
| 1445 | ${a : 1+2 : 1+2}
|
| 1446 |
|
| 1447 | See tests/arith-context.test.sh for ambiguous cases.
|
| 1448 |
|
| 1449 | ${a[a[0]]} is valid # VS_RBRACKET vs Id.Arith_RBracket
|
| 1450 |
|
| 1451 | ${s : a<b?0:1 : 1} # VS_COLON vs Id.Arith_Colon
|
| 1452 |
|
| 1453 | See the assertion in ArithParser.Parse() -- unexpected extra input.
|
| 1454 | """
|
| 1455 | # calls self.ReadWord(lex_mode_e.Arith)
|
| 1456 | anode = self.a_parser.Parse()
|
| 1457 | cur_id = self.a_parser.CurrentId()
|
| 1458 | if end_id != Id.Undefined_Tok and cur_id != end_id:
|
| 1459 | p_die(
|
| 1460 | 'Unexpected token after arithmetic expression (%s != %s)' %
|
| 1461 | (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
|
| 1462 | loc.Word(self.a_parser.cur_word))
|
| 1463 | return anode
|
| 1464 |
|
| 1465 | def _ReadArithSub(self):
|
| 1466 | # type: () -> word_part.ArithSub
|
| 1467 | """Read an arith substitution, which contains an arith expression, e.g.
|
| 1468 |
|
| 1469 | $((a + 1)).
|
| 1470 | """
|
| 1471 | left_tok = self.cur_token
|
| 1472 |
|
| 1473 | # The second one needs to be disambiguated in stuff like stuff like:
|
| 1474 | # $(echo $(( 1+2 )) )
|
| 1475 | self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
|
| 1476 |
|
| 1477 | # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
|
| 1478 | # could save the lexer/reader state here, and retry if the arithmetic parse
|
| 1479 | # fails. But we can almost always catch this at parse time. There could
|
| 1480 | # be some exceptions like:
|
| 1481 | # $((echo * foo)) # looks like multiplication
|
| 1482 | # $((echo / foo)) # looks like division
|
| 1483 |
|
| 1484 | # $(( )) is valid
|
| 1485 | anode = arith_expr.EmptyZero # type: arith_expr_t
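|
| | # (EmptyZero represents the empty expression, which evaluates to 0.)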
|
| 1486 |
|
| 1487 | self._NextNonSpace()
|
| 1488 | if self.token_type != Id.Arith_RParen:
|
| 1489 | anode = self._ReadArithExpr(Id.Arith_RParen)
|
| 1490 |
|
| 1491 | self._SetNext(lex_mode_e.ShCommand)
|
| 1492 |
|
| 1493 | # Ensure we get closing )
|
| 1494 | self._GetToken()
|
| 1495 | if self.token_type != Id.Right_DollarDParen:
|
| 1496 | p_die('Expected second ) to end arith sub', self.cur_token)
|
| 1497 |
|
| 1498 | right_tok = self.cur_token
|
| 1499 | return word_part.ArithSub(left_tok, anode, right_tok)
|
| 1500 |
|
| 1501 | def ReadDParen(self):
|
| 1502 | # type: () -> Tuple[arith_expr_t, Token]
|
| 1503 | """Read ((1+ 2)) -- command context.
|
| 1504 |
|
| 1505 | We're using the word parser because it's very similar to _ReadArithExpr
|
| 1506 | above.
|
| 1507 |
|
| 1508 | This also returns the terminating Id.Op_DRightParen token for location
|
| 1509 | info.
|
| 1510 | """
|
| 1511 | # (( )) is valid
|
| 1512 | anode = arith_expr.EmptyZero # type: arith_expr_t
|
| 1513 |
|
| 1514 | self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
|
| 1515 |
|
| 1516 | self._NextNonSpace()
|
| 1517 | if self.token_type != Id.Arith_RParen:
|
| 1518 | anode = self._ReadArithExpr(Id.Arith_RParen)
|
| 1519 |
|
| 1520 | self._SetNext(lex_mode_e.ShCommand)
|
| 1521 |
|
| 1522 | # Ensure we get the second )
|
| 1523 | self._GetToken()
|
| 1524 | right = self.cur_token
|
| 1525 | if right.id != Id.Op_DRightParen:
|
| 1526 | p_die('Expected second ) to end arith statement', right)
|
| 1527 |
|
| 1528 | self._SetNext(lex_mode_e.ShCommand)
|
| 1529 |
|
| 1530 | return anode, right
|
| 1531 |
|
| 1532 | def _NextNonSpace(self):
|
| 1533 | # type: () -> None
|
| 1534 | """Advance in lex_mode_e.Arith until non-space token.
|
| 1535 |
|
| 1536 | Same logic as _ReadWord, but used in
|
| 1537 | $(( ))
|
| 1538 | (( ))
|
| 1539 | for (( ))
|
| 1540 |
|
| 1541 | You can read self.token_type after this, without calling _GetToken.
|
| 1542 | """
|
| 1543 | while True:
|
| 1544 | self._SetNext(lex_mode_e.Arith)
|
| 1545 | self._GetToken()
|
| 1546 | if self.token_kind not in (Kind.Ignored, Kind.WS):
|
| 1547 | break
|
| 1548 |
|
| 1549 | def ReadForExpression(self):
|
| 1550 | # type: () -> command.ForExpr
|
| 1551 | """Read ((i=0; i<5; ++i)) -- part of command context."""
|
| 1552 | self._NextNonSpace() # skip over ((
|
| 1553 | cur_id = self.token_type # for end of arith expressions
|
| 1554 |
|
| 1555 | if cur_id == Id.Arith_Semi: # for (( ; i < 10; i++ ))
|
| 1556 | init_node = arith_expr.EmptyZero # type: arith_expr_t
|
| 1557 | else:
|
| 1558 | init_node = self.a_parser.Parse()
|
| 1559 | cur_id = self.a_parser.CurrentId()
|
| 1560 | self._NextNonSpace()
|
| 1561 |
|
| 1562 | # It's odd to keep track of both cur_id and self.token_type in this
|
| 1563 | # function, but it works, and is tested in 'test/parse_error.sh
|
| 1564 | # arith-integration'
|
| 1565 | if cur_id != Id.Arith_Semi: # for (( x=0 b; ... ))
|
| 1566 | p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
|
| 1567 |
|
| 1568 | self._GetToken()
|
| 1569 | cur_id = self.token_type
|
| 1570 |
|
| 1571 | if cur_id == Id.Arith_Semi: # for (( ; ; i++ ))
|
| 1572 | # empty condition is TRUE
|
| 1573 | cond_node = arith_expr.EmptyOne # type: arith_expr_t
|
| 1574 | else:
|
| 1575 | cond_node = self.a_parser.Parse()
|
| 1576 | cur_id = self.a_parser.CurrentId()
|
| 1577 |
|
| 1578 | if cur_id != Id.Arith_Semi: # for (( x=0; x<5 b ))
|
| 1579 | p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
|
| 1580 |
|
| 1581 | self._NextNonSpace()
|
| 1582 | if self.token_type == Id.Arith_RParen: # for (( ; ; ))
|
| 1583 | update_node = arith_expr.EmptyZero # type: arith_expr_t
|
| 1584 | else:
|
| 1585 | update_node = self._ReadArithExpr(Id.Arith_RParen)
|
| 1586 |
|
| 1587 | self._NextNonSpace()
|
| 1588 | if self.token_type != Id.Arith_RParen:
|
| 1589 | p_die('Expected ) to end for loop expression', self.cur_token)
|
| 1590 | self._SetNext(lex_mode_e.ShCommand)
|
| 1591 |
|
| 1592 | # redirects is None, will be assigned in CommandEvaluator
|
| 1593 | node = command.ForExpr.CreateNull()
|
| 1594 | node.init = init_node
|
| 1595 | node.cond = cond_node
|
| 1596 | node.update = update_node
|
| 1597 | return node
|
| 1598 |
|
| 1599 | def _ReadArrayLiteral(self):
|
| 1600 | # type: () -> word_part_t
|
| 1601 | """a=(1 2 3)
|
| 1602 |
|
| 1603 | TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
|
| 1604 |
|
| 1605 | We want:
|
| 1606 |
|
| 1607 | A=(['x']=1 ["x"]=2 [$x$y]=3)
|
| 1608 |
|
| 1609 | Maybe allow this as a literal string, since I think I've seen it before?
|
| 1610 | Or maybe force people to patch their scripts, so they learn the rule.
|
| 1611 |
|
| 1612 | A=([x]=4)
|
| 1613 |
|
| 1614 | Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
|
| 1615 | Maybe enforce that ALL of them have keys, or NONE of them do.
|
| 1616 | """
|
| 1617 | self._SetNext(lex_mode_e.ShCommand) # advance past (
|
| 1618 | self._GetToken()
|
| 1619 | if self.cur_token.id != Id.Op_LParen:
|
| 1620 | p_die('Expected ( after =', self.cur_token)
|
| 1621 | left_token = self.cur_token
|
| 1622 | right_token = None # type: Token
|
| 1623 |
|
| 1624 | # MUST use a new word parser (with same lexer).
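|
| | # (Presumably so this instance's lookahead state, like cur_token and
|
| | # buffered_word, isn't clobbered while reading the array words.)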
|
| 1625 | w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
|
| 1626 | words = [] # type: List[CompoundWord]
|
| 1627 | done = False
|
| 1628 | while not done:
|
| 1629 | w = w_parser.ReadWord(lex_mode_e.ShCommand)
|
| 1630 | with tagswitch(w) as case:
|
| 1631 | if case(word_e.Operator):
|
| 1632 | tok = cast(Token, w)
|
| 1633 | if tok.id == Id.Right_ShArrayLiteral:
|
| 1634 | right_token = tok
|
| 1635 | done = True # can't use break here
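|
| | # (Under mycpp, tagswitch is translated to a C++ switch, where break
|
| | # would exit the switch rather than the while loop.)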
|
| 1636 | # Unlike command parsing, array parsing allows embedded \n.
|
| 1637 | elif tok.id == Id.Op_Newline:
|
| 1638 | continue
|
| 1639 | else:
|
| 1640 | p_die('Unexpected token in array literal', loc.Word(w))
|
| 1641 |
|
| 1642 | elif case(word_e.Compound):
|
| 1643 | words.append(cast(CompoundWord, w))
|
| 1644 |
|
| 1645 | else:
|
| 1646 | raise AssertionError()
|
| 1647 |
|
| 1648 | if len(words) == 0: # a=() is empty indexed array
|
| 1649 | # Needed for type safety, doh
|
| 1650 | no_words = [] # type: List[word_t]
|
| 1651 | node = ShArrayLiteral(left_token, no_words, right_token)
|
| 1652 | return node
|
| 1653 |
|
| 1654 | pairs = [] # type: List[AssocPair]
|
| 1655 | # If the first one is a key/value pair, then the rest are assumed to be.
|
| 1656 | pair = word_.DetectAssocPair(words[0])
|
| 1657 | if pair:
|
| 1658 | pairs.append(pair)
|
| 1659 |
|
| 1660 | n = len(words)
|
| 1661 | for i in xrange(1, n):
|
| 1662 | w2 = words[i]
|
| 1663 | pair = word_.DetectAssocPair(w2)
|
| 1664 | if not pair:
|
| 1665 | p_die("Expected associative array pair", loc.Word(w2))
|
| 1666 |
|
| 1667 | pairs.append(pair)
|
| 1668 |
|
| 1669 | # invariant List?
|
| 1670 | return word_part.BashAssocLiteral(left_token, pairs, right_token)
|
| 1671 |
|
| 1672 | # Brace detection for arrays but NOT associative arrays
|
| 1673 | words2 = braces.BraceDetectAll(words)
|
| 1674 | words3 = word_.TildeDetectAll(words2)
|
| 1675 | return ShArrayLiteral(left_token, words3, right_token)
|
| 1676 |
|
| 1677 | def ParseProcCallArgs(self, start_symbol):
|
| 1678 | # type: (int) -> ArgList
|
| 1679 | """ json write (x) """
|
| 1680 | self.lexer.MaybeUnreadOne()
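|
| | # (Rewind one character, presumably so the expression parser re-reads
|
| | # the ( that was already consumed.)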
|
| 1681 |
|
| 1682 | arg_list = ArgList.CreateNull(alloc_lists=True)
|
| 1683 | arg_list.left = self.cur_token
|
| 1684 | self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
|
| 1685 | return arg_list
|
| 1686 |
|
| 1687 | def _MaybeReadWordPart(self, is_first, lex_mode, parts):
|
| 1688 | # type: (bool, lex_mode_t, List[word_part_t]) -> bool
|
| 1689 | """Helper for _ReadCompoundWord3."""
|
| 1690 | done = False
|
| 1691 |
|
| 1692 | if self.token_type == Id.Lit_EscapedChar:
|
| 1693 | tok = self.cur_token
|
| 1694 | assert tok.length == 2
|
| 1695 | ch = lexer.TokenSliceLeft(tok, 1)
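|
| | # ch is the character after the backslash, e.g. the n of \n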
|
| 1696 | if not self.parse_opts.parse_backslash():
|
| 1697 | if not pyutil.IsValidCharEscape(ch):
|
| 1698 | p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
|
| 1699 | self.cur_token)
|
| 1700 |
|
| 1701 | part = word_part.EscapedLiteral(self.cur_token,
|
| 1702 | ch) # type: word_part_t
|
| 1703 | else:
|
| 1704 | part = self.cur_token
|
| 1705 |
|
| 1706 | if is_first and self.token_type == Id.Lit_VarLike: # foo=
|
| 1707 | parts.append(part)
|
| 1708 | # Unfortunately it's awkward to pull the check for a=(1 2) up to
|
| 1709 | # _ReadWord.
|
| 1710 | next_id = self.lexer.LookPastSpace(lex_mode)
|
| 1711 | if next_id == Id.Op_LParen:
|
| 1712 | self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
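|
| | # The ) closing the literal will now be lexed as Right_ShArrayLiteral,
|
| | # which terminates the loop in _ReadArrayLiteral.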
|
| 1713 | part2 = self._ReadArrayLiteral()
|
| 1714 | parts.append(part2)
|
| 1715 |
|
| 1716 | # Array literal must be the last part of the word.
|
| 1717 | self._SetNext(lex_mode)
|
| 1718 | self._GetToken()
|
| 1719 | # EOF, whitespace, newline, Right_Subshell
|
| 1720 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
| 1721 | p_die('Unexpected token after array literal',
|
| 1722 | self.cur_token)
|
| 1723 | done = True
|
| 1724 |
|
| 1725 | elif (is_first and self.parse_opts.parse_at() and
|
| 1726 | self.token_type == Id.Lit_Splice):
|
| 1727 |
|
| 1728 | splice_tok = self.cur_token
|
| 1729 | part2 = word_part.Splice(splice_tok,
|
| 1730 | lexer.TokenSliceLeft(splice_tok, 1))
|
| 1731 |
|
| 1732 | parts.append(part2)
|
| 1733 |
|
| 1734 | # @words must be the last part of the word
|
| 1735 | self._SetNext(lex_mode)
|
| 1736 | self._GetToken()
|
| 1737 | # EOF, whitespace, newline, Right_Subshell
|
| 1738 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
| 1739 | p_die('Unexpected token after array splice', self.cur_token)
|
| 1740 | done = True
|
| 1741 |
|
| 1742 | elif (is_first and self.parse_opts.parse_at() and
|
| 1743 | self.token_type == Id.Lit_AtLBracket): # @[split(x)]
|
| 1744 | part2 = self._ReadExprSub(lex_mode_e.DQ)
|
| 1745 | parts.append(part2)
|
| 1746 |
|
| 1747 | # @[split(x)]
|
| 1748 | self._SetNext(lex_mode)
|
| 1749 | self._GetToken()
|
| 1750 | # EOF, whitespace, newline, Right_Subshell
|
| 1751 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
| 1752 | p_die('Unexpected token after Expr splice', self.cur_token)
|
| 1753 | done = True
|
| 1754 |
|
| 1755 | elif (is_first and self.parse_opts.parse_at() and
|
| 1756 | self.token_type == Id.Lit_AtLBraceDot):
|
| 1757 | p_die('TODO: @{.myproc builtin sub}', self.cur_token)
|
| 1758 |
|
| 1759 | elif (is_first and self.parse_opts.parse_at_all() and
|
| 1760 | self.token_type == Id.Lit_At):
|
| 1761 | # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
|
| 1762 | # at the beginning of a word to be reserved.
|
| 1763 |
|
| 1764 | # Although, should we relax 'echo @'? I'm tempted to have a shortcut for
|
| 1765 | # @_argv.
|
| 1766 | p_die('Literal @ starting a word must be quoted (parse_at_all)',
|
| 1767 | self.cur_token)
|
| 1768 |
|
| 1769 | else:
|
| 1770 | # not a literal with lookahead; append it
|
| 1771 | parts.append(part)
|
| 1772 |
|
| 1773 | return done
|
| 1774 |
|
| 1775 | def _ReadCompoundWord(self, lex_mode):
|
| 1776 | # type: (lex_mode_t) -> CompoundWord
|
| 1777 | return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
|
| 1778 |
|
| 1779 | def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
|
| 1780 | # type: (lex_mode_t, Id_t, bool) -> CompoundWord
|
| 1781 | """
|
| 1782 | Precondition: Looking at the first token of the first word part
|
| 1783 | Postcondition: Looking at the token after, e.g. space or operator
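|
| | Example: hi$x"bye" is a single CompoundWord with a literal part, a
|
| | SimpleVarSub part, and a DoubleQuoted part.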
|
| 1784 |
|
| 1785 | NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
|
| 1786 | could be an operator delimiting a compound word. Can we change lexer modes
|
| 1787 | and remove this special case?
|
| 1788 | """
|
| 1789 | w = CompoundWord([])
|
| 1790 | num_parts = 0
|
| 1791 | brace_count = 0
|
| 1792 | done = False
|
| 1793 | is_triple_quoted = None # type: Optional[BoolParamBox]
|
| 1794 |
|
| 1795 | while not done:
|
| 1796 | self._GetToken()
|
| 1797 |
|
| 1798 | allow_done = empty_ok or num_parts != 0
|
| 1799 | if allow_done and self.token_type == eof_type:
|
| 1800 | done = True # e.g. for ${foo//pat/replace}
|
| 1801 |
|
| 1802 | # Keywords like "for" are treated like literals
|
| 1803 | elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
|
| 1804 | Kind.ControlFlow, Kind.BoolUnary,
|
| 1805 | Kind.BoolBinary):
|
| 1806 |
|
| 1807 | # Syntax error for { and }
|
| 1808 | if self.token_type == Id.Lit_LBrace:
|
| 1809 | brace_count += 1
|
| 1810 | elif self.token_type == Id.Lit_RBrace:
|
| 1811 | brace_count -= 1
|
| 1812 | elif self.token_type == Id.Lit_Dollar:
|
| 1813 | if not self.parse_opts.parse_dollar():
|
| 1814 | if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
|
| 1815 | next_byte = self.lexer.ByteLookAhead()
|
| 1816 | # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
|
| 1817 | if next_byte == '/':
|
| 1818 | #log('next_byte %r', next_byte)
|
| 1819 | pass
|
| 1820 |
|
| 1821 | p_die('Literal $ should be quoted like \$',
|
| 1822 | self.cur_token)
|
| 1823 |
|
| 1824 | done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
|
| 1825 | w.parts)
|
| 1826 |
|
| 1827 | elif self.token_kind == Kind.VSub:
|
| 1828 | vsub_token = self.cur_token
|
| 1829 |
|
| 1830 | part = SimpleVarSub(vsub_token) # type: word_part_t
|
| 1831 | w.parts.append(part)
|
| 1832 |
|
| 1833 | elif self.token_kind == Kind.ExtGlob:
|
| 1834 | # If parse_at, we can take over @( to start @(seq 3)
|
| 1835 | # Users can still use extended globs, written like ,(*.py|*.sh)
|
| 1836 | if (self.parse_opts.parse_at() and
|
| 1837 | self.token_type == Id.ExtGlob_At and num_parts == 0):
|
| 1838 | cs_part = self._ReadCommandSub(Id.Left_AtParen,
|
| 1839 | d_quoted=False)
|
| 1840 | # RARE mutation of tok.id!
|
| 1841 | cs_part.left_token.id = Id.Left_AtParen
|
| 1842 | part = cs_part # for type safety
|
| 1843 |
|
| 1844 | # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
|
| 1845 | # a=(one two)x and @arrayfunc(3)x.
|
| 1846 | self._GetToken()
|
| 1847 | if self.token_kind not in KINDS_THAT_END_WORDS:
|
| 1848 | p_die('Unexpected token after @()', self.cur_token)
|
| 1849 | done = True
|
| 1850 |
|
| 1851 | else:
|
| 1852 | part = self._ReadExtGlob()
|
| 1853 | w.parts.append(part)
|
| 1854 |
|
| 1855 | elif self.token_kind == Kind.BashRegex:
|
| 1856 | if self.token_type == Id.BashRegex_LParen: # Opening (
|
| 1857 | part = self._ReadBashRegexGroup()
|
| 1858 | w.parts.append(part)
|
| 1859 | else:
|
| 1860 | assert self.token_type == Id.BashRegex_AllowedInParens
|
| 1861 | p_die('Invalid token in bash regex', self.cur_token)
|
| 1862 |
|
| 1863 | elif self.token_kind == Kind.Left:
|
| 1864 | try_triple_quote = (self.parse_opts.parse_triple_quote() and
|
| 1865 | lex_mode == lex_mode_e.ShCommand and
|
| 1866 | num_parts == 0)
|
| 1867 |
|
| 1868 | # Save allocation
|
| 1869 | if try_triple_quote:
|
| 1870 | is_triple_quoted = BoolParamBox(False)
|
| 1871 |
|
| 1872 | part = self._ReadUnquotedLeftParts(is_triple_quoted)
|
| 1873 | w.parts.append(part)
|
| 1874 |
|
| 1875 | # NOT done yet, will advance below
|
| 1876 | elif self.token_kind == Kind.Right:
|
| 1877 | # Still part of the word; will be done on the next iter.
|
| 1878 | if self.token_type == Id.Right_DoubleQuote:
|
| 1879 | pass
|
| 1880 | # Never happens, no PushHint for this case.
|
| 1881 | #elif self.token_type == Id.Right_DollarParen:
|
| 1882 | # pass
|
| 1883 | elif self.token_type == Id.Right_Subshell:
|
| 1884 | # LEXER HACK for (case x in x) ;; esac )
|
| 1885 | # Rewind before it's used
|
| 1886 | assert self.next_lex_mode == lex_mode_e.Undefined
|
| 1887 | if self.lexer.MaybeUnreadOne():
|
| 1888 | self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
|
| 1889 | self._SetNext(lex_mode)
|
| 1890 | done = True
|
| 1891 | else:
|
| 1892 | done = True
|
| 1893 |
|
| 1894 | elif self.token_kind == Kind.Ignored:
|
| 1895 | done = True
|
| 1896 |
|
| 1897 | else:
|
| 1898 | # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
|
| 1899 | # so to test for ESAC, we can read ) before getting a chance to
|
| 1900 | # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
|
| 1901 | # token and do it again.
|
| 1902 |
|
| 1903 | # We get Id.Op_RParen at top level: case x in x) ;; esac
|
| 1904 | # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
|
| 1905 | if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
|
| 1906 | # Rewind before it's used
|
| 1907 | assert self.next_lex_mode == lex_mode_e.Undefined
|
| 1908 | if self.lexer.MaybeUnreadOne():
|
| 1909 | if self.token_type == Id.Eof_RParen:
|
| 1910 | # Redo translation
|
| 1911 | self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
|
| 1912 | self._SetNext(lex_mode)
|
| 1913 |
|
| 1914 | done = True # anything we don't recognize means we're done
|
| 1915 |
|
| 1916 | if not done:
|
| 1917 | self._SetNext(lex_mode)
|
| 1918 | num_parts += 1
|
| 1919 |
|
| 1920 | if (self.parse_opts.parse_brace() and num_parts > 1 and
|
| 1921 | brace_count != 0):
|
| 1922 | # accept { and }, but not foo{
|
| 1923 | p_die(
|
| 1924 | 'Word has unbalanced { }. Maybe add a space or quote it like \{',
|
| 1925 | loc.Word(w))
|
| 1926 |
|
| 1927 | if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
|
| 1928 | p_die('Unexpected parts after triple quoted string',
|
| 1929 | loc.WordPart(w.parts[-1]))
|
| 1930 |
|
| 1931 | if 0:
|
| 1932 | from _devbuild.gen.syntax_asdl import word_part_str
|
| 1933 | word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
|
| 1934 | WORD_HIST[word_key] += 1
|
| 1935 | return w
|
| 1936 |
|
| 1937 | def _ReadArithWord(self):
|
| 1938 | # type: () -> Optional[word_t]
|
| 1939 | """ Helper for ReadArithWord() """
|
| 1940 | self._GetToken()
|
| 1941 |
|
| 1942 | if self.token_kind == Kind.Unknown:
|
| 1943 | # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
|
| 1944 | p_die(
|
| 1945 | 'Unexpected token while parsing arithmetic: %r' %
|
| 1946 | lexer.TokenVal(self.cur_token), self.cur_token)
|
| 1947 |
|
| 1948 | elif self.token_kind == Kind.Eof:
|
| 1949 | return self.cur_token
|
| 1950 |
|
| 1951 | elif self.token_kind == Kind.Ignored:
|
| 1952 | # Space should be ignored.
|
| 1953 | self._SetNext(lex_mode_e.Arith)
|
| 1954 | return None
|
| 1955 |
|
| 1956 | elif self.token_kind in (Kind.Arith, Kind.Right):
|
| 1957 | # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
|
| 1958 | self._SetNext(lex_mode_e.Arith)
|
| 1959 | return self.cur_token
|
| 1960 |
|
| 1961 | elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
|
| 1962 | return self._ReadCompoundWord(lex_mode_e.Arith)
|
| 1963 |
|
| 1964 | else:
|
| 1965 | raise AssertionError(self.cur_token)
|
| 1966 |
|
| 1967 | def _ReadWord(self, word_mode):
|
| 1968 | # type: (lex_mode_t) -> Optional[word_t]
|
| 1969 | """Helper function for ReadWord()."""
|
| 1970 |
|
| 1971 | # Change the pseudo lexer mode to a real lexer mode
|
| 1972 | if word_mode == lex_mode_e.ShCommandFakeBrack:
|
| 1973 | lex_mode = lex_mode_e.ShCommand
|
| 1974 | else:
|
| 1975 | lex_mode = word_mode
|
| 1976 |
|
| 1977 | self._GetToken()
|
| 1978 |
|
| 1979 | if self.token_kind == Kind.Eof:
|
| 1980 | # No advance
|
| 1981 | return self.cur_token
|
| 1982 |
|
| 1983 | # Allow Arith for ) at end of for loop?
|
| 1984 | elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
|
| 1985 | self._SetNext(lex_mode)
|
| 1986 |
|
| 1987 | # Newlines are complicated. See 3x2 matrix in the comment about
|
| 1988 | # self.multiline and self.newline_state above.
|
| 1989 | if self.token_type == Id.Op_Newline:
|
| 1990 | if self.multiline:
|
| 1991 | if self.newline_state > 1:
|
| 1992 | # This points at a blank line, but at least it gives the line number
|
| 1993 | p_die('Invalid blank line in multiline mode',
|
| 1994 | self.cur_token)
|
| 1995 | return None
|
| 1996 |
|
| 1997 | if self.returned_newline: # skip
|
| 1998 | return None
|
| 1999 |
|
| 2000 | return self.cur_token
|
| 2001 |
|
| 2002 | elif self.token_kind == Kind.Right:
|
| 2003 | if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
|
| 2004 | Id.Right_CasePat,
|
| 2005 | Id.Right_ShArrayLiteral):
|
| 2006 | raise AssertionError(self.cur_token)
|
| 2007 |
|
| 2008 | self._SetNext(lex_mode)
|
| 2009 | return self.cur_token
|
| 2010 |
|
| 2011 | elif self.token_kind in (Kind.Ignored, Kind.WS):
|
| 2012 | self._SetNext(lex_mode)
|
| 2013 | return None
|
| 2014 |
|
| 2015 | else:
|
| 2016 | assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
|
| 2017 | Kind.Left, Kind.KW, Kind.ControlFlow,
|
| 2018 | Kind.BoolUnary, Kind.BoolBinary,
|
| 2019 | Kind.ExtGlob,
|
| 2020 | Kind.BashRegex), 'Unhandled token kind'
|
| 2021 |
|
| 2022 | if (word_mode == lex_mode_e.ShCommandFakeBrack and
|
| 2023 | self.parse_opts.parse_bracket() and
|
| 2024 | self.token_type == Id.Lit_LBracket):
|
| 2025 | # Change [ from Kind.Lit -> Kind.Op
|
| 2026 | # So CommandParser can treat
|
| 2027 | # assert [42 === x]
|
| 2028 | # like
|
| 2029 | # json write (x)
|
| 2030 | bracket_word = self.cur_token
|
| 2031 | bracket_word.id = Id.Op_LBracket
|
| 2032 |
|
| 2033 | self._SetNext(lex_mode)
|
| 2034 | return bracket_word
|
| 2035 |
|
| 2036 | # We're beginning a word. If we see Id.Lit_Pound, change to
|
| 2037 | # lex_mode_e.Comment and read until end of line.
|
| 2038 | if self.token_type == Id.Lit_Pound:
|
| 2039 | self._SetNext(lex_mode_e.Comment)
|
| 2040 | self._GetToken()
|
| 2041 |
|
| 2042 | # NOTE: The # could be the last character in the file. It can't be
|
| 2043 | # Eof_{RParen,Backtick} because #) and #` are comments.
|
| 2044 | assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
|
| 2045 | self.cur_token
|
| 2046 |
|
| 2047 | # The next iteration will go into Kind.Ignored and set lex state to
|
| 2048 | # lex_mode_e.ShCommand/etc.
|
| 2049 | return None # tell ReadWord() to try again after comment
|
| 2050 |
|
| 2051 | elif self.token_type == Id.Lit_TPound: ### doc comment
|
| 2052 | self._SetNext(lex_mode_e.Comment)
|
| 2053 | self._GetToken()
|
| 2054 |
|
| 2055 | if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
|
| 2056 | return self.cur_token
|
| 2057 |
|
| 2058 | return None # tell ReadWord() to try again after comment
|
| 2059 |
|
| 2060 | else:
|
| 2061 | # r'' u'' b''
|
| 2062 | if (self.token_type == Id.Lit_Chars and
|
| 2063 | self.lexer.LookAheadOne(
|
| 2064 | lex_mode_e.ShCommand) == Id.Left_SingleQuote):
|
| 2065 |
|
| 2066 | # When shopt -s parse_raw_string:
|
| 2067 | # echo r'hi' is like echo 'hi'
|
| 2068 | #
|
| 2069 | # echo u'\u{3bc}' b'\yff' works
|
| 2070 |
|
| 2071 | tok = self.cur_token
|
| 2072 | if self.parse_opts.parse_ysh_string():
|
| 2073 | if lexer.TokenEquals(tok, 'r'):
|
| 2074 | left_id = Id.Left_RSingleQuote
|
| 2075 | elif lexer.TokenEquals(tok, 'u'):
|
| 2076 | left_id = Id.Left_USingleQuote
|
| 2077 | elif lexer.TokenEquals(tok, 'b'):
|
| 2078 | left_id = Id.Left_BSingleQuote
|
| 2079 | else:
|
| 2080 | left_id = Id.Undefined_Tok
|
| 2081 |
|
| 2082 | if left_id != Id.Undefined_Tok:
|
| 2083 | # skip the r, and then 'foo' will be read as normal
|
| 2084 | self._SetNext(lex_mode_e.ShCommand)
|
| 2085 |
|
| 2086 | self._GetToken()
|
| 2087 | assert self.token_type == Id.Left_SingleQuote, self.token_type
|
| 2088 |
|
| 2089 | # Read the word in a different lexer mode
|
| 2090 | return self._ReadYshSingleQuoted(left_id)
|
| 2091 |
|
| 2092 | return self._ReadCompoundWord(lex_mode)
|
| 2093 |
|
| 2094 | def ParseVarRef(self):
|
| 2095 | # type: () -> BracedVarSub
|
| 2096 | """DYNAMIC parsing of what's inside ${!ref}
|
| 2097 |
|
| 2098 | # Same as VarOf production
|
| 2099 | VarRefExpr = VarOf EOF
|
| 2100 | """
|
| 2101 | self._SetNext(lex_mode_e.VSub_1)
|
| 2102 |
|
| 2103 | self._GetToken()
|
| 2104 | if self.token_kind != Kind.VSub:
|
| 2105 | p_die('Expected var name', self.cur_token)
|
| 2106 |
|
| 2107 | part = self._ParseVarOf()
|
| 2108 | # NOTE: no ${ } means no part.left and part.right
|
| 2109 | part.left = part.token # cheat to make test pass
|
| 2110 | part.right = part.token
|
| 2111 |
|
| 2112 | self._GetToken()
|
| 2113 | if self.token_type != Id.Eof_Real:
|
| 2114 | p_die('Expected end of var ref expression', self.cur_token)
|
| 2115 | return part
|
| 2116 |
|
| 2117 | def LookPastSpace(self):
|
| 2118 | # type: () -> Id_t
|
| 2119 | """Look ahead to the next token.
|
| 2120 |
|
| 2121 | For the CommandParser to recognize
|
| 2122 | array= (1 2 3)
|
| 2123 | YSH for ( versus bash for ((
|
| 2124 | YSH if ( versus if test
|
| 2125 | YSH while ( versus while test
|
| 2126 | YSH bare assignment 'grep =' versus 'grep foo'
|
| 2127 | """
|
| 2128 | assert self.token_type != Id.Undefined_Tok
|
| 2129 | if self.cur_token.id == Id.WS_Space:
|
| 2130 | id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
|
| 2131 | else:
|
| 2132 | id_ = self.cur_token.id
|
| 2133 | return id_
|
| 2134 |
|
| 2135 | def LookAheadFuncParens(self):
|
| 2136 | # type: () -> bool
|
| 2137 | """Special lookahead for f( ) { echo hi; } to check for ( )"""
|
| 2138 | assert self.token_type != Id.Undefined_Tok
|
| 2139 |
|
| 2140 | # We have to handle 2 cases because we buffer a token
|
| 2141 | if self.cur_token.id == Id.Op_LParen: # saw funcname(
|
| 2142 | return self.lexer.LookAheadFuncParens(1) # go back one char
|
| 2143 |
|
| 2144 | elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
|
| 2145 | return self.lexer.LookAheadFuncParens(0)
|
| 2146 |
|
| 2147 | else:
|
| 2148 | return False
|
| 2149 |
|
| 2150 | def ReadWord(self, word_mode):
|
| 2151 | # type: (lex_mode_t) -> word_t
|
| 2152 | """Read the next word, using the given lexer mode.
|
| 2153 |
|
| 2154 | This is a stateful wrapper for the stateless _ReadWord function.
|
| 2155 | """
|
| 2156 | assert word_mode in (lex_mode_e.ShCommand,
|
| 2157 | lex_mode_e.ShCommandFakeBrack,
|
| 2158 | lex_mode_e.DBracket, lex_mode_e.BashRegex)
|
| 2159 |
|
| 2160 | if self.buffered_word: # For integration with pgen2
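|
| | # (Set by ParseYshCasePattern above, which may read one token too many.)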
|
| 2161 | w = self.buffered_word
|
| 2162 | self.buffered_word = None
|
| 2163 | else:
|
| 2164 | while True:
|
| 2165 | w = self._ReadWord(word_mode)
|
| 2166 | if w is not None:
|
| 2167 | break
|
| 2168 |
|
| 2169 | self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
|
| 2170 | return w
|
| 2171 |
|
| 2172 | def ReadArithWord(self):
|
| 2173 | # type: () -> word_t
|
| 2174 | while True:
|
| 2175 | w = self._ReadArithWord()
|
| 2176 | if w is not None:
|
| 2177 | break
|
| 2178 | return w
|
| 2179 |
|
| 2180 | def ReadHereDocBody(self, parts):
|
| 2181 | # type: (List[word_part_t]) -> None
|
| 2182 | """
|
| 2183 | A here doc is like a double quoted context, except " isn't special.
|
| 2184 | """
|
| 2185 | self._ReadLikeDQ(None, False, parts)
|
| 2186 | # Returns nothing
|
| 2187 |
|
| 2188 | def ReadForPlugin(self):
|
| 2189 | # type: () -> CompoundWord
|
| 2190 | """For $PS1, $PS4, etc.
|
| 2191 |
|
| 2192 | This is just like reading a here doc line. "\n" is allowed, as
|
| 2193 | well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
|
| 2194 | """
|
| 2195 | w = CompoundWord([])
|
| 2196 | self._ReadLikeDQ(None, False, w.parts)
|
| 2197 | return w
|
| 2198 |
|
| 2199 | def EmitDocToken(self, b):
|
| 2200 | # type: (bool) -> None
|
| 2201 | self.emit_doc_token = b
|
| 2202 |
|
| 2203 | def Multiline(self, b):
|
| 2204 | # type: (bool) -> None
|
| 2205 | self.multiline = b
|
| 2206 |
|
| 2207 |
|
| 2208 | if 0:
|
| 2209 | import collections
|
| 2210 | WORD_HIST = collections.Counter()
|
| 2211 |
|
| 2212 | # vim: sw=4
|