| 1 | """
 | 
| 2 | word.py - Utility functions for words, e.g. treating them as "tokens".
 | 
| 3 | """
 | 
| 4 | 
 | 
| 5 | from _devbuild.gen.id_kind_asdl import Id, Kind, Id_t, Kind_t
 | 
| 6 | from _devbuild.gen.syntax_asdl import (
 | 
| 7 |     Token,
 | 
| 8 |     CompoundWord,
 | 
| 9 |     DoubleQuoted,
 | 
| 10 |     SingleQuoted,
 | 
| 11 |     word,
 | 
| 12 |     word_e,
 | 
| 13 |     word_t,
 | 
| 14 |     word_str,
 | 
| 15 |     word_part,
 | 
| 16 |     word_part_t,
 | 
| 17 |     word_part_e,
 | 
| 18 |     AssocPair,
 | 
| 19 | )
 | 
| 20 | from frontend import consts
 | 
| 21 | from frontend import lexer
 | 
| 22 | from mycpp import mylib
 | 
| 23 | from mycpp.mylib import tagswitch, log
 | 
| 24 | 
 | 
| 25 | from typing import Tuple, Optional, List, Any, cast, TYPE_CHECKING
 | 
| 26 | if TYPE_CHECKING:
 | 
| 27 |     from osh.word_parse import WordParser
 | 
| 28 | 
 | 
_ = log  # "use" log so it isn't flagged as an unused import; kept for ad-hoc debugging
 | 
| 30 | 
 | 
| 31 | 
 | 
def LiteralId(p):
    # type: (word_part_t) -> Id_t
    """Return the Id of a WordPart that is a single literal token.

    Returns Id.Undefined_Tok for any non-literal part; that sentinel compares
    unequal to every real Id.  Used to recognize Id.KW_For, Id.RBrace, etc.
    """
    if p.tag() != word_part_e.Literal:
        return Id.Undefined_Tok  # unequal to any other Id
    tok = cast(Token, p)
    return tok.id
 | 
| 43 | 
 | 
| 44 | 
 | 
def _EvalWordPart(part):
    # type: (word_part_t) -> Tuple[bool, str, bool]
    """Evaluate a WordPart at PARSE TIME.

    Used for:

    1. here doc delimiters
    2. function names
    3. for loop variable names
    4. Compiling constant regex words at parse time
    5. a special case for ${a////c} to see if we got a leading slash in the
    pattern.

    Returns:
      3-tuple of
        ok: bool, success.  If there are parts that can't be statically
          evaluated, then we return false.
        value: a string (not Value)
        quoted: whether any part of the word was quoted
    """
    UP_part = part  # keep the base-typed reference for casting (mycpp pattern)
    with tagswitch(part) as case:
        if case(word_part_e.Literal):
            tok = cast(Token, UP_part)
            # Weird performance issue: if we change this to lexer.LazyStr(),
            # the parser slows down, e.g. on configure-coreutils from 805 B
            # irefs to ~830 B.  The real issue is that we should avoid calling
            # this from CommandParser - for the Hay node.
            return True, lexer.TokenVal(tok), False
            #return True, lexer.LazyStr(tok), False

        elif case(word_part_e.EscapedLiteral):
            part = cast(word_part.EscapedLiteral, UP_part)
            if mylib.PYTHON:
                # Sanity checks that run only in the pure-Python build
                val = lexer.TokenVal(part.token)
                assert len(val) == 2, val  # e.g. \*
                assert val[0] == '\\'
            # Drop the leading backslash, e.g. \* -> *
            s = lexer.TokenSliceLeft(part.token, 1)
            return True, s, True

        elif case(word_part_e.SingleQuoted):
            part = cast(SingleQuoted, UP_part)
            return True, part.sval, True

        elif case(word_part_e.DoubleQuoted):
            part = cast(DoubleQuoted, UP_part)
            strs = []  # type: List[str]
            # Recursively evaluate the parts inside "..."; all must succeed
            for p in part.parts:
                ok, s, _ = _EvalWordPart(p)
                if not ok:
                    return False, '', True
                strs.append(s)

            return True, ''.join(strs), True  # At least one part was quoted!

        elif case(word_part_e.ShArrayLiteral, word_part_e.BashAssocLiteral,
                  word_part_e.ZshVarSub, word_part_e.CommandSub,
                  word_part_e.SimpleVarSub, word_part_e.BracedVarSub,
                  word_part_e.TildeSub, word_part_e.ArithSub,
                  word_part_e.ExtGlob, word_part_e.Splice,
                  word_part_e.ExprSub):
            # Substitutions, globs, etc. can never be evaluated at parse time
            return False, '', False

        else:
            raise AssertionError(part.tag())
 | 
| 110 | 
 | 
| 111 | 
 | 
def FastStrEval(w):
    # type: (CompoundWord) -> Optional[str]
    """
    Detects common cases that can be evaluated to a string cheaply:

    (1) CompoundWord([LiteralPart(Id.LitChars)])
        For echo -e, test x -lt 0, etc.
    (2) single quoted word like 'foo'

    Returns the string value, or None if the word is not one of the fast
    cases and needs full evaluation.

    Other patterns we could detect are:
    (1) "foo"
    (2) "$var" and "${var}" - I think these are very common in OSH code (but not YSH)
        - I think val_ops.Stringify() can handle all the errors
    """
    if len(w.parts) != 1:
        return None

    part0 = w.parts[0]
    UP_part0 = part0  # keep the base-typed reference for casting (mycpp pattern)
    with tagswitch(part0) as case:
        if case(word_part_e.Literal):
            part0 = cast(Token, UP_part0)

            if part0.id in (Id.Lit_Chars, Id.Lit_LBracket, Id.Lit_RBracket):
                # Could add more tokens in this case
                #   e.g. + is Lit_Other, and it's a Token in 'expr'
                #   Right now it's Lit_Chars (e.g. ls -l) and [ and ] because I
                #   know those are common
                #   { } are not as common
                return lexer.LazyStr(part0)

            else:
                # e.g. Id.Lit_Star needs to be glob expanded
                # TODO: Consider moving Id.Lit_Star etc. to Kind.MaybeGlob?
                return None

        elif case(word_part_e.SingleQuoted):
            part0 = cast(SingleQuoted, UP_part0)
            # TODO: SingleQuoted should have lazy (str? sval) field
            # This would only affect multi-line strings though?
            return part0.sval

        else:
            # e.g. DoubleQuoted can't be optimized to a string, because it
            # might have "$@" and such
            return None
 | 
| 158 | 
 | 
| 159 | 
 | 
def StaticEval(UP_w):
    # type: (word_t) -> Tuple[bool, str, bool]
    """Evaluate a Compound word at PARSE TIME.

    Returns (ok, value, quoted), like _EvalWordPart(), but for a whole word:
    ok is False if any part can't be statically evaluated, and quoted is True
    if at least one part was quoted.
    """
    quoted = False

    # A non-Compound word can't be statically evaluated,
    # e.g. for ( instead of for (( is a token word
    if UP_w.tag() != word_e.Compound:
        return False, '', quoted

    w = cast(CompoundWord, UP_w)

    pieces = []  # type: List[str]
    for p in w.parts:
        ok, s, part_quoted = _EvalWordPart(p)
        if not ok:
            return False, '', quoted
        if part_quoted:
            quoted = True  # at least one part was quoted
        pieces.append(s)

    return True, ''.join(pieces), quoted
 | 
| 181 | 
 | 
| 182 | 
 | 
| 183 | # From bash, general.c, unquoted_tilde_word():
 | 
| 184 | # POSIX.2, 3.6.1:  A tilde-prefix consists of an unquoted tilde character at
 | 
| 185 | # the beginning of the word, followed by all of the characters preceding the
 | 
| 186 | # first unquoted slash in the word, or all the characters in the word if there
 | 
| 187 | # is no slash...If none of the characters in the tilde-prefix are quoted, the
 | 
# characters in the tilde-prefix following the tilde shall be treated as a
 | 
| 189 | # possible login name.
 | 
| 190 | #define TILDE_END(c)    ((c) == '\0' || (c) == '/' || (c) == ':')
 | 
| 191 | #
 | 
| 192 | # So an unquoted tilde can ALWAYS start a new lex mode?  You respect quotes and
 | 
| 193 | # substitutions.
 | 
| 194 | #
 | 
| 195 | # We only detect ~Lit_Chars and split.  So we might as well just write a regex.
 | 
| 196 | 
 | 
| 197 | 
 | 
def TildeDetect(UP_w):
    # type: (word_t) -> Optional[CompoundWord]
    """Detect tilde expansion in a word.

    The word might begin with a Literal that needs to be turned into a
    TildeSub.  (It depends on whether the second token begins with slash.)

    If so, it returns a new word; otherwise it returns None.

    NOTE:
    - The regex for Lit_TildeLike could be expanded.  Right now it's
      conservative, like Lit_Chars without the /.
    - It's possible to write this in a mutating style, since only the first
      token is changed.  But note that we CANNOT know this during lexing.
    """
    if UP_w.tag() == word_e.Compound:
        return TildeDetect2(cast(CompoundWord, UP_w))

    # BracedTree can't be tilde expanded
    return None
 | 
| 219 | 
 | 
| 220 | 
 | 
def TildeDetect2(w):
    # type: (CompoundWord) -> Optional[CompoundWord]
    """If tilde sub is detected, returns a new CompoundWord, else None.

    Accepts CompoundWord, not word_t.  After brace expansion, we know we have a
    List[CompoundWord].

    Tilde detection:

    YES:
        ~       ~/
        ~bob    ~bob/

    NO:
        ~bob#    ~bob#/
        ~bob$x
        ~$x

    Pattern to match (all must be word_part_e.Literal):

        Lit_Tilde Lit_Chars? (Lit_Slash | %end)
    """
    n = len(w.parts)
    if n == 0:  # ${a-} has no parts
        return None

    first = w.parts[0]
    if LiteralId(first) != Id.Lit_Tilde:
        return None  # e.g. $x is not TildeSub

    tilde_tok = cast(Token, first)
    result = []  # type: List[word_part_t]

    if n == 1:  # bare ~
        result.append(word_part.TildeSub(tilde_tok, None, None))
        return CompoundWord(result)

    second_id = LiteralId(w.parts[1])
    if second_id == Id.Lit_Slash:  # ~/
        result.append(word_part.TildeSub(tilde_tok, None, None))
        result.extend(w.parts[1:])
        return CompoundWord(result)

    if second_id != Id.Lit_Chars:
        return None  # e.g. ~$x is not TildeSub

    name_tok = cast(Token, w.parts[1])

    if n == 2:  # ~foo
        result.append(
            word_part.TildeSub(tilde_tok, name_tok, lexer.TokenVal(name_tok)))
        return CompoundWord(result)

    if LiteralId(w.parts[2]) != Id.Lit_Slash:
        return None  # ~foo$x is not TildeSub

    result.append(
        word_part.TildeSub(tilde_tok, name_tok, lexer.TokenVal(name_tok)))
    result.extend(w.parts[2:])
    return CompoundWord(result)
 | 
| 281 | 
 | 
| 282 | 
 | 
def TildeDetectAssign(w):
    # type: (CompoundWord) -> None
    """Detects multiple tilde sub, like a=~:~/src:~bob

    MUTATES its argument.

    Pattern to match (all must be word_part_e.Literal):

        Lit_Tilde Lit_Chars? (Lit_Slash | Lit_Colon | %end)
    """
    parts = w.parts

    # Bail out EARLY if there are no ~ at all
    has_tilde = False
    for part in parts:
        if LiteralId(part) == Id.Lit_Tilde:
            has_tilde = True
            break
    if not has_tilde:
        return  # Avoid further work and allocations

    # Avoid IndexError, since we have to look ahead up to 2 tokens
    # (these two sentinels are popped again below)
    parts.append(None)
    parts.append(None)

    new_parts = []  # type: List[word_part_t]

    tilde_could_be_next = True  # true at first, and true after :

    i = 0
    n = len(parts)

    # Each iteration consumes 1, 2, or 3 parts
    while i < n:
        part0 = parts[i]
        if part0 is None:  # reached the sentinel padding
            break

        #log('i = %d', i)
        #log('part0 %s', part0)

        # Skip tilde in middle of word, like a=foo~bar
        if tilde_could_be_next and LiteralId(part0) == Id.Lit_Tilde:
            # If ~ ends the string, we have
            part1 = parts[i + 1]
            part2 = parts[i + 2]

            tok0 = cast(Token, part0)

            if part1 is None:  # x=foo:~
                new_parts.append(word_part.TildeSub(tok0, None, None))
                break  # at end

            id1 = LiteralId(part1)

            if id1 in (Id.Lit_Slash, Id.Lit_Colon):  # x=foo:~/ or x=foo:~:
                new_parts.append(word_part.TildeSub(tok0, None, None))
                new_parts.append(part1)
                i += 2
                continue

            if id1 != Id.Lit_Chars:
                new_parts.append(part0)  # unchanged
                new_parts.append(part1)  # ...
                i += 2
                continue  # x=foo:~$x is not tilde sub

            tok1 = cast(Token, part1)

            if part2 is None:  # x=foo:~foo
                # consume both
                new_parts.append(
                    word_part.TildeSub(tok0, tok1, lexer.TokenVal(tok1)))
                break  # at end

            id2 = LiteralId(part2)
            if id2 not in (Id.Lit_Slash, Id.Lit_Colon):  # x=foo:~foo$x
                new_parts.append(part0)  # unchanged
                new_parts.append(part1)  # ...
                new_parts.append(part2)  # ...
                i += 3
                continue

            new_parts.append(
                word_part.TildeSub(tok0, tok1, lexer.TokenVal(tok1)))
            new_parts.append(part2)
            i += 3

            # A trailing : means another tilde sub may follow
            tilde_could_be_next = (id2 == Id.Lit_Colon)

        else:
            new_parts.append(part0)
            i += 1

            tilde_could_be_next = (LiteralId(part0) == Id.Lit_Colon)

    # Remove the two sentinels appended above
    parts.pop()
    parts.pop()

    # Mutate argument
    w.parts = new_parts
 | 
| 383 | 
 | 
| 384 | 
 | 
def TildeDetectAll(words):
    # type: (List[word_t]) -> List[word_t]
    """Apply TildeDetect() to each word; keep words without tilde sub as-is."""
    result = []  # type: List[word_t]
    for w in words:
        detected = TildeDetect(w)
        result.append(detected if detected else w)
    return result
 | 
| 395 | 
 | 
| 396 | 
 | 
def HasArrayPart(w):
    # type: (CompoundWord) -> bool
    """Whether any part of the word is a ShArrayLiteral.  Used in cmd_parse."""
    for p in w.parts:
        if p.tag() == word_part_e.ShArrayLiteral:
            return True
    return False
 | 
| 404 | 
 | 
| 405 | 
 | 
def ShFunctionName(w):
    # type: (CompoundWord) -> str
    """Returns a valid shell function name, or the empty string.

    TODO: Maybe use this regex to validate:

    FUNCTION_NAME_RE = r'[^{}\[\]=]*'

    Bash is very lenient, but that would disallow confusing characters, for
    better error messages on a[x]=(), etc.
    """
    ok, name, quoted = StaticEval(w)
    # Names with quotes are not valid function names
    if ok and not quoted:
        return name
    return ''
 | 
| 422 | 
 | 
| 423 | 
 | 
def LooksLikeArithVar(UP_w):
    # type: (word_t) -> Optional[Token]
    """Return a token if this word looks like an arith var, else None.

    NOTE: This can't be combined with DetectShAssignment because VarLike and
    ArithVarLike must be different tokens.  Otherwise _ReadCompoundWord will be
    confused between array assignments foo=(1 2) and function calls foo(1, 2).
    """
    if UP_w.tag() != word_e.Compound:
        return None

    w = cast(CompoundWord, UP_w)
    if len(w.parts) == 1:
        only = w.parts[0]
        if LiteralId(only) == Id.Lit_ArithVarLike:
            return cast(Token, only)

    return None
 | 
| 444 | 
 | 
| 445 | 
 | 
def IsVarLike(w):
    # type: (CompoundWord) -> bool
    """Tests whether a word looks like FOO=bar.

    This is a quick test for the command parser to distinguish:

    func() { echo hi; }
    func=(1 2 3)
    """
    if len(w.parts) == 0:  # empty word can't be an assignment
        return False
    first_id = LiteralId(w.parts[0])
    return first_id == Id.Lit_VarLike
 | 
| 459 | 
 | 
| 460 | 
 | 
def DetectShAssignment(w):
    # type: (CompoundWord) -> Tuple[Optional[Token], Optional[Token], int]
    """Detects whether a word looks like FOO=bar or FOO[x]=bar.

    Returns:
      left_token or None   # Lit_VarLike, Lit_ArrayLhsOpen, or None if it's not an
                           # assignment
      close_token,         # Lit_ArrayLhsClose if it was detected, or None
      part_offset          # where to start the value word, 0 if not an assignment

    Cases:

    s=1
    s+=1
    s[x]=1
    s[x]+=1

    a=()
    a+=()
    a[x]=()
    a[x]+=()  # We parse this (as bash does), but it's never valid because arrays
              # can't be nested.
    """
    no_token = None  # type: Optional[Token]

    n = len(w.parts)
    if n == 0:
        return no_token, no_token, 0

    UP_part0 = w.parts[0]
    id0 = LiteralId(UP_part0)
    if id0 == Id.Lit_VarLike:
        tok = cast(Token, UP_part0)
        return tok, no_token, 1  # everything after first token is the value

    if id0 == Id.Lit_ArrayLhsOpen:
        tok0 = cast(Token, UP_part0)
        # NOTE that a[]=x should be an error.  We don't want to silently decay.
        if n < 2:
            return no_token, no_token, 0
        # Scan forward for the closing ]= ; everything after it is the value
        for i in xrange(1, n):
            UP_part = w.parts[i]
            if LiteralId(UP_part) == Id.Lit_ArrayLhsClose:
                tok_close = cast(Token, UP_part)
                return tok0, tok_close, i + 1

    # Nothing detected.  Could be 'foobar' or a[x+1+2/' without the closing ].
    return no_token, no_token, 0
 | 
| 509 | 
 | 
| 510 | 
 | 
def DetectAssocPair(w):
    # type: (CompoundWord) -> Optional[AssocPair]
    """Like DetectShAssignment, but for A=(['k']=v ['k2']=v)

    The key and the value are both strings.  So we just pick out
    word_part.  Unlike a[k]=v, A=([k]=v) is NOT ambiguous, because the
    [k] syntax is only used for associative array literals, as opposed
    to indexed array literals.

    Returns:
      AssocPair with key and value as CompoundWord, or None if the word
      doesn't look like [k]=v.
    """
    parts = w.parts
    # Guard against zero-part words (e.g. from ${a-}), which would otherwise
    # raise IndexError below
    if len(parts) == 0:
        return None

    if LiteralId(parts[0]) != Id.Lit_LBracket:
        return None

    n = len(parts)
    for i in xrange(n):
        id_ = LiteralId(parts[i])
        if id_ == Id.Lit_ArrayLhsClose:  # ]=
            # e.g. if we have [$x$y]=$a$b
            key = CompoundWord(parts[1:i])  # $x$y
            value = CompoundWord(parts[i + 1:])  # $a$b
            return AssocPair(key, value)

    return None  # no ]= found
 | 
| 536 | 
 | 
| 537 | 
 | 
def IsControlFlow(w):
    # type: (CompoundWord) -> Tuple[Kind_t, Optional[Token]]
    """Tests if a word is a control flow word.

    Returns (Kind.ControlFlow, token) on a match, and (Kind.Undefined, None)
    otherwise.
    """
    no_token = None  # type: Optional[Token]

    if len(w.parts) == 1:
        part0 = w.parts[0]
        token_type = LiteralId(part0)
        if token_type != Id.Undefined_Tok:
            token_kind = consts.GetKind(token_type)
            if token_kind == Kind.ControlFlow:
                return token_kind, cast(Token, part0)

    return Kind.Undefined, no_token
 | 
| 556 | 
 | 
| 557 | 
 | 
def LiteralToken(UP_w):
    # type: (word_t) -> Optional[Token]
    """If a word consists of a single literal token, return it; else None."""
    # We're casting here because this function is called by the CommandParser
    # for var, setvar, '...', etc.  It's easier to cast in one place.
    assert UP_w.tag() == word_e.Compound, UP_w
    w = cast(CompoundWord, UP_w)

    if len(w.parts) == 1:
        only = w.parts[0]
        if only.tag() == word_part_e.Literal:
            return cast(Token, only)

    return None
 | 
| 577 | 
 | 
| 578 | 
 | 
def BraceToken(UP_w):
    # type: (word_t) -> Optional[Token]
    """If a word has Id.Lit_LBrace or Lit_RBrace, return a Token.

    This is a special case for osh/cmd_parse.py

    The WordParser changes Id.Op_LBrace from ExprParser into Id.Lit_LBrace, so we
    may get a token, not a word.
    """
    with tagswitch(UP_w) as case:
        if case(word_e.Operator):
            tok = cast(Token, UP_w)
            assert tok.id in (Id.Lit_LBrace, Id.Lit_RBrace), tok
            return tok

        elif case(word_e.Compound):
            w = cast(CompoundWord, UP_w)
            return LiteralToken(w)

        else:
            # Include the tag in the error, consistent with BoolId() and
            # CommandId(), to make unexpected word variants easier to debug
            raise AssertionError(UP_w.tag())
 | 
| 600 | 
 | 
| 601 | 
 | 
def AsKeywordToken(UP_w):
    # type: (word_t) -> Token
    """Given a CompoundWord containing just a keyword, return its first token."""
    assert UP_w.tag() == word_e.Compound, UP_w
    w = cast(CompoundWord, UP_w)

    first = w.parts[0]
    assert first.tag() == word_part_e.Literal, first
    kw_tok = cast(Token, first)
    assert consts.GetKind(kw_tok.id) == Kind.KW, kw_tok
    return kw_tok
 | 
| 614 | 
 | 
| 615 | 
 | 
def AsOperatorToken(word):
    # type: (word_t) -> Token
    """For a word that IS an operator (word.Token), return that token.

    This must only be called on a word which is known to be an operator
    (word.Token).

    NOTE(review): the parameter name shadows the 'word' module imported from
    syntax_asdl; renaming it would be clearer, but the name is part of the
    public signature, so it's left as-is.
    """
    assert word.tag() == word_e.Operator, word
    return cast(Token, word)
 | 
| 625 | 
 | 
| 626 | 
 | 
| 627 | #
 | 
| 628 | # Polymorphic between Token and Compound
 | 
| 629 | #
 | 
| 630 | 
 | 
| 631 | 
 | 
def ArithId(w):
    # type: (word_t) -> Id_t
    """Used by shell arithmetic parsing.

    Returns the operator token's Id, or Id.Word_Compound for compound words.
    """
    if w.tag() != word_e.Operator:
        assert isinstance(w, CompoundWord)
        return Id.Word_Compound

    return cast(Token, w).id
 | 
| 641 | 
 | 
| 642 | 
 | 
def BoolId(w):
    # type: (word_t) -> Id_t
    """Id for boolean expression parsing.

    Returns the Id of operator words and special boolean "tokens", or
    Id.Word_Compound for generic words that act as operands.
    """
    UP_w = w  # keep the base-typed reference for casting (mycpp pattern)
    with tagswitch(w) as case:
        if case(word_e.String):  # for test/[
            w = cast(word.String, UP_w)
            return w.id

        elif case(word_e.Operator):
            tok = cast(Token, UP_w)
            return tok.id

        elif case(word_e.Compound):
            w = cast(CompoundWord, UP_w)

            if len(w.parts) != 1:
                return Id.Word_Compound

            token_type = LiteralId(w.parts[0])
            if token_type == Id.Undefined_Tok:
                return Id.Word_Compound  # It's a regular word

            # This is outside the BoolUnary/BoolBinary namespace, but works the same.
            if token_type in (Id.KW_Bang, Id.Lit_DRightBracket):
                return token_type  # special boolean "tokens"

            token_kind = consts.GetKind(token_type)
            if token_kind in (Kind.BoolUnary, Kind.BoolBinary):
                return token_type  # boolean operators

            return Id.Word_Compound

        else:
            # I think Empty never happens in this context?
            raise AssertionError(w.tag())
 | 
| 678 | 
 | 
| 679 | 
 | 
def CommandId(w):
    # type: (word_t) -> Id_t
    """Used by CommandParser.

    Returns the token's Id for operator words; certain keyword-like Ids
    ({ } = ... and Kind.KW) for single-part literal words; and
    Id.Word_Compound for everything else.
    """
    UP_w = w  # keep the base-typed reference for casting (mycpp pattern)
    with tagswitch(w) as case:
        if case(word_e.Operator):
            tok = cast(Token, UP_w)
            return tok.id

        elif case(word_e.Compound):
            w = cast(CompoundWord, UP_w)

            # Fine-grained categorization of SINGLE literal parts
            if len(w.parts) != 1:
                return Id.Word_Compound  # generic word

            token_type = LiteralId(w.parts[0])
            if token_type == Id.Undefined_Tok:
                return Id.Word_Compound  # Not Kind.Lit, generic word

            if token_type in (Id.Lit_LBrace, Id.Lit_RBrace, Id.Lit_Equals,
                              Id.Lit_TDot):
                # - { } are for YSH braces
                # - = is for the = keyword
                # - ... is to start multiline mode
                #
                # TODO: Should we use Op_{LBrace,RBrace} and Kind.Op when
                # parse_brace?  Lit_Equals could be KW_Equals?
                return token_type

            token_kind = consts.GetKind(token_type)
            if token_kind == Kind.KW:
                return token_type  # Id.KW_Var, etc.

            return Id.Word_Compound  # generic word

        else:
            raise AssertionError(w.tag())
 | 
| 718 | 
 | 
| 719 | 
 | 
def CommandKind(w):
    # type: (word_t) -> Kind_t
    """The CommandKind is for coarse-grained decisions in the CommandParser.

    NOTE: This is inconsistent with CommandId(), because we never return
    Kind.KW or Kind.Lit.  But the CommandParser is easier to write this way.

    For example, these are valid redirects to a Kind.Word, and the parser
    checks:

      echo hi > =
      echo hi > {

    Invalid:
      echo hi > (
      echo hi > ;
    """
    if w.tag() != word_e.Operator:
        # Compound words (and everything else) are coarsely Kind.Word
        return Kind.Word

    # CommandParser uses Kind.Redir, Kind.Op, Kind.Eof, etc.
    op = cast(Token, w)
    return consts.GetKind(op.id)
 | 
| 743 | 
 | 
| 744 | 
 | 
| 745 | # Stubs for converting RHS of assignment to expression mode.
 | 
| 746 | # For osh2oil.py
 | 
def IsVarSub(w):
    # type: (word_t) -> bool
    """Return whether it's any var sub, or a double quoted one.

    Stub: always returns False.  Kept for osh2oil.py's conversion of
    assignment RHS to expression mode.
    """
    return False
 | 
| 751 | 
 | 
| 752 | 
 | 
| 753 | # Doesn't translate with mycpp because of dynamic %
 | 
def ErrorWord(error_str):
    # type: (str) -> CompoundWord
    """Wrap an error message string in a CompoundWord."""
    err_tok = lexer.DummyToken(Id.Lit_Chars, error_str)
    return CompoundWord([err_tok])
 | 
| 758 | 
 | 
| 759 | 
 | 
def Pretty(w):
    # type: (word_t) -> str
    """Return a string to display to the user."""
    if w.tag() != word_e.String:
        return word_str(w.tag())  # tag name

    s_word = cast(word.String, w)
    if s_word.id == Id.Eof_Real:
        return 'EOF'
    return repr(s_word.s)
 | 
| 772 | 
 | 
| 773 | 
 | 
class ctx_EmitDocToken(object):
    """For doc comments.

    Context manager that enables doc-token emission in the WordParser on
    entry, and disables it again on exit.
    """

    def __init__(self, w_parser):
        # type: (WordParser) -> None
        w_parser.EmitDocToken(True)
        self.w_parser = w_parser

    def __enter__(self):
        # type: () -> None
        pass  # state was already flipped in __init__

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.w_parser.EmitDocToken(False)
 | 
| 789 | 
 | 
| 790 | 
 | 
class ctx_Multiline(object):
    """For multiline commands.

    Context manager that enables multiline mode in the WordParser on entry,
    and disables it again on exit.
    """

    def __init__(self, w_parser):
        # type: (WordParser) -> None
        w_parser.Multiline(True)
        self.w_parser = w_parser

    def __enter__(self):
        # type: () -> None
        pass  # state was already flipped in __init__

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.w_parser.Multiline(False)
 |