OILS / osh / word_parse.py
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VS_ARG_DQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """VarOf ':' ArithExpr (':' ArithExpr )?"""
        self._SetNext(lex_mode_e.Arith)
        self._GetToken()
        cur_id = self.token_type  # e.g. Id.Arith_Colon

        if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
            # no beginning specified
            begin = None  # type: Optional[arith_expr_t]
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id == Id.Arith_RBrace:
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        # Id.Arith_Colon is a pun for Id.VOp2_Colon
        if cur_id == Id.Arith_Colon:
            self._SetNext(lex_mode_e.Arith)
            length = self._ReadArithExpr(Id.Arith_RBrace)
            return suffix_op.Slice(begin, length)

        p_die("Expected : or } in slice", self.cur_token)
        raise AssertionError()  # for MyPy
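
    # A few inputs the slice grammar above accepts (added for illustration);
    # begin and/or length may be omitted, and each is a full arith expression:
    #
    #   ${s:1}        # begin only
    #   ${s:1:2}      # begin and length
    #   ${s::2}       # begin omitted -- the leading : puns as Id.Arith_Colon
    #   ${s:x+1:y+2}  # arbitrary expressions, parsed by _ReadArithExpr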

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
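
    # Illustrative inputs for the PatSub grammar above (added); replace_mode
    # distinguishes the anchored and global forms:
    #
    #   ${x/pat/r}   # no replace_mode token: replace the first match
    #   ${x//pat/r}  # replace_mode '/': replace all matches
    #   ${x/#pat/r}  # replace_mode '#': anchor the pattern at the start
    #   ${x/%pat/r}  # replace_mode '%': anchor the pattern at the end
    #   ${x/pat}     # missing replacement becomes rhs_word.Empty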

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
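
    # For illustration (added): the two bracket_op variants produced above
    # correspond to
    #
    #   ${a[@]}  ${a[*]}    -> bracket_op.WholeArray
    #   ${a[0]}  ${a[i+1]}  -> bracket_op.ArrayIndex, via _ReadArithExpr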

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                    Id.Right_DollarBrace, True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                 # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER     # no subscript allowed, none of these are
                                 # arrays; ${@[1]} doesn't work, even though
                                 # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '  # ${x|html} and ${x %.3f};
                                 # SPACE is the operator, not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub  = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
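
    # Sketch of the end-token counting above (added; not from the original
    # source).  For u''' strings, expected_end_tokens is 3, so the loop only
    # terminates after three CONSECUTIVE Id.Right_SingleQuote tokens:
    #
    #   u'''
    #   isn't        # the lone ' counts 1; the next token resets it to 0
    #   '''          # three in a row terminate; all three are then popped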

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
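
    # How the read_word flag above makes empty arms explicit (added for
    # illustration):
    #
    #   @(foo|bar)  -> arms [foo, bar]
    #   @(foo|)     -> arms [foo, <empty CompoundWord>]
    #   @(||)       -> arms [<empty>, <empty>, <empty>]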

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                               Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also handles ${foo%%a b c} -- the argument is treated as double
        quoted until you hit the closing brace.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
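
    # Why backticks are collected as strings and re-parsed (added note): the
    # same character opens and closes ``, so nested command subs need escaped
    # inner backticks, while $() nests without escaping:
    #
    #   echo `echo \`hostname\``    # Backtick_Quoted tokens strip the \
    #   echo $(echo $(hostname))    # handled by MakeParserForCommandSub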

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #          ^ The lexer is here                  ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word  { echo word }
          (3)   { echo expr }
          /e/   { echo eggex }
        }       # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        # TODO: This could be DQ or Arith too
        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        # Two right parens break the Id.Eof_RParen scheme
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating `Op_DRightParen` token for use as
        location tracking.
        """
        # The second one needs to be disambiguated in stuff like:
        # TODO: Be consistent with ReadForExpression below and use
        # lex_mode_e.Arith?  Then you can get rid of this.
        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        self._GetToken()
        right = self.cur_token
        if self.token_type != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', self.cur_token)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _SetNextNonSpace(self):
        # type: () -> None
        """Same logic as _ReadWord, but for ReadForExpression."""
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._SetNextNonSpace()  # skip over ((

        self._GetToken()
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = None  # type: Optional[arith_expr_t]
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            cond_node = None  # type: Optional[arith_expr_t]
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_RParen:  # for (( ; ; ))
            update_node = None  # type: Optional[arith_expr_t]
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)
        self._SetNextNonSpace()

        self._GetToken()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer,
                                                 self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
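
    # Examples for the branches above (added for illustration):
    #
    #   a=(1 2 3)            -> ShArrayLiteral
    #   a=()                 -> ShArrayLiteral with no words
    #   A=([k1]=v1 [k2]=v2)  -> BashAssocLiteral; the first word decides, and
    #                           then EVERY word must be a [key]=value pair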
1656
1657 def ParseProcCallArgs(self, start_symbol):
1658 # type: (int) -> ArgList
1659 """ json write (x) """
1660 self.lexer.MaybeUnreadOne()
1661
1662 arg_list = ArgList.CreateNull(alloc_lists=True)
1663 arg_list.left = self.cur_token
1664 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1665 return arg_list
1666
1667 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1668 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1669 """Helper for _ReadCompoundWord3."""
1670 done = False
1671
1672 if self.token_type == Id.Lit_EscapedChar:
1673 tok = self.cur_token
1674 assert tok.length == 2
1675 ch = lexer.TokenSliceLeft(tok, 1)
1676 if not self.parse_opts.parse_backslash():
1677 if not pyutil.IsValidCharEscape(ch):
1678 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1679 self.cur_token)
1680
1681 part = word_part.EscapedLiteral(self.cur_token,
1682 ch) # type: word_part_t
1683 else:
1684 part = self.cur_token
1685
1686 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1687 parts.append(part)
1688 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1689 # _ReadWord.
1690 next_id = self.lexer.LookPastSpace(lex_mode)
1691 if next_id == Id.Op_LParen:
1692 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1693 part2 = self._ReadArrayLiteral()
1694 parts.append(part2)
1695
1696 # Array literal must be the last part of the word.
1697 self._SetNext(lex_mode)
1698 self._GetToken()
1699 # EOF, whitespace, newline, Right_Subshell
1700 if self.token_kind not in KINDS_THAT_END_WORDS:
1701 p_die('Unexpected token after array literal',
1702 self.cur_token)
1703 done = True
1704
1705 elif (is_first and self.parse_opts.parse_at() and
1706 self.token_type == Id.Lit_Splice):
1707
1708 splice_tok = self.cur_token
1709 part2 = word_part.Splice(splice_tok,
1710 lexer.TokenSliceLeft(splice_tok, 1))
1711
1712 parts.append(part2)
1713
1714 # @words must be the last part of the word
1715 self._SetNext(lex_mode)
1716 self._GetToken()
1717 # EOF, whitespace, newline, Right_Subshell
1718 if self.token_kind not in KINDS_THAT_END_WORDS:
1719 p_die('Unexpected token after array splice', self.cur_token)
1720 done = True
1721
1722 elif (is_first and self.parse_opts.parse_at() and
1723 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1724 part2 = self._ReadExprSub(lex_mode_e.DQ)
1725 parts.append(part2)
1726
1727 # @[split(x)]
1728 self._SetNext(lex_mode)
1729 self._GetToken()
1730 # EOF, whitespace, newline, Right_Subshell
1731 if self.token_kind not in KINDS_THAT_END_WORDS:
1732 p_die('Unexpected token after Expr splice', self.cur_token)
1733 done = True
1734
1735 elif (is_first and self.parse_opts.parse_at() and
1736 self.token_type == Id.Lit_AtLBraceDot):
1737 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1738
1739 elif (is_first and self.parse_opts.parse_at_all() and
1740 self.token_type == Id.Lit_At):
1741 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1742 # at the beginning of a word to be reserved.
1743
1744            # Although, should we relax 'echo @'? It's tempting to have a
1745            # shortcut for @_argv.
1746 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1747 self.cur_token)
1748
1749 else:
1750            # A plain literal part that needs no lookahead; just append it
1751 parts.append(part)
1752
1753 return done
1754
1755 def _ReadCompoundWord(self, lex_mode):
1756 # type: (lex_mode_t) -> CompoundWord
1757 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1758
1759 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1760 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1761 """
1762 Precondition: Looking at the first token of the first word part
1763 Postcondition: Looking at the token after, e.g. space or operator
1764
1765 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1766 could be an operator delimiting a compound word. Can we change lexer modes
1767 and remove this special case?
1768 """
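        # Example of eof_type: when reading 'pat' in ${foo//pat/replace},
        # eof_type is the slash token, so / ends the compound word there
        # even though Lit_Slash is normally a literal.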
1769 w = CompoundWord([])
1770 num_parts = 0
1771 brace_count = 0
1772 done = False
1773 is_triple_quoted = None # type: Optional[BoolParamBox]
1774
1775 while not done:
1776 self._GetToken()
1777
1778 allow_done = empty_ok or num_parts != 0
1779 if allow_done and self.token_type == eof_type:
1780 done = True # e.g. for ${foo//pat/replace}
1781
1782 # Keywords like "for" are treated like literals
1783 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1784 Kind.ControlFlow, Kind.BoolUnary,
1785 Kind.BoolBinary):
1786
1787                # Count { and } here; unbalanced braces are an error below
1788 if self.token_type == Id.Lit_LBrace:
1789 brace_count += 1
1790 elif self.token_type == Id.Lit_RBrace:
1791 brace_count -= 1
1792 elif self.token_type == Id.Lit_Dollar:
1793 if not self.parse_opts.parse_dollar():
1794 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1795 next_byte = self.lexer.ByteLookAhead()
1796 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1797 if next_byte == '/':
1798 #log('next_byte %r', next_byte)
1799 pass
1800
1801 p_die('Literal $ should be quoted like \$',
1802 self.cur_token)
1803
1804 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1805 w.parts)
1806
1807 elif self.token_kind == Kind.VSub:
1808 vsub_token = self.cur_token
1809
1810 part = SimpleVarSub(vsub_token) # type: word_part_t
1811 w.parts.append(part)
1812
1813 elif self.token_kind == Kind.ExtGlob:
1814 # If parse_at, we can take over @( to start @(seq 3)
1815                # Users can also use ,(*.py|*.sh)
1816 if (self.parse_opts.parse_at() and
1817 self.token_type == Id.ExtGlob_At and num_parts == 0):
1818 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1819 d_quoted=False)
1820 # RARE mutation of tok.id!
1821 cs_part.left_token.id = Id.Left_AtParen
1822 part = cs_part # for type safety
1823
1824 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1825 # a=(one two)x and @arrayfunc(3)x.
1826 self._GetToken()
1827 if self.token_kind not in KINDS_THAT_END_WORDS:
1828 p_die('Unexpected token after @()', self.cur_token)
1829 done = True
1830
1831 else:
1832 part = self._ReadExtGlob()
1833 w.parts.append(part)
1834
1835 elif self.token_kind == Kind.BashRegex:
1836 if self.token_type == Id.BashRegex_LParen: # Opening (
1837 part = self._ReadBashRegexGroup()
1838 w.parts.append(part)
1839 else:
1840 assert self.token_type == Id.BashRegex_AllowedInParens
1841 p_die('Invalid token in bash regex', self.cur_token)
1842
1843 elif self.token_kind == Kind.Left:
1844 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1845 lex_mode == lex_mode_e.ShCommand and
1846 num_parts == 0)
1847
1848 # Save allocation
1849 if try_triple_quote:
1850 is_triple_quoted = BoolParamBox(False)
1851
1852 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1853 w.parts.append(part)
1854
1855 # NOT done yet, will advance below
1856 elif self.token_kind == Kind.Right:
1857 # Still part of the word; will be done on the next iter.
1858 if self.token_type == Id.Right_DoubleQuote:
1859 pass
1860 # Never happens, no PushHint for this case.
1861 #elif self.token_type == Id.Right_DollarParen:
1862 # pass
1863 elif self.token_type == Id.Right_Subshell:
1864 # LEXER HACK for (case x in x) ;; esac )
1865 # Rewind before it's used
1866 assert self.next_lex_mode == lex_mode_e.Undefined
1867 if self.lexer.MaybeUnreadOne():
1868 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1869 self._SetNext(lex_mode)
1870 done = True
1871 else:
1872 done = True
1873
1874 elif self.token_kind == Kind.Ignored:
1875 done = True
1876
1877 else:
1878                # LEXER HACK for an unbalanced case clause. 'case foo in esac' is
1879                # valid, so while testing for esac we may read ) before getting a
1880                # chance to PushHint(Id.Op_RParen, Id.Right_CasePat). So here we
1881                # unread one token and do it again.
1882
1883 # We get Id.Op_RParen at top level: case x in x) ;; esac
1884 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1885 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1886 # Rewind before it's used
1887 assert self.next_lex_mode == lex_mode_e.Undefined
1888 if self.lexer.MaybeUnreadOne():
1889 if self.token_type == Id.Eof_RParen:
1890 # Redo translation
1891 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1892 self._SetNext(lex_mode)
1893
1894 done = True # anything we don't recognize means we're done
1895
1896 if not done:
1897 self._SetNext(lex_mode)
1898 num_parts += 1
1899
1900 if (self.parse_opts.parse_brace() and num_parts > 1 and
1901 brace_count != 0):
1902 # accept { and }, but not foo{
1903 p_die(
1904 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1905 loc.Word(w))
1906
1907 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1908 p_die('Unexpected parts after triple quoted string',
1909 loc.WordPart(w.parts[-1]))
1910
1911 if 0:
1912 from _devbuild.gen.syntax_asdl import word_part_str
1913 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1914 WORD_HIST[word_key] += 1
1915 return w
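    # Brace-balance examples for _ReadCompoundWord3 above (sketch):
    #   echo {a,b}    OK: balanced braces
    #   echo \{       OK: escaped, doesn't count toward brace_count
    #   echo foo{     error: multi-part word with unbalanced { }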
1916
1917 def _ReadArithWord(self):
1918 # type: () -> Optional[word_t]
1919 """ Helper for ReadArithWord() """
1920 self._GetToken()
1921
1922 if self.token_kind == Kind.Unknown:
1923 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1924 p_die(
1925 'Unexpected token while parsing arithmetic: %r' %
1926 lexer.TokenVal(self.cur_token), self.cur_token)
1927
1928 elif self.token_kind == Kind.Eof:
1929 return self.cur_token
1930
1931 elif self.token_kind == Kind.Ignored:
1932 # Space should be ignored.
1933 self._SetNext(lex_mode_e.Arith)
1934 return None
1935
1936 elif self.token_kind in (Kind.Arith, Kind.Right):
1937 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1938 self._SetNext(lex_mode_e.Arith)
1939 return self.cur_token
1940
1941 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1942 return self._ReadCompoundWord(lex_mode_e.Arith)
1943
1944 else:
1945 raise AssertionError(self.cur_token)
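    # Example for _ReadArithWord above: in $(( x + ${y} )), 'x' and '${y}'
    # are read as compound words in lex_mode_e.Arith, '+' comes back as a
    # Kind.Arith token, and spaces hit Kind.Ignored, returning None so the
    # caller retries.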
1946
1947 def _ReadWord(self, word_mode):
1948 # type: (lex_mode_t) -> Optional[word_t]
1949 """Helper function for ReadWord()."""
1950
1951 # Change the pseudo lexer mode to a real lexer mode
1952 if word_mode == lex_mode_e.ShCommandFakeBrack:
1953 lex_mode = lex_mode_e.ShCommand
1954 else:
1955 lex_mode = word_mode
1956
1957 self._GetToken()
1958
1959 if self.token_kind == Kind.Eof:
1960 # No advance
1961 return self.cur_token
1962
1963 # Allow Arith for ) at end of for loop?
1964 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1965 self._SetNext(lex_mode)
1966
1967 # Newlines are complicated. See 3x2 matrix in the comment about
1968 # self.multiline and self.newline_state above.
1969 if self.token_type == Id.Op_Newline:
1970 if self.multiline:
1971 if self.newline_state > 1:
1972 # This points at a blank line, but at least it gives the line number
1973 p_die('Invalid blank line in multiline mode',
1974 self.cur_token)
1975 return None
1976
1977 if self.returned_newline: # skip
1978 return None
1979
1980 return self.cur_token
1981
1982 elif self.token_kind == Kind.Right:
1983 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
1984 Id.Right_CasePat,
1985 Id.Right_ShArrayLiteral):
1986 raise AssertionError(self.cur_token)
1987
1988 self._SetNext(lex_mode)
1989 return self.cur_token
1990
1991 elif self.token_kind in (Kind.Ignored, Kind.WS):
1992 self._SetNext(lex_mode)
1993 return None
1994
1995 else:
1996 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
1997 Kind.Left, Kind.KW, Kind.ControlFlow,
1998 Kind.BoolUnary, Kind.BoolBinary,
1999 Kind.ExtGlob,
2000 Kind.BashRegex), 'Unhandled token kind'
2001
2002 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2003 self.parse_opts.parse_bracket() and
2004 self.token_type == Id.Lit_LBracket):
2005 # Change [ from Kind.Lit -> Kind.Op
2006 # So CommandParser can treat
2007 # assert [42 === x]
2008 # like
2009 # json write (x)
2010 bracket_word = self.cur_token
2011 bracket_word.id = Id.Op_LBracket
2012
2013 self._SetNext(lex_mode)
2014 return bracket_word
2015
2016 # We're beginning a word. If we see Id.Lit_Pound, change to
2017 # lex_mode_e.Comment and read until end of line.
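            # For example, 'echo hi # tail' starts a comment, but in
            # 'echo hi#there' the # stays literal, because we only reach
            # this point at the START of a word.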
2018 if self.token_type == Id.Lit_Pound:
2019 self._SetNext(lex_mode_e.Comment)
2020 self._GetToken()
2021
2022 # NOTE: The # could be the last character in the file. It can't be
2023 # Eof_{RParen,Backtick} because #) and #` are comments.
2024 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2025 self.cur_token
2026
2027 # The next iteration will go into Kind.Ignored and set lex state to
2028 # lex_mode_e.ShCommand/etc.
2029 return None # tell ReadWord() to try again after comment
2030
2031 elif self.token_type == Id.Lit_TPound: ### doc comment
2032 self._SetNext(lex_mode_e.Comment)
2033 self._GetToken()
2034
2035 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2036 return self.cur_token
2037
2038 return None # tell ReadWord() to try again after comment
2039
2040 else:
2041 # r'' u'' b''
2042 if (self.token_type == Id.Lit_Chars and
2043 self.lexer.LookAheadOne(
2044 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2045
2046 # When shopt -s parse_raw_string:
2047 # echo r'hi' is like echo 'hi'
2048 #
2049 # echo u'\u{3bc}' b'\yff' works
2050
2051 tok = self.cur_token
2052 if self.parse_opts.parse_ysh_string():
2053 if lexer.TokenEquals(tok, 'r'):
2054 left_id = Id.Left_RSingleQuote
2055 elif lexer.TokenEquals(tok, 'u'):
2056 left_id = Id.Left_USingleQuote
2057 elif lexer.TokenEquals(tok, 'b'):
2058 left_id = Id.Left_BSingleQuote
2059 else:
2060 left_id = Id.Undefined_Tok
2061
2062 if left_id != Id.Undefined_Tok:
2063 # skip the r, and then 'foo' will be read as normal
2064 self._SetNext(lex_mode_e.ShCommand)
2065
2066 self._GetToken()
2067 assert self.token_type == Id.Left_SingleQuote, self.token_type
2068
2069 # Read the word in a different lexer mode
2070 return self._ReadYshSingleQuoted(left_id)
2071
2072 return self._ReadCompoundWord(lex_mode)
2073
2074 def ParseVarRef(self):
2075 # type: () -> BracedVarSub
2076 """DYNAMIC parsing of what's inside ${!ref}
2077
2078 # Same as VarOf production
2079 VarRefExpr = VarOf EOF
2080 """
2081 self._SetNext(lex_mode_e.VSub_1)
2082
2083 self._GetToken()
2084 if self.token_kind != Kind.VSub:
2085 p_die('Expected var name', self.cur_token)
2086
2087 part = self._ParseVarOf()
2088 # NOTE: no ${ } means no part.left and part.right
2089 part.left = part.token # cheat to make test pass
2090 part.right = part.token
2091
2092 self._GetToken()
2093 if self.token_type != Id.Eof_Real:
2094 p_die('Expected end of var ref expression', self.cur_token)
2095 return part
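    # Sketch of the dynamic case ParseVarRef above serves (bash indirection):
    #
    #   ref=PATH
    #   echo ${!ref}   # the VALUE of ref, 'PATH', is re-parsed at runtime
    #                  # as the VarOf production, then expanded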
2096
2097 def LookPastSpace(self):
2098 # type: () -> Id_t
2099        """Return the Id of the current token, looking past any buffered space.
2100
2101 For the CommandParser to recognize
2102 array= (1 2 3)
2103 YSH for ( versus bash for ((
2104 YSH if ( versus if test
2105 YSH while ( versus while test
2106 YSH bare assignment 'grep =' versus 'grep foo'
2107 """
2108 assert self.token_type != Id.Undefined_Tok
2109 if self.cur_token.id == Id.WS_Space:
2110 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2111 else:
2112 id_ = self.cur_token.id
2113 return id_
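    # Example for LookPastSpace above: after reading 'array=', the buffered
    # token may be WS_Space; looking past it yields Id.Op_LParen for
    # 'array= (1 2 3)', so the CommandParser can recognize that form.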
2114
2115 def LookAheadFuncParens(self):
2116 # type: () -> bool
2117 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2118 assert self.token_type != Id.Undefined_Tok
2119
2120 # We have to handle 2 cases because we buffer a token
2121 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2122 return self.lexer.LookAheadFuncParens(1) # go back one char
2123
2124 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2125 return self.lexer.LookAheadFuncParens(0)
2126
2127 else:
2128 return False
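    # Example for LookAheadFuncParens above: for 'f( ) {' the buffered token
    # is the '(' itself, so we look ahead from one char back; for 'f ( ) {'
    # the buffered token is the space, so we look ahead from here.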
2129
2130 def ReadWord(self, word_mode):
2131 # type: (lex_mode_t) -> word_t
2132 """Read the next word, using the given lexer mode.
2133
2134 This is a stateful wrapper for the stateless _ReadWord function.
2135 """
2136 assert word_mode in (lex_mode_e.ShCommand,
2137 lex_mode_e.ShCommandFakeBrack,
2138 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2139
2140 if self.buffered_word: # For integration with pgen2
2141 w = self.buffered_word
2142 self.buffered_word = None
2143 else:
2144 while True:
2145 w = self._ReadWord(word_mode)
2146 if w is not None:
2147 break
2148
2149 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2150 return w
2151
2152 def ReadArithWord(self):
2153 # type: () -> word_t
2154 while True:
2155 w = self._ReadArithWord()
2156 if w is not None:
2157 break
2158 return w
2159
2160 def ReadHereDocBody(self, parts):
2161 # type: (List[word_part_t]) -> None
2162 """
2163 A here doc is like a double quoted context, except " isn't special.
2164 """
2165 self._ReadLikeDQ(None, False, parts)
2166 # Returns nothing
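
        # Example body (sketch): double quotes are literal here, but the
        # usual substitutions still apply:
        #
        #   cat <<EOF
        #   "quoted" stays as-is, while $x and $(echo hi) expand
        #   EOF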
2167
2168 def ReadForPlugin(self):
2169 # type: () -> CompoundWord
2170 """For $PS1, $PS4, etc.
2171
2172 This is just like reading a here doc line. "\n" is allowed, as
2173 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2174 """
2175 w = CompoundWord([])
2176 self._ReadLikeDQ(None, False, w.parts)
2177 return w
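    # Example for ReadForPlugin above (sketch): a prompt like
    #
    #   PS1='${PWD} $(date +%H:%M) > '
    #
    # is parsed with the same substitutions as a here doc line.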
2178
2179 def EmitDocToken(self, b):
2180 # type: (bool) -> None
2181 self.emit_doc_token = b
2182
2183 def Multiline(self, b):
2184 # type: (bool) -> None
2185 self.multiline = b
2186
2187
2188if 0:
2189 import collections
2190 WORD_HIST = collections.Counter()
2191
2192# vim: sw=4