# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes:
    $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast to VSub_ArgDQ, VSub_ArgUnquoted respects single quotes and
  process substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str
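
# A word is terminated by a token of one of these kinds: end of input,
# whitespace, an operator, or a "right" token like ) or }.  Used below to
# check that constructs like a=(1 2) or @words are the LAST part of a word.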
KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken, for
        # ### doc comments.
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode
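
    # Note on the protocol above: callers do _SetNext(mode) to choose the
    # lexer mode, then _GetToken() to read.  Because _GetToken() resets
    # next_lex_mode to Undefined, calling it again without an intervening
    # _SetNext() is a no-op, so it can be called "defensively" before
    # inspecting self.token_type or self.cur_token.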

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()

            if self.token_type != Id.Arith_RBrace:
                length = self._ReadArithExpr(Id.Arith_RBrace)
            else:
                # quirky bash behavior:
                #   ${a:1:} or ${a::} means length ZERO
                #   but ${a:1} or ${a:} means length N
                length = arith_expr.EmptyZero

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        #   echo ${x/#/replace} has an empty pattern
        #   echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
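
    # For example, ${a[i+1]} yields bracket_op.ArrayIndex with a parsed
    # arithmetic expression, while ${a[@]} and ${a[*]} yield
    # bracket_op.WholeArray.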

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now.
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # The token is imprecise; it doesn't point to X in
                    # ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh,
                                    Id.Right_DollarBrace, True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                   # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER     # no subscript allowed, none of these are
                                 # arrays; ${@[1]} doesn't work, even though
                                 # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '                # ${x|html} and ${x %.3f};
                                               # SPACE is an operator, not %
        Match       = ('/' | '#' | '%') WORD   # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf   # can't apply operators after length

        RefOrKeys   = '!' VarExpr # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub  = '.' WORD+   # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a}    -- this is a ref
                # ${!3}    -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
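
    # Example: for ${x:-default} inside "...", _ReadBracedVarSub reads the
    # name 'x' via _ParseVarOf, then _ParseVarExpr sees the Kind.VTest
    # operator ':-' and reads the argument word in lex_mode_e.VSub_ArgDQ
    # (because d_quoted was True).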

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns the last token.

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if (self.token_type == Id.Char_Hex and
                            self.cur_token.length != 4):
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        # Pop the closing quote token(s), so out_tokens contains only the
        # string body.
        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and
        # command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH-style strings:

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote,
                               Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
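
    # For example, @(foo|bar|) parses into three arms; the trailing empty
    # alternative becomes CompoundWord([]), which is why read_word is
    # tracked explicitly.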

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                               Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars,
            # to allow bash-style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group',
              self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or
            None if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          left_token: the opening " token

        Also handles ${foo%%a b c}  # treated as double quoted until you
        hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.
                    # No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
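
    # Note the strategy in the plain-backtick branch above: the text between
    # backticks is collected into code_str and then re-parsed with a fresh
    # StringLineReader, rather than being parsed in place like $(...).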

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may
                # be possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer*
        # to be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                       case (x) {
        #   (else) { = x }                   (else) { = x }
        #          ^ The lexer is here      ^ Unread to here
        # }                                }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for an optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

            word   { echo word }
            (3)    { echo expr }
            /e/    { echo eggex }
            }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this
        # at parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until a non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           ((  ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose.
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer,
                                                 self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed
        # to be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
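
    # Note: whether the literal is an associative array is decided by the
    # FIRST word alone; after that, every word must parse as a [k]=v pair,
    # or it's a parse error.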
1676
1677 def ParseProcCallArgs(self, start_symbol):
1678 # type: (int) -> ArgList
1679 """ json write (x) """
1680 self.lexer.MaybeUnreadOne()
1681
1682 arg_list = ArgList.CreateNull(alloc_lists=True)
1683 arg_list.left = self.cur_token
1684 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1685 return arg_list
1686
1687 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1688 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1689 """Helper for _ReadCompoundWord3."""
1690 done = False
1691
1692 if self.token_type == Id.Lit_EscapedChar:
1693 tok = self.cur_token
1694 assert tok.length == 2
1695 ch = lexer.TokenSliceLeft(tok, 1)
1696 if not self.parse_opts.parse_backslash():
1697 if not pyutil.IsValidCharEscape(ch):
1698 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1699 self.cur_token)
1700
1701 part = word_part.EscapedLiteral(self.cur_token,
1702 ch) # type: word_part_t
1703 else:
1704 part = self.cur_token
1705
1706 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1707 parts.append(part)
1708 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1709 # _ReadWord.
1710 next_id = self.lexer.LookPastSpace(lex_mode)
1711 if next_id == Id.Op_LParen:
1712 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1713 part2 = self._ReadArrayLiteral()
1714 parts.append(part2)
1715
1716 # Array literal must be the last part of the word.
1717 self._SetNext(lex_mode)
1718 self._GetToken()
1719 # EOF, whitespace, newline, Right_Subshell
1720 if self.token_kind not in KINDS_THAT_END_WORDS:
1721 p_die('Unexpected token after array literal',
1722 self.cur_token)
1723 done = True
1724
1725 elif (is_first and self.parse_opts.parse_at() and
1726 self.token_type == Id.Lit_Splice):
1727
1728 splice_tok = self.cur_token
1729 part2 = word_part.Splice(splice_tok,
1730 lexer.TokenSliceLeft(splice_tok, 1))
1731
1732 parts.append(part2)
1733
1734 # @words must be the last part of the word
1735 self._SetNext(lex_mode)
1736 self._GetToken()
1737 # EOF, whitespace, newline, Right_Subshell
1738 if self.token_kind not in KINDS_THAT_END_WORDS:
1739 p_die('Unexpected token after array splice', self.cur_token)
1740 done = True
1741
1742 elif (is_first and self.parse_opts.parse_at() and
1743 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1744 part2 = self._ReadExprSub(lex_mode_e.DQ)
1745 parts.append(part2)
1746
1747 # @[split(x)]
1748 self._SetNext(lex_mode)
1749 self._GetToken()
1750 # EOF, whitespace, newline, Right_Subshell
1751 if self.token_kind not in KINDS_THAT_END_WORDS:
1752 p_die('Unexpected token after Expr splice', self.cur_token)
1753 done = True
1754
1755 elif (is_first and self.parse_opts.parse_at() and
1756 self.token_type == Id.Lit_AtLBraceDot):
1757 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1758
1759 elif (is_first and self.parse_opts.parse_at_all() and
1760 self.token_type == Id.Lit_At):
1761 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1762 # at the beginning of a word to be reserved.
1763
1764 # Although should we relax 'echo @'? I'm tempted to have a shortcut
1765 # for @_argv.
1766 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1767 self.cur_token)
1768
1769 else:
1770 # not a literal with lookahead; append it
1771 parts.append(part)
1772
1773 return done
1774
1775 def _ReadCompoundWord(self, lex_mode):
1776 # type: (lex_mode_t) -> CompoundWord
1777 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1778
1779 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1780 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1781 """
1782 Precondition: Looking at the first token of the first word part
1783 Postcondition: Looking at the token after, e.g. space or operator
1784
1785 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1786 could be an operator delimiting a compound word. Can we change lexer modes
1787 and remove this special case?
1788 """
1789 w = CompoundWord([])
1790 num_parts = 0
1791 brace_count = 0
1792 done = False
1793 is_triple_quoted = None # type: Optional[BoolParamBox]
1794
1795 while not done:
1796 self._GetToken()
1797
1798 allow_done = empty_ok or num_parts != 0
1799 if allow_done and self.token_type == eof_type:
1800 done = True # e.g. for ${foo//pat/replace}
1801
1802 # Keywords like "for" are treated like literals
1803 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1804 Kind.ControlFlow, Kind.BoolUnary,
1805 Kind.BoolBinary):
1806
1807 # Count { and } so we can reject unbalanced words like foo{ below
1808 if self.token_type == Id.Lit_LBrace:
1809 brace_count += 1
1810 elif self.token_type == Id.Lit_RBrace:
1811 brace_count -= 1
1812 elif self.token_type == Id.Lit_Dollar:
1813 if not self.parse_opts.parse_dollar():
1814 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1815 next_byte = self.lexer.ByteLookAhead()
1816 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1817 if next_byte == '/':
1818 #log('next_byte %r', next_byte)
1819 pass
1820
1821 p_die('Literal $ should be quoted like \$',
1822 self.cur_token)
1823
1824 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1825 w.parts)
1826
1827 elif self.token_kind == Kind.VSub:
1828 vsub_token = self.cur_token
1829
1830 part = SimpleVarSub(vsub_token) # type: word_part_t
1831 w.parts.append(part)
1832
1833 elif self.token_kind == Kind.ExtGlob:
1834 # If parse_at, we can take over @( to start @(seq 3)
1835 # Users can also write ,(*.py|*.sh)
1836 if (self.parse_opts.parse_at() and
1837 self.token_type == Id.ExtGlob_At and num_parts == 0):
1838 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1839 d_quoted=False)
1840 # RARE mutation of tok.id!
1841 cs_part.left_token.id = Id.Left_AtParen
1842 part = cs_part # for type safety
1843
1844 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1845 # a=(one two)x and @arrayfunc(3)x.
1846 self._GetToken()
1847 if self.token_kind not in KINDS_THAT_END_WORDS:
1848 p_die('Unexpected token after @()', self.cur_token)
1849 done = True
1850
1851 else:
1852 part = self._ReadExtGlob()
1853 w.parts.append(part)
1854
1855 elif self.token_kind == Kind.BashRegex:
1856 if self.token_type == Id.BashRegex_LParen: # Opening (
1857 part = self._ReadBashRegexGroup()
1858 w.parts.append(part)
1859 else:
1860 assert self.token_type == Id.BashRegex_AllowedInParens
1861 p_die('Invalid token in bash regex', self.cur_token)
1862
1863 elif self.token_kind == Kind.Left:
1864 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1865 lex_mode == lex_mode_e.ShCommand and
1866 num_parts == 0)
1867
1868 # Save allocation
1869 if try_triple_quote:
1870 is_triple_quoted = BoolParamBox(False)
1871
1872 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1873 w.parts.append(part)
1874
1875 # NOT done yet, will advance below
1876 elif self.token_kind == Kind.Right:
1877 # Still part of the word; will be done on the next iter.
1878 if self.token_type == Id.Right_DoubleQuote:
1879 pass
1880 # Never happens, no PushHint for this case.
1881 #elif self.token_type == Id.Right_DollarParen:
1882 # pass
1883 elif self.token_type == Id.Right_Subshell:
1884 # LEXER HACK for (case x in x) ;; esac )
1885 # Rewind before it's used
1886 assert self.next_lex_mode == lex_mode_e.Undefined
1887 if self.lexer.MaybeUnreadOne():
1888 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1889 self._SetNext(lex_mode)
1890 done = True
1891 else:
1892 done = True
1893
1894 elif self.token_kind == Kind.Ignored:
1895 done = True
1896
1897 else:
1898 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1899 # so while testing for ESAC, we may read ) before getting a chance to
1900 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1901 # token and do it again.
1902
1903 # We get Id.Op_RParen at top level: case x in x) ;; esac
1904 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1905 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1906 # Rewind before it's used
1907 assert self.next_lex_mode == lex_mode_e.Undefined
1908 if self.lexer.MaybeUnreadOne():
1909 if self.token_type == Id.Eof_RParen:
1910 # Redo translation
1911 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1912 self._SetNext(lex_mode)
1913
1914 done = True # anything we don't recognize means we're done
1915
1916 if not done:
1917 self._SetNext(lex_mode)
1918 num_parts += 1
1919
1920 if (self.parse_opts.parse_brace() and num_parts > 1 and
1921 brace_count != 0):
1922 # accept { and }, but not foo{
1923 p_die(
1924 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1925 loc.Word(w))
1926
1927 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1928 p_die('Unexpected parts after triple quoted string',
1929 loc.WordPart(w.parts[-1]))
1930
1931 if 0:
1932 from _devbuild.gen.syntax_asdl import word_part_str
1933 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1934 WORD_HIST[word_key] += 1
1935 return w
1936
1937 def _ReadArithWord(self):
1938 # type: () -> Optional[word_t]
1939 """ Helper for ReadArithWord() """
1940 self._GetToken()
1941
1942 if self.token_kind == Kind.Unknown:
1943 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1944 p_die(
1945 'Unexpected token while parsing arithmetic: %r' %
1946 lexer.TokenVal(self.cur_token), self.cur_token)
1947
1948 elif self.token_kind == Kind.Eof:
1949 return self.cur_token
1950
1951 elif self.token_kind == Kind.Ignored:
1952 # Space should be ignored.
1953 self._SetNext(lex_mode_e.Arith)
1954 return None
1955
1956 elif self.token_kind in (Kind.Arith, Kind.Right):
1957 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1958 self._SetNext(lex_mode_e.Arith)
1959 return self.cur_token
1960
1961 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1962 return self._ReadCompoundWord(lex_mode_e.Arith)
1963
1964 else:
1965 raise AssertionError(self.cur_token)
1966
1967 def _ReadWord(self, word_mode):
1968 # type: (lex_mode_t) -> Optional[word_t]
1969 """Helper function for ReadWord()."""
1970
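# ShCommandFakeBrack is a pseudo-mode: it lexes like ShCommand, but also
# lets a leading [ become Id.Op_LBracket below (YSH 'assert [42 === x]').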
1971 # Change the pseudo lexer mode to a real lexer mode
1972 if word_mode == lex_mode_e.ShCommandFakeBrack:
1973 lex_mode = lex_mode_e.ShCommand
1974 else:
1975 lex_mode = word_mode
1976
1977 self._GetToken()
1978
1979 if self.token_kind == Kind.Eof:
1980 # No advance
1981 return self.cur_token
1982
1983 # Allow Arith for ) at end of for loop?
1984 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1985 self._SetNext(lex_mode)
1986
1987 # Newlines are complicated. See 3x2 matrix in the comment about
1988 # self.multiline and self.newline_state above.
1989 if self.token_type == Id.Op_Newline:
1990 if self.multiline:
1991 if self.newline_state > 1:
1992 # This points at a blank line, but at least it gives the line number
1993 p_die('Invalid blank line in multiline mode',
1994 self.cur_token)
1995 return None
1996
1997 if self.returned_newline: # skip
1998 return None
1999
2000 return self.cur_token
2001
2002 elif self.token_kind == Kind.Right:
2003 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2004 Id.Right_CasePat,
2005 Id.Right_ShArrayLiteral):
2006 raise AssertionError(self.cur_token)
2007
2008 self._SetNext(lex_mode)
2009 return self.cur_token
2010
2011 elif self.token_kind in (Kind.Ignored, Kind.WS):
2012 self._SetNext(lex_mode)
2013 return None
2014
2015 else:
2016 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2017 Kind.Left, Kind.KW, Kind.ControlFlow,
2018 Kind.BoolUnary, Kind.BoolBinary,
2019 Kind.ExtGlob,
2020 Kind.BashRegex), 'Unhandled token kind'
2021
2022 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2023 self.parse_opts.parse_bracket() and
2024 self.token_type == Id.Lit_LBracket):
2025 # Change [ from Kind.Lit -> Kind.Op
2026 # So CommandParser can treat
2027 # assert [42 === x]
2028 # like
2029 # json write (x)
2030 bracket_word = self.cur_token
2031 bracket_word.id = Id.Op_LBracket
2032
2033 self._SetNext(lex_mode)
2034 return bracket_word
2035
2036 # We're beginning a word. If we see Id.Lit_Pound, change to
2037 # lex_mode_e.Comment and read until end of line.
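# e.g. in 'echo hi  # bye', '# bye' is consumed as one Ignored_Comment
# token, and we return None so that ReadWord() tries again.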
2038 if self.token_type == Id.Lit_Pound:
2039 self._SetNext(lex_mode_e.Comment)
2040 self._GetToken()
2041
2042 # NOTE: The # could be the last character in the file. It can't be
2043 # Eof_{RParen,Backtick} because #) and #` are comments.
2044 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2045 self.cur_token
2046
2047 # The next iteration will go into Kind.Ignored and set lex state to
2048 # lex_mode_e.ShCommand/etc.
2049 return None # tell ReadWord() to try again after comment
2050
2051 elif self.token_type == Id.Lit_TPound: ### doc comment
2052 self._SetNext(lex_mode_e.Comment)
2053 self._GetToken()
2054
2055 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2056 return self.cur_token
2057
2058 return None # tell ReadWord() to try again after comment
2059
2060 else:
2061 # r'' u'' b''
2062 if (self.token_type == Id.Lit_Chars and
2063 self.lexer.LookAheadOne(
2064 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2065
2066 # When shopt -s parse_ysh_string:
2067 # echo r'hi' is like echo 'hi'
2068 #
2069 # echo u'\u{3bc}' b'\yff' works
2070
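# Note: LookAheadOne() requires the quote to be adjacent, so r'hi' is a
# raw string here, while r 'hi' (with a space) stays two separate words.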
2071 tok = self.cur_token
2072 if self.parse_opts.parse_ysh_string():
2073 if lexer.TokenEquals(tok, 'r'):
2074 left_id = Id.Left_RSingleQuote
2075 elif lexer.TokenEquals(tok, 'u'):
2076 left_id = Id.Left_USingleQuote
2077 elif lexer.TokenEquals(tok, 'b'):
2078 left_id = Id.Left_BSingleQuote
2079 else:
2080 left_id = Id.Undefined_Tok
2081
2082 if left_id != Id.Undefined_Tok:
2083 # skip the r, and then 'foo' will be read as normal
2084 self._SetNext(lex_mode_e.ShCommand)
2085
2086 self._GetToken()
2087 assert self.token_type == Id.Left_SingleQuote, self.token_type
2088
2089 # Read the word in a different lexer mode
2090 return self._ReadYshSingleQuoted(left_id)
2091
2092 return self._ReadCompoundWord(lex_mode)
2093
2094 def ParseVarRef(self):
2095 # type: () -> BracedVarSub
2096 """DYNAMIC parsing of what's inside ${!ref}
2097
2098 # Same as VarOf production
2099 VarRefExpr = VarOf EOF
2100 """
2101 self._SetNext(lex_mode_e.VSub_1)
2102
2103 self._GetToken()
2104 if self.token_kind != Kind.VSub:
2105 p_die('Expected var name', self.cur_token)
2106
2107 part = self._ParseVarOf()
2108 # NOTE: no ${ } means no part.left and part.right
2109 part.left = part.token # cheat to make test pass
2110 part.right = part.token
2111
2112 self._GetToken()
2113 if self.token_type != Id.Eof_Real:
2114 p_die('Expected end of var ref expression', self.cur_token)
2115 return part
2116
2117 def LookPastSpace(self):
2118 # type: () -> Id_t
2119 """Look ahead to the next token.
2120
2121 For the CommandParser to recognize
2122 array= (1 2 3)
2123 YSH for ( versus bash for ((
2124 YSH if ( versus if test
2125 YSH while ( versus while test
2126 YSH bare assignment 'grep =' versus 'grep foo'
2127 """
2128 assert self.token_type != Id.Undefined_Tok
2129 if self.cur_token.id == Id.WS_Space:
2130 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2131 else:
2132 id_ = self.cur_token.id
2133 return id_
2134
2135 def LookAheadFuncParens(self):
2136 # type: () -> bool
2137 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2138 assert self.token_type != Id.Undefined_Tok
2139
2140 # We have to handle 2 cases because we buffer a token
2141 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2142 return self.lexer.LookAheadFuncParens(1) # go back one char
2143
2144 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2145 return self.lexer.LookAheadFuncParens(0)
2146
2147 else:
2148 return False
2149
2150 def ReadWord(self, word_mode):
2151 # type: (lex_mode_t) -> word_t
2152 """Read the next word, using the given lexer mode.
2153
2154 This is a stateful wrapper for the stateless _ReadWord function.
2155 """
2156 assert word_mode in (lex_mode_e.ShCommand,
2157 lex_mode_e.ShCommandFakeBrack,
2158 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2159
2160 if self.buffered_word: # For integration with pgen2
2161 w = self.buffered_word
2162 self.buffered_word = None
2163 else:
2164 while True:
2165 w = self._ReadWord(word_mode)
2166 if w is not None:
2167 break
2168
2169 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2170 return w
2171
2172 def ReadArithWord(self):
2173 # type: () -> word_t
2174 while True:
2175 w = self._ReadArithWord()
2176 if w is not None:
2177 break
2178 return w
2179
2180 def ReadHereDocBody(self, parts):
2181 # type: (List[word_part_t]) -> None
2182 """
2183 A here doc is like a double quoted context, except " isn't special.
2184 """
2185 self._ReadLikeDQ(None, False, parts)
2186 # Returns nothing
2187
2188 def ReadForPlugin(self):
2189 # type: () -> CompoundWord
2190 """For $PS1, $PS4, etc.
2191
2192 This is just like reading a here doc line. "\n" is allowed, as
2193 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2194 """
2195 w = CompoundWord([])
2196 self._ReadLikeDQ(None, False, w.parts)
2197 return w
2198
2199 def EmitDocToken(self, b):
2200 # type: (bool) -> None
2201 self.emit_doc_token = b
2202
2203 def Multiline(self, b):
2204 # type: (bool) -> None
2205 self.multiline = b
2206
2207
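# Debug histogram of word "shapes"; enable together with the matching
# 'if 0:' block at the end of _ReadCompoundWord3.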
2208if 0:
2209 import collections
2210 WORD_HIST = collections.Counter()
2211
2212# vim: sw=4