# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v} $() `` $(()) '' "" $'' $"" <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v} $() `` $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
  ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
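
# A hedged sketch of what "ends a word": in `echo foo;` the compound word
# `foo` stops at Op_Semi (Kind.Op); in `echo foo bar` it stops at WS_Space
# (Kind.WS); at end of input it stops at Kind.Eof.  Several methods below
# check `self.token_kind not in KINDS_THAT_END_WORDS` to reject junk like
# `a=(1 2)x`.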


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w
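
    # Hedged examples of what _ReadVarOpArg() yields in a DQ context:
    #
    #     "${s:-}"    -> rhs_word.Empty       (evaluates to '', not elided)
    #     "${s:-x}"   -> CompoundWord with one literal part for 'x'
    #
    # In the unquoted PatSub pattern position, empty_ok=False, so an empty
    # CompoundWord comes back instead and the caller reports a custom error.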

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
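
    # A hedged sketch of slice parses (begin/length are full arith exprs):
    #
    #     ${a:1}      -> Slice(begin=<1>, length=None)      # to the end
    #     ${a:1:2}    -> Slice(begin=<1>, length=<2>)
    #     ${a::}      -> Slice(EmptyZero, EmptyZero)        # bash quirk
    #     ${a:x+1:y}  -> Slice(begin=<x+1>, length=<y>)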

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match = '/' WORD        # can't be empty
              | '#' WORD?       # may be empty
              | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
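
    # Hedged examples of the resulting suffix_op.PatSub (replace_mode is the
    # Id of the char after the first '/', or Undefined_Tok):
    #
    #     ${x/pat/rep}   -> PatSub(pat, rep,   Id.Undefined_Tok)  # first match
    #     ${x//pat/rep}  -> PatSub(pat, rep,   Id.Lit_Slash)      # all matches
    #     ${x/#pat}      -> PatSub(pat, Empty, Id.Lit_Pound)      # prefix
    #     ${x/%pat}      -> PatSub(pat, Empty, Id.Lit_Percent)    # suffix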

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
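
    # Sketch of subscript results (concrete inputs are illustrative guesses):
    #
    #     ${a[@]}    -> bracket_op.WholeArray(Id.Lit_At)
    #     ${a[*]}    -> bracket_op.WholeArray(Id.Arith_Star)
    #     ${a[i+1]}  -> bracket_op.ArrayIndex(<arith node for i+1>)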

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in # ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER = [0-9]+                    # ${10}, ${11}, ...

        Subscript = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol = '!' | '@' | '#' | ...
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP  = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP = '#' | '##' | '%' | '%%'
        CASE_OP  = ',' | ',,' | '^' | '^^'
        UnaryOp  = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY = '|' | ' '              # ${x|html} and ${x %.3f}.
                                           # SPACE is operator not %
        Match = ('/' | '#' | '%') WORD     # match all / prefix / suffix
        VarExpr = VarOf
                | VarOf NULLARY_OP
                | VarOf UnaryOp WORD
                | VarOf YSH_UNARY STATIC_WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf             # can't apply operators after length

        RefOrKeys = '!' VarExpr            # CAN apply operators after a named
                                           # ref; ${!ref[0]} vs ${!keys[@]}
                                           # resolved later

        PrefixQuery = '!' NAME ('*' | '@') # list variable names with a prefix

        BuiltinSub = '.' WORD+             # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
          that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't think
            # that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
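
    # Hedged examples of how the # and ! prefixes parse (shapes only, not
    # exact field names):
    #
    #     ${#foo}   -> BracedVarSub(token=foo, prefix_op=VSub_Pound)  # length
    #     ${#}      -> BracedVarSub(token='#')          # '#' is the variable
    #     ${!ref}   -> BracedVarSub(token=ref, prefix_op=VSub_Bang)   # nameref
    #     ${!pre@}  -> same, plus a nullary VOp3 suffix (allow_query=True)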

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
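
    # A sketch of the triple-quote end condition: for r'''a'b''' the lexer
    # yields roughly Lit 'a', Right, Lit 'b', Right, Right, Right.  Each
    # non-Right token resets num_end_tokens to 0, so only three CONSECUTIVE
    # closing quotes terminate the string; the three spurious Right tokens
    # are then popped off before evaluation.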

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for reading a double-quoted part like "hello $name".

        Note: ${foo%%a b c} is also treated as double quoted, until you hit
        the closing }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)
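
    # Hedged example of the resulting node for "hello $name" (token spellings
    # abbreviated):
    #
    #     DoubleQuoted(
    #       left  = Left_DoubleQuote '"',
    #       parts = [Lit_Chars 'hello ', SimpleVarSub($name)],
    #       right = Right_DoubleQuote '"',
    #     )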

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't
            # want to interleave parsing and execution!  Unlike 'source' and
            # 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer modes gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  Ie:
        #
        # case (x) {                           case (x) {
        #   (else) { = x }                       (else) { = x }
        #          ^ The lexer is here                 ^ Unread to here
        # }                                    }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
            self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

            word { echo word }
            (3)  { echo expr }
            /e/  { echo eggex }
            }    # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell,
        # we could save the lexer/reader state here, and retry if the
        # arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)
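
    # A hedged walkthrough of $((1 + 2)): PushHint makes the lexer emit
    # Right_DollarDParen for the second ), the TDOP parser consumes `1 + 2`
    # up to the first Arith_RParen, and the result is roughly:
    #
    #     word_part.ArithSub(left='$((', anode=<1 + 2>, right='))')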

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for location
        info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
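
    # Hedged examples of how empty sections default (EmptyZero/EmptyOne are
    # placeholder arith nodes):
    #
    #     for ((i=0; i<5; ++i))  -> init=<i=0>, cond=<i<5>, update=<++i>
    #     for ((;;))             -> init=0,     cond=1,     update=0
    #                               # empty condition is TRUE: infinite loop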

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)
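
    # Hedged examples of the two result shapes:
    #
    #     a=(1 2 3)        -> ShArrayLiteral(words=[1, 2, 3])
    #     a=()             -> ShArrayLiteral(words=[])
    #     A=([k]=v [j]=w)  -> word_part.BashAssocLiteral(pairs=[(k,v), (j,w)])
    #
    # Mixing keyed and unkeyed entries is rejected with "Expected associative
    # array pair".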
1682
1683 def ParseProcCallArgs(self, start_symbol):
1684 # type: (int) -> ArgList
1685 """ json write (x) """
1686 self.lexer.MaybeUnreadOne()
1687
1688 arg_list = ArgList.CreateNull(alloc_lists=True)
1689 arg_list.left = self.cur_token
1690 self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
1691 return arg_list
1692
1693 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1694 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1695 """Helper for _ReadCompoundWord3."""
1696 done = False
1697
1698 if self.token_type == Id.Lit_EscapedChar:
1699 tok = self.cur_token
1700 assert tok.length == 2
1701 ch = lexer.TokenSliceLeft(tok, 1)
1702 if not self.parse_opts.parse_backslash():
1703 if not pyutil.IsValidCharEscape(ch):
1704 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1705 self.cur_token)
1706
1707 part = word_part.EscapedLiteral(self.cur_token,
1708 ch) # type: word_part_t
1709 else:
1710 part = self.cur_token
1711
1712 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1713 parts.append(part)
1714 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1715 # _ReadWord.
1716 next_id = self.lexer.LookPastSpace(lex_mode)
1717 if next_id == Id.Op_LParen:
1718 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1719 part2 = self._ReadArrayLiteral()
1720 parts.append(part2)
1721
1722 # Array literal must be the last part of the word.
1723 self._SetNext(lex_mode)
1724 self._GetToken()
1725 # EOF, whitespace, newline, Right_Subshell
1726 if self.token_kind not in KINDS_THAT_END_WORDS:
1727 p_die('Unexpected token after array literal',
1728 self.cur_token)
1729 done = True
1730
1731 elif (is_first and self.parse_opts.parse_at() and
1732 self.token_type == Id.Lit_Splice):
1733
1734 splice_tok = self.cur_token
1735 part2 = word_part.Splice(splice_tok,
1736 lexer.TokenSliceLeft(splice_tok, 1))
1737
1738 parts.append(part2)
1739
1740 # @words must be the last part of the word
1741 self._SetNext(lex_mode)
1742 self._GetToken()
1743 # EOF, whitespace, newline, Right_Subshell
1744 if self.token_kind not in KINDS_THAT_END_WORDS:
1745 p_die('Unexpected token after array splice', self.cur_token)
1746 done = True
1747
1748 elif (is_first and self.parse_opts.parse_at() and
1749 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1750 part2 = self._ReadExprSub(lex_mode_e.DQ)
1751 parts.append(part2)
1752
1753 # @[split(x)]
1754 self._SetNext(lex_mode)
1755 self._GetToken()
1756 # EOF, whitespace, newline, Right_Subshell
1757 if self.token_kind not in KINDS_THAT_END_WORDS:
1758 p_die('Unexpected token after Expr splice', self.cur_token)
1759 done = True
1760
1761 elif (is_first and self.parse_opts.parse_at() and
1762 self.token_type == Id.Lit_AtLBraceDot):
1763 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1764
1765 elif (is_first and self.parse_opts.parse_at_all() and
1766 self.token_type == Id.Lit_At):
1767 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1768 # at the beginning of a word to be reserved.
1769
1770 # Although: should we relax 'echo @'? I'm tempted to have a
1771 # shortcut for @_argv.
1772 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1773 self.cur_token)
1774
1775 else:
1776 # not a literal with lookahead; append it
1777 parts.append(part)
1778
1779 return done
1780
1781 def _ReadCompoundWord(self, lex_mode):
1782 # type: (lex_mode_t) -> CompoundWord
1783 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1784
1785 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1786 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1787 """
1788 Precondition: Looking at the first token of the first word part
1789 Postcondition: Looking at the token after, e.g. space or operator
1790
1791 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1792 could be an operator delimiting a compound word. Can we change lexer modes
1793 and remove this special case?
1794 """
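# e.g. when reading the 'pat' word in ${x//pat/replace}, eof_type is
# Id.Lit_Slash, so the otherwise-literal / terminates the word.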
1795 w = CompoundWord([])
1796 num_parts = 0
1797 brace_count = 0
1798 done = False
1799 is_triple_quoted = None # type: Optional[BoolParamBox]
1800
1801 while not done:
1802 self._GetToken()
1803
1804 allow_done = empty_ok or num_parts != 0
1805 if allow_done and self.token_type == eof_type:
1806 done = True # e.g. for ${foo//pat/replace}
1807
1808 # Keywords like "for" are treated like literals
1809 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1810 Kind.ControlFlow, Kind.BoolUnary,
1811 Kind.BoolBinary):
1812
1813 # Count { and } for the unbalanced-brace syntax error below
1814 if self.token_type == Id.Lit_LBrace:
1815 brace_count += 1
1816 elif self.token_type == Id.Lit_RBrace:
1817 brace_count -= 1
1818 elif self.token_type == Id.Lit_Dollar:
1819 if not self.parse_opts.parse_dollar():
1820 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1821 next_byte = self.lexer.ByteLookAhead()
1822 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1823 if next_byte == '/':
1824 #log('next_byte %r', next_byte)
1825 pass
1826
1827 p_die('Literal $ should be quoted like \$',
1828 self.cur_token)
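# e.g. with parse_dollar on, 'grep $' is an error; write \$ or '$'
# instead.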
1829
1830 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1831 w.parts)
1832
1833 elif self.token_kind == Kind.VSub:
1834 vsub_token = self.cur_token
1835
1836 part = SimpleVarSub(vsub_token) # type: word_part_t
1837 w.parts.append(part)
1838
1839 elif self.token_kind == Kind.ExtGlob:
1840 # If parse_at, we can take over @( to start @(seq 3)
1841 # Users can also use ,(*.py|*.sh)
1842 if (self.parse_opts.parse_at() and
1843 self.token_type == Id.ExtGlob_At and num_parts == 0):
1844 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1845 d_quoted=False)
1846 # RARE mutation of tok.id!
1847 cs_part.left_token.id = Id.Left_AtParen
1848 part = cs_part # for type safety
1849
1850 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1851 # a=(one two)x and @arrayfunc(3)x.
1852 self._GetToken()
1853 if self.token_kind not in KINDS_THAT_END_WORDS:
1854 p_die('Unexpected token after @()', self.cur_token)
1855 done = True
1856
1857 else:
1858 part = self._ReadExtGlob()
1859 w.parts.append(part)
1860
1861 elif self.token_kind == Kind.BashRegex:
1862 if self.token_type == Id.BashRegex_LParen: # Opening (
1863 part = self._ReadBashRegexGroup()
1864 w.parts.append(part)
1865 else:
1866 assert self.token_type == Id.BashRegex_AllowedInParens
1867 p_die('Invalid token in bash regex', self.cur_token)
1868
1869 elif self.token_kind == Kind.Left:
1870 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1871 lex_mode == lex_mode_e.ShCommand and
1872 num_parts == 0)
1873
1874 # Save an allocation: only create the box when we may need it
1875 if try_triple_quote:
1876 is_triple_quoted = BoolParamBox(False)
1877
1878 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1879 w.parts.append(part)
1880
1881 # NOT done yet, will advance below
1882 elif self.token_kind == Kind.Right:
1883 # Still part of the word; will be done on the next iter.
1884 if self.token_type == Id.Right_DoubleQuote:
1885 pass
1886 # Never happens, no PushHint for this case.
1887 #elif self.token_type == Id.Right_DollarParen:
1888 # pass
1889 elif self.token_type == Id.Right_Subshell:
1890 # LEXER HACK for (case x in x) ;; esac )
1891 # Rewind before it's used
1892 assert self.next_lex_mode == lex_mode_e.Undefined
1893 if self.lexer.MaybeUnreadOne():
1894 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1895 self._SetNext(lex_mode)
1896 done = True
1897 else:
1898 done = True
1899
1900 elif self.token_kind == Kind.Ignored:
1901 done = True
1902
1903 else:
1904 # LEXER HACK for unbalanced case clause. 'case foo in esac' is valid,
1905 # so to test for ESAC, we can read ) before getting a chance to
1906 # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1907 # token and do it again.
1908
1909 # We get Id.Op_RParen at top level: case x in x) ;; esac
1910 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1911 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1912 # Rewind before it's used
1913 assert self.next_lex_mode == lex_mode_e.Undefined
1914 if self.lexer.MaybeUnreadOne():
1915 if self.token_type == Id.Eof_RParen:
1916 # Redo translation
1917 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1918 self._SetNext(lex_mode)
1919
1920 done = True # anything we don't recognize means we're done
1921
1922 if not done:
1923 self._SetNext(lex_mode)
1924 num_parts += 1
1925
1926 if (self.parse_opts.parse_brace() and num_parts > 1 and
1927 brace_count != 0):
1928 # accept { and }, but not foo{
1929 p_die(
1930 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1931 loc.Word(w))
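# e.g. {a,b} is balanced and a lone { is a single-part word, but
# foo{ trips this check.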
1932
1933 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1934 p_die('Unexpected parts after triple quoted string',
1935 loc.WordPart(w.parts[-1]))
1936
1937 if 0:
1938 from _devbuild.gen.syntax_asdl import word_part_str
1939 word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
1940 WORD_HIST[word_key] += 1
1941 return w
1942
1943 def _ReadArithWord(self):
1944 # type: () -> Optional[word_t]
1945 """ Helper for ReadArithWord() """
1946 self._GetToken()
1947
1948 if self.token_kind == Kind.Unknown:
1949 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1950 p_die(
1951 'Unexpected token while parsing arithmetic: %r' %
1952 lexer.TokenVal(self.cur_token), self.cur_token)
1953
1954 elif self.token_kind == Kind.Eof:
1955 return self.cur_token
1956
1957 elif self.token_kind == Kind.Ignored:
1958 # Space should be ignored.
1959 self._SetNext(lex_mode_e.Arith)
1960 return None
1961
1962 elif self.token_kind in (Kind.Arith, Kind.Right):
1963 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1964 self._SetNext(lex_mode_e.Arith)
1965 return self.cur_token
1966
1967 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1968 return self._ReadCompoundWord(lex_mode_e.Arith)
1969
1970 else:
1971 raise AssertionError(self.cur_token)
1972
1973 def _ReadWord(self, word_mode):
1974 # type: (lex_mode_t) -> Optional[word_t]
1975 """Helper function for ReadWord()."""
1976
1977 # Change the pseudo lexer mode to a real lexer mode
1978 if word_mode == lex_mode_e.ShCommandFakeBrack:
1979 lex_mode = lex_mode_e.ShCommand
1980 else:
1981 lex_mode = word_mode
1982
1983 self._GetToken()
1984
1985 if self.token_kind == Kind.Eof:
1986 # No advance
1987 return self.cur_token
1988
1989 # Allow Arith for ) at end of for loop?
1990 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1991 self._SetNext(lex_mode)
1992
1993 # Newlines are complicated. See 3x2 matrix in the comment about
1994 # self.multiline and self.newline_state above.
1995 if self.token_type == Id.Op_Newline:
1996 if self.multiline:
1997 if self.newline_state > 1:
1998 # This points at a blank line, but at least it gives the line number
1999 p_die('Invalid blank line in multiline mode',
2000 self.cur_token)
2001 return None
2002
2003 if self.returned_newline: # skip
2004 return None
2005
2006 return self.cur_token
2007
2008 elif self.token_kind == Kind.Right:
2009 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
2010 Id.Right_CasePat,
2011 Id.Right_ShArrayLiteral):
2012 raise AssertionError(self.cur_token)
2013
2014 self._SetNext(lex_mode)
2015 return self.cur_token
2016
2017 elif self.token_kind in (Kind.Ignored, Kind.WS):
2018 self._SetNext(lex_mode)
2019 return None
2020
2021 else:
2022 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
2023 Kind.Left, Kind.KW, Kind.ControlFlow,
2024 Kind.BoolUnary, Kind.BoolBinary,
2025 Kind.ExtGlob,
2026 Kind.BashRegex), 'Unhandled token kind'
2027
2028 if (word_mode == lex_mode_e.ShCommandFakeBrack and
2029 self.parse_opts.parse_bracket() and
2030 self.token_type == Id.Lit_LBracket):
2031 # Change [ from Kind.Lit -> Kind.Op
2032 # So CommandParser can treat
2033 # assert [42 === x]
2034 # like
2035 # json write (x)
2036 bracket_word = self.cur_token
2037 bracket_word.id = Id.Op_LBracket
2038
2039 self._SetNext(lex_mode)
2040 return bracket_word
2041
2042 # We're beginning a word. If we see Id.Lit_Pound, change to
2043 # lex_mode_e.Comment and read until end of line.
2044 if self.token_type == Id.Lit_Pound:
2045 self._SetNext(lex_mode_e.Comment)
2046 self._GetToken()
2047
2048 # NOTE: The # could be the last character in the file. It can't be
2049 # Eof_{RParen,Backtick} because #) and #` are comments.
2050 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
2051 self.cur_token
2052
2053 # The next iteration will go into Kind.Ignored and set lex state to
2054 # lex_mode_e.ShCommand/etc.
2055 return None # tell ReadWord() to try again after comment
2056
2057 elif self.token_type == Id.Lit_TPound: ### doc comment
2058 self._SetNext(lex_mode_e.Comment)
2059 self._GetToken()
2060
2061 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
2062 return self.cur_token
2063
2064 return None # tell ReadWord() to try again after comment
2065
2066 else:
2067 # r'' u'' b''
2068 if (self.token_type == Id.Lit_Chars and
2069 self.lexer.LookAheadOne(
2070 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
2071
2072 # When shopt -s parse_ysh_string:
2073 # echo r'hi' is like echo 'hi'
2074 #
2075 # echo u'\u{3bc}' b'\yff' works
2076
2077 tok = self.cur_token
2078 if self.parse_opts.parse_ysh_string():
2079 if lexer.TokenEquals(tok, 'r'):
2080 left_id = Id.Left_RSingleQuote
2081 elif lexer.TokenEquals(tok, 'u'):
2082 left_id = Id.Left_USingleQuote
2083 elif lexer.TokenEquals(tok, 'b'):
2084 left_id = Id.Left_BSingleQuote
2085 else:
2086 left_id = Id.Undefined_Tok
2087
2088 if left_id != Id.Undefined_Tok:
2089 # skip the r, and then 'foo' will be read as normal
2090 self._SetNext(lex_mode_e.ShCommand)
2091
2092 self._GetToken()
2093 assert self.token_type == Id.Left_SingleQuote, self.token_type
2094
2095 # Read the word in a different lexer mode
2096 return self._ReadYshSingleQuoted(left_id)
2097
2098 return self._ReadCompoundWord(lex_mode)
2099
2100 def ParseVarRef(self):
2101 # type: () -> BracedVarSub
2102 """DYNAMIC parsing of what's inside ${!ref}
2103
2104 # Same as VarOf production
2105 VarRefExpr = VarOf EOF
2106 """
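# e.g. with x=foo and foo=bar, ${!x} evaluates to 'bar'; the name
# after ! is parsed dynamically, at runtime.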
2107 self._SetNext(lex_mode_e.VSub_1)
2108
2109 self._GetToken()
2110 if self.token_kind != Kind.VSub:
2111 p_die('Expected var name', self.cur_token)
2112
2113 part = self._ParseVarOf()
2114 # NOTE: no ${ } means no part.left and part.right
2115 part.left = part.token # cheat to make test pass
2116 part.right = part.token
2117
2118 self._GetToken()
2119 if self.token_type != Id.Eof_Real:
2120 p_die('Expected end of var ref expression', self.cur_token)
2121 return part
2122
2123 def LookPastSpace(self):
2124 # type: () -> Id_t
2125 """Look ahead to the next token.
2126
2127 For the CommandParser to recognize
2128 array= (1 2 3)
2129 YSH for ( versus bash for ((
2130 YSH if ( versus if test
2131 YSH while ( versus while test
2132 YSH bare assignment 'grep =' versus 'grep foo'
2133 """
2134 assert self.token_type != Id.Undefined_Tok
2135 if self.cur_token.id == Id.WS_Space:
2136 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2137 else:
2138 id_ = self.cur_token.id
2139 return id_
2140
2141 def LookAheadFuncParens(self):
2142 # type: () -> bool
2143 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2144 assert self.token_type != Id.Undefined_Tok
2145
2146 # We have to handle 2 cases because we buffer a token
2147 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2148 return self.lexer.LookAheadFuncParens(1) # go back one char
2149
2150 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2151 return self.lexer.LookAheadFuncParens(0)
2152
2153 else:
2154 return False
2155
2156 def ReadWord(self, word_mode):
2157 # type: (lex_mode_t) -> word_t
2158 """Read the next word, using the given lexer mode.
2159
2160 This is a stateful wrapper for the stateless _ReadWord function.
2161 """
2162 assert word_mode in (lex_mode_e.ShCommand,
2163 lex_mode_e.ShCommandFakeBrack,
2164 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2165
2166 if self.buffered_word: # For integration with pgen2
2167 w = self.buffered_word
2168 self.buffered_word = None
2169 else:
2170 while True:
2171 w = self._ReadWord(word_mode)
2172 if w is not None:
2173 break
2174
2175 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2176 return w
2177
2178 def ReadArithWord(self):
2179 # type: () -> word_t
2180 while True:
2181 w = self._ReadArithWord()
2182 if w is not None:
2183 break
2184 return w
2185
2186 def ReadHereDocBody(self, parts):
2187 # type: (List[word_part_t]) -> None
2188 """
2189 A here doc is like a double quoted context, except " isn't special.
2190 """
2191 self._ReadLikeDQ(None, False, parts)
2192 # Returns nothing
2193
2194 def ReadForPlugin(self):
2195 # type: () -> CompoundWord
2196 """For $PS1, $PS4, etc.
2197
2198 This is just like reading a here doc line. "\n" is allowed, as
2199 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2200 """
2201 w = CompoundWord([])
2202 self._ReadLikeDQ(None, False, w.parts)
2203 return w
2204
2205 def EmitDocToken(self, b):
2206 # type: (bool) -> None
2207 self.emit_doc_token = b
2208
2209 def Multiline(self, b):
2210 # type: (bool) -> None
2211 self.multiline = b
2212
2213
2214if 0:
2215 import collections
2216 WORD_HIST = collections.Counter()
2217
2218# vim: sw=4